From 9135acc06eedfdb0f31ebf0aa4e8f01c36656352 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 24 Apr 2024 12:36:07 +0800 Subject: [PATCH 001/484] LoongArch: Fix a build error due to __tlb_remove_tlb_entry() commit 7ab22b5c2af54e233f3d05d7d601025947e4ff05 upstream Conflicts: none Backport-reason: mm fixes With LLVM=1 and W=1 we get: ./include/asm-generic/tlb.h:629:10: error: parameter 'ptep' set but not used [-Werror,-Wunused-but-set-parameter] We fixed a similar issue via Arnd in the introducing commit, missed the LoongArch variant. Turns out, there is no need for LoongArch to have a custom variant, so let's just drop it and rely on the asm-generic one. Fixes: 4d5bf0b6183f ("mm/mmu_gather: add tlb_remove_tlb_entries()") Closes: https://lkml.kernel.org/r/CANiq72mQh3O9S4umbvrKBgMMorty48UMwS01U22FR0mRyd3cyQ@mail.gmail.com Reported-by: Miguel Ojeda Reviewed-by: Miguel Ojeda Tested-by: Miguel Ojeda Tested-by: Arnd Bergmann Signed-off-by: David Hildenbrand Signed-off-by: Huacai Chen Signed-off-by: Kairui Song --- arch/loongarch/include/asm/tlb.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h index da7a3b5b9374..e071f5e9e858 100644 --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -132,8 +132,6 @@ static __always_inline void invtlb_all(u32 op, u32 info, u64 addr) ); } -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - static void tlb_flush(struct mmu_gather *tlb); #define tlb_flush tlb_flush -- Gitee From 0225c83711446065366318320483368b13d24825 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 6 Sep 2024 13:01:28 +0200 Subject: [PATCH 002/484] scripts: subarch.include: fix SUBARCH on macOS hosts Upstream: commit 8a62d44588451095fd6645d6ed7ff8761edf62aa Conflicts: none Backport-reason: MacOS build support. When building the Linux kernel on an aarch64 macOS based host, if we don't specify a value for ARCH when invoking make, we default to arm and thus multi_v7_defconfig rather than the expected arm64 and arm64's defconfig. This is because subarch.include invokes `uname -m` which on MacOS hosts evaluates to `arm64` but on Linux hosts evaluates to `aarch64`, This allows us to build ARCH=arm64 natively on macOS (as in ARCH need not be specified on an aarch64-based system). Avoid matching arm64 by excluding it from the arm.* sed expression. Signed-off-by: Nick Desaulniers Suggested-by: Nicolas Schier Signed-off-by: Daniel Gomez Signed-off-by: Masahiro Yamada --- scripts/subarch.include | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/subarch.include b/scripts/subarch.include index 4bd327d0ae42..c4592d59d69b 100644 --- a/scripts/subarch.include +++ b/scripts/subarch.include @@ -6,7 +6,7 @@ SUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \ -e s/sun4u/sparc64/ \ - -e s/arm.*/arm/ -e s/sa110/arm/ \ + -e /^arm64$$/!s/arm.*/arm/ -e s/sa110/arm/ \ -e s/s390x/s390/ \ -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \ -- Gitee From a545d30f98c9a121bbc059b7a320e3c0c39a3826 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 7 Sep 2024 02:29:13 +0900 Subject: [PATCH 003/484] selinux: do not include headers from host programs Upstream: commit 541b57e313683b3d4c365fe3109fb34828b165cd Conflicts: none Backport-reason: MacOS build support The header, security/selinux/include/classmap.h, is included not only from kernel space but also from host programs. 
It includes <linux/capability.h> and <linux/socket.h>, which pull in more <linux/*.h> headers. This makes the host programs less portable, specifically causing build errors on macOS. Those headers are included for the following purposes: - <linux/capability.h> for checking CAP_LAST_CAP - <linux/socket.h> for checking PF_MAX These checks can be guarded by __KERNEL__ so they are skipped when building host programs. Testing them when building the kernel should be sufficient. The header, security/selinux/include/initial_sid_to_string.h, includes <linux/stddef.h> for the NULL definition, but this is not portable either. Instead, <stddef.h> should be included for host programs. Reported-by: Daniel Gomez Closes: https://lore.kernel.org/lkml/20240807-macos-build-support-v1-6-4cd1ded85694@samsung.com/ Closes: https://lore.kernel.org/lkml/20240807-macos-build-support-v1-7-4cd1ded85694@samsung.com/ Signed-off-by: Masahiro Yamada Signed-off-by: Paul Moore --- scripts/selinux/genheaders/Makefile | 4 +--- scripts/selinux/genheaders/genheaders.c | 3 --- scripts/selinux/mdp/Makefile | 2 +- scripts/selinux/mdp/mdp.c | 4 ---- security/selinux/include/classmap.h | 11 ++++++++--- security/selinux/include/initial_sid_to_string.h | 4 ++++ 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/selinux/genheaders/Makefile b/scripts/selinux/genheaders/Makefile index 1faf7f07e8db..866f60e78882 100644 --- a/scripts/selinux/genheaders/Makefile +++ b/scripts/selinux/genheaders/Makefile @@ -1,5 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 hostprogs-always-y += genheaders -HOST_EXTRACFLAGS += \ - -I$(srctree)/include/uapi -I$(srctree)/include \ - -I$(srctree)/security/selinux/include +HOST_EXTRACFLAGS += -I$(srctree)/security/selinux/include diff --git a/scripts/selinux/genheaders/genheaders.c b/scripts/selinux/genheaders/genheaders.c index 15520806889e..3834d7eb0af6 100644 --- a/scripts/selinux/genheaders/genheaders.c +++ b/scripts/selinux/genheaders/genheaders.c @@ -1,8 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* NOTE: we really do want to use the kernel headers here */ -#define __EXPORTED_HEADERS__ - #include #include #include diff --git a/scripts/selinux/mdp/Makefile b/scripts/selinux/mdp/Makefile index d61058ddd15c..673782e3212f 100644 --- a/scripts/selinux/mdp/Makefile +++ b/scripts/selinux/mdp/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 hostprogs-always-y += mdp HOST_EXTRACFLAGS += \ - -I$(srctree)/include/uapi -I$(srctree)/include \ + -I$(srctree)/include \ -I$(srctree)/security/selinux/include -I$(objtree)/include clean-files := policy.* file_contexts diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c index 1415604c3d24..52365921c043 100644 --- a/scripts/selinux/mdp/mdp.c +++ b/scripts/selinux/mdp/mdp.c @@ -11,10 +11,6 @@ * Authors: Serge E. 
Hallyn */ - -/* NOTE: we really do want to use the kernel headers here */ -#define __EXPORTED_HEADERS__ - #include #include #include diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index a3c380775d41..8cf0370da810 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -1,7 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include -#include - #define COMMON_FILE_SOCK_PERMS "ioctl", "read", "write", "create", \ "getattr", "setattr", "lock", "relabelfrom", "relabelto", "append", "map" @@ -30,9 +27,13 @@ "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \ "checkpoint_restore" +#ifdef __KERNEL__ /* avoid this check when building host programs */ +#include + #if CAP_LAST_CAP > CAP_CHECKPOINT_RESTORE #error New capability defined, please update COMMON_CAP2_PERMS. #endif +#endif /* * Note: The name for any socket class should be suffixed by "socket", @@ -259,6 +260,10 @@ const struct security_class_mapping secclass_map[] = { { NULL } }; +#ifdef __KERNEL__ /* avoid this check when building host programs */ +#include + #if PF_MAX > 46 #error New address family defined, please update secclass_map. #endif +#endif diff --git a/security/selinux/include/initial_sid_to_string.h b/security/selinux/include/initial_sid_to_string.h index ecc6e74fa09b..599e3d2e7243 100644 --- a/security/selinux/include/initial_sid_to_string.h +++ b/security/selinux/include/initial_sid_to_string.h @@ -1,6 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifdef __KERNEL__ #include +#else +#include +#endif static const char *const initial_sid_to_string[] = { NULL, -- Gitee From be388548ccd1670f2a2edbf2dac465e4637a3530 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 26 Aug 2025 16:06:05 +0800 Subject: [PATCH 004/484] arm64: wrap all helpers for CONFIG_ALTRA_ERRATUM_82288 Upstream: doesn't apply These code are only for CONFIG_ALTRA_ERRATUM_82288, and they are breaking the build on some targets. Ideally we really should move them elsewhere but for now just wrap them with a config. Signed-off-by: Kairui Song --- arch/arm64/mm/fault.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d68ea71c2ca9..f69a701ecabf 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -715,6 +715,7 @@ static int __kprobes do_translation_fault(unsigned long far, return 0; } +#ifdef CONFIG_ALTRA_ERRATUM_82288 static int copy_from_user_io(void *to, const void __user *from, unsigned long n) { const u8 __user *src = from; @@ -1665,6 +1666,7 @@ static int fixup_alignment(unsigned long addr, unsigned int esr, } return res; } +#endif static int do_alignment_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) -- Gitee From 44d28b788e9fed40badc35739757c30f0ecf0221 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 9 Sep 2025 20:36:22 +0800 Subject: [PATCH 005/484] Revert "mm/thp: fix deferred split unqueue naming and locking" We'll fix it later with a full backport. This reverts commit fc4951c3e3358dd82ea508e893695b916c813f17. 
--- mm/huge_memory.c | 35 +++++++++-------------------------- mm/internal.h | 10 +++++----- mm/memcontrol.c | 32 +++----------------------------- mm/page_alloc.c | 2 +- 4 files changed, 18 insertions(+), 61 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 693a88a6b4ba..4f9bcf6da988 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3101,38 +3101,18 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) return ret; } -/* - * __folio_unqueue_deferred_split() is not to be called directly: - * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h - * limits its calls to those folios which may have a _deferred_list for - * queueing THP splits, and that list is (racily observed to be) non-empty. - * - * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is - * zero: because even when split_queue_lock is held, a non-empty _deferred_list - * might be in use on deferred_split_scan()'s unlocked on-stack list. - * - * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is - * therefore important to unqueue deferred split before changing folio memcg. - */ -bool __folio_unqueue_deferred_split(struct folio *folio) +void __folio_undo_large_rmappable(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; - bool unqueued = false; - - WARN_ON_ONCE(folio_ref_count(folio)); - WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; list_del_init(&folio->_deferred_list); - unqueued = true; } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - - return unqueued; /* useful for debug warnings */ } void deferred_split_folio(struct folio *folio) @@ -3151,11 +3131,14 @@ void deferred_split_folio(struct folio *folio) return; /* - * Exclude swapcache: originally to avoid a corrupt deferred split - * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); - * but if page reclaim is already handling the same folio, it is - * unnecessary to handle it again in the shrinker, so excluding - * swapcache here may still be a useful optimization. + * The try_to_unmap() in page reclaim path might reach here too, + * this may cause a race condition to corrupt deferred split queue. + * And, if page reclaim is already handling the same folio, it is + * unnecessary to handle it again in shrinker. + * + * Check the swapcache flag to determine if the folio is being + * handled by page reclaim since THP swap would add the folio into + * swap cache before calling try_to_unmap(). */ if (folio_test_swapcache(folio)) return; diff --git a/mm/internal.h b/mm/internal.h index a3c33910c8d2..a4b42d6791b2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -625,11 +625,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) #endif } -bool __folio_unqueue_deferred_split(struct folio *folio); -static inline bool folio_unqueue_deferred_split(struct folio *folio) +void __folio_undo_large_rmappable(struct folio *folio); +static inline void folio_undo_large_rmappable(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) - return false; + return; /* * At this point, there is no one trying to add the folio to @@ -637,9 +637,9 @@ static inline bool folio_unqueue_deferred_split(struct folio *folio) * to check without acquiring the split_queue_lock. 
*/ if (data_race(list_empty(&folio->_deferred_list))) - return false; + return; - return __folio_unqueue_deferred_split(folio); + __folio_undo_large_rmappable(folio); } static inline struct folio *page_rmappable_folio(struct page *page) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e15bd1fe17b8..f0183e317377 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7875,8 +7875,6 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - /* Warning should never happen, so don't worry about refcount non-0 */ - WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = (unsigned long)to; __folio_memcg_unlock(from); @@ -8248,10 +8246,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, enum mc_target_type target_type; union mc_target target; struct page *page; - struct folio *folio; - bool tried_split_before = false; -retry_pmd: ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { @@ -8261,28 +8256,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { page = target.page; - folio = page_folio(page); - /* - * Deferred split queue locking depends on memcg, - * and unqueue is unsafe unless folio refcount is 0: - * split or skip if on the queue? first try to split. - */ - if (!list_empty(&folio->_deferred_list)) { - spin_unlock(ptl); - if (!tried_split_before) - split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (tried_split_before) - return 0; - tried_split_before = true; - goto retry_pmd; - } - /* - * So long as that pmd lock is held, the folio cannot - * be racily added to the _deferred_list, because - * page_remove_rmap() will find it still pmdmapped. - */ if (isolate_lru_page(page)) { if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { @@ -9405,6 +9378,9 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) > 1 && + !folio_test_hugetlb(folio) && + !list_empty(&folio->_deferred_list), folio); /* * Nobody should be changing or seriously looking at @@ -9451,7 +9427,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) ug->nr_memory += nr_pages; ug->pgpgout++; - WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = 0; } @@ -9771,7 +9746,6 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - folio_unqueue_deferred_split(folio); folio->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3af340c234d..e5e287404dc0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -606,7 +606,7 @@ void destroy_large_folio(struct folio *folio) return; } - folio_unqueue_deferred_split(folio); + folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); free_the_page(&folio->page, folio_order(folio)); } -- Gitee From 4622ed1914561162eca06ba4d7ed7422ea482371 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 10 Sep 2025 19:08:27 +0800 Subject: [PATCH 006/484] Revert "mm: refactor folio_undo_large_rmappable()" We will fix it later with a full backport. This reverts commit eb6b6d3e1f1e5bb97ef9d15bb21236d514bc0006. 
--- mm/huge_memory.c | 13 ++++++++++++- mm/internal.h | 17 +---------------- mm/page_alloc.c | 4 +++- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4f9bcf6da988..1db33db4a2b9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3101,11 +3101,22 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) return ret; } -void __folio_undo_large_rmappable(struct folio *folio) +void folio_undo_large_rmappable(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; + if (folio_order(folio) <= 1) + return; + + /* + * At this point, there is no one trying to add the folio to + * deferred_list. If folio is not in deferred_list, it's safe + * to check without acquiring the split_queue_lock. + */ + if (data_race(list_empty(&folio->_deferred_list))) + return; + ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { diff --git a/mm/internal.h b/mm/internal.h index a4b42d6791b2..4d33106fb3d7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -625,22 +625,7 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) #endif } -void __folio_undo_large_rmappable(struct folio *folio); -static inline void folio_undo_large_rmappable(struct folio *folio) -{ - if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) - return; - - /* - * At this point, there is no one trying to add the folio to - * deferred_list. If folio is not in deferred_list, it's safe - * to check without acquiring the split_queue_lock. - */ - if (data_race(list_empty(&folio->_deferred_list))) - return; - - __folio_undo_large_rmappable(folio); -} +void folio_undo_large_rmappable(struct folio *folio); static inline struct folio *page_rmappable_folio(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5e287404dc0..ef844c113eb3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -606,7 +606,9 @@ void destroy_large_folio(struct folio *folio) return; } - folio_undo_large_rmappable(folio); + if (folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + mem_cgroup_uncharge(folio); free_the_page(&folio->page, folio_order(folio)); } -- Gitee From 90d63ee7fb009f037b7f3ed15fe5e22774155445 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 27 Sep 2023 11:34:24 +0200 Subject: [PATCH 007/484] mm/swap: Convert to use bdev_open_by_dev() commit 4c6bca43c547fe9bc1d7d1519b1d6430fee2cae2 upstream Conflicts: minor, we renamed p -> si and have some other conflicts. Backport-reason: The helper and the series is already partially backported so backport this too to reduce the gap. Convert swapping code to use bdev_open_by_dev() and pass the handle around. 
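For reference, the open/release pattern this conversion introduces boils down to roughly the following (a condensed sketch of the swapfile.c changes below, with locking and most error handling trimmed):

    struct bdev_handle *handle;

    /* swapon: exclusive open, with the swap_info_struct as holder */
    handle = bdev_open_by_dev(inode->i_rdev,
                              BLK_OPEN_READ | BLK_OPEN_WRITE, si, NULL);
    if (IS_ERR(handle))
        return PTR_ERR(handle);
    si->bdev_handle = handle;   /* kept so swapoff can release it */
    si->bdev = handle->bdev;    /* used for the actual swap I/O */

    /* swapoff (or swapon failure path): drop the exclusive claim */
    bdev_release(si->bdev_handle);
    si->bdev_handle = NULL;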
CC: linux-mm@kvack.org CC: Andrew Morton Acked-by: Christoph Hellwig Acked-by: Christian Brauner Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230927093442.25915-18-jack@suse.cz Signed-off-by: Christian Brauner Signed-off-by: Kairui Song --- include/linux/swap.h | 1 + mm/swapfile.c | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index fed2bdbecfde..2f29842466c8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -316,6 +316,7 @@ struct swap_info_struct { struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ + struct bdev_handle *bdev_handle;/* open handle of the bdev */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 3bb234905302..53647049da5d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2713,11 +2713,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) exit_swap_address_space(p->type); inode = mapping->host; - if (S_ISBLK(inode->i_mode)) { - struct block_device *bdev = I_BDEV(inode); - - set_blocksize(bdev, old_block_size); - blkdev_put(bdev, p); + if (p->bdev_handle) { + set_blocksize(p->bdev, old_block_size); + bdev_release(p->bdev_handle); + p->bdev_handle = NULL; } inode_lock(inode); @@ -2953,13 +2952,14 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) "Swapping on block file over filesystem %s, file system operations may get bypassed unexpectedly and lead to data loss.\n", si->swap_file->f_inode->i_sb->s_id); #endif - si->bdev = blkdev_get_by_dev(inode->i_rdev, + si->bdev_handle = bdev_open_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE, si, NULL); - if (IS_ERR(si->bdev)) { - error = PTR_ERR(si->bdev); - si->bdev = NULL; + if (IS_ERR(si->bdev_handle)) { + error = PTR_ERR(si->bdev_handle); + si->bdev_handle = NULL; return error; } + si->bdev = si->bdev_handle->bdev; si->old_block_size = block_size(si->bdev); error = set_blocksize(si->bdev, PAGE_SIZE); if (error < 0) @@ -3394,9 +3394,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) bad_swap_unlock_inode: inode_unlock(inode); bad_swap: - if (inode && S_ISBLK(inode->i_mode) && si->bdev) { + if (si->bdev_handle) { set_blocksize(si->bdev, si->old_block_size); - blkdev_put(si->bdev, si); + bdev_release(si->bdev_handle); + si->bdev_handle = NULL; } kfree(si->global_cluster); si->global_cluster = NULL; -- Gitee From 1965a2817e4e2ec73ba3ca59f5fceeadb70d1331 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Apr 2024 18:26:54 -0400 Subject: [PATCH 008/484] swapon(2)/swapoff(2): don't bother with block size commit 798cb7f9aec35460c383eab57b9fa474d999a2eb upstream Conflicts: resolved Backport-reason: SWAP updates to reduce the gap. once upon a time that used to matter; these days we do swap IO for swap devices at the level that doesn't give a damn about block size, buffer_head or anything of that sort - just attach the page to bio, set the location and size (the latter to PAGE_SIZE) and feed into queue. 
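For readers unfamiliar with that path: swap I/O builds a bio directly against the block device, so the filesystem block size never enters the picture. A minimal sketch of the idea is below; it is not the literal mm/page_io.c code (which works on folios and completes asynchronously), and the helper names should be read as illustrative rather than exact:

    struct bio bio;
    struct bio_vec bv;

    bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
    bio.bi_iter.bi_sector = swap_page_sector(page); /* location on the device */
    bio_add_page(&bio, page, PAGE_SIZE, 0);         /* size is always PAGE_SIZE */
    submit_bio_wait(&bio);                          /* feed straight into the queue */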
Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Signed-off-by: Al Viro Signed-off-by: Kairui Song --- include/linux/swap.h | 1 - mm/swapfile.c | 13 +------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2f29842466c8..af9740315244 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -319,7 +319,6 @@ struct swap_info_struct { struct bdev_handle *bdev_handle;/* open handle of the bdev */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ - unsigned int old_block_size; /* seldom referenced */ struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like diff --git a/mm/swapfile.c b/mm/swapfile.c index 53647049da5d..4b8936a81a64 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2591,7 +2591,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct inode *inode; struct filename *pathname; int err, found = 0; - unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2692,7 +2691,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) drain_mmlist(); swap_file = p->swap_file; - old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; @@ -2714,7 +2712,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) inode = mapping->host; if (p->bdev_handle) { - set_blocksize(p->bdev, old_block_size); bdev_release(p->bdev_handle); p->bdev_handle = NULL; } @@ -2944,8 +2941,6 @@ static struct swap_info_struct *alloc_swap_info(void) static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) { - int error; - if (S_ISBLK(inode->i_mode)) { #ifdef CONFIG_ENHANCED_MM WARN(si->swap_file->f_mapping->a_ops->swap_activate, @@ -2955,15 +2950,11 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) si->bdev_handle = bdev_open_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE, si, NULL); if (IS_ERR(si->bdev_handle)) { - error = PTR_ERR(si->bdev_handle); + int error = PTR_ERR(si->bdev_handle); si->bdev_handle = NULL; return error; } si->bdev = si->bdev_handle->bdev; - si->old_block_size = block_size(si->bdev); - error = set_blocksize(si->bdev, PAGE_SIZE); - if (error < 0) - return error; /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not @@ -2979,7 +2970,6 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) return 0; } - /* * Find out how many pages are allowed for a single swap device. There * are two limiting factors: @@ -3395,7 +3385,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) inode_unlock(inode); bad_swap: if (si->bdev_handle) { - set_blocksize(si->bdev, si->old_block_size); bdev_release(si->bdev_handle); si->bdev_handle = NULL; } -- Gitee From b4c684d2eab09161fcefb6dbf6915fe10becdf3b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Apr 2024 18:33:34 -0400 Subject: [PATCH 009/484] swapon(2): open swap with O_EXCL commit 51d908b3db0e588aeb2d06df37e4df3fb1754bb5 upstream Conflicts: minor ... eliminating the need to reopen block devices so they could be exclusively held. 
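The observable behaviour should be unchanged: while the device is active as swap it remains exclusively claimed, so another exclusive opener still sees EBUSY. A small userspace illustration (hypothetical device path, assumed to be currently swapped on):

    #include <stdio.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        /* O_EXCL on a block device node requests an exclusive open */
        int fd = open("/dev/vdb", O_RDONLY | O_EXCL);

        if (fd < 0 && errno == EBUSY)
            printf("device is exclusively held (e.g. in use as swap)\n");
        else if (fd >= 0)
            close(fd);      /* not claimed by anyone */
        return 0;
    }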
Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Signed-off-by: Al Viro Signed-off-by: Kairui Song --- include/linux/swap.h | 1 - mm/swapfile.c | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index af9740315244..123752caac13 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -316,7 +316,6 @@ struct swap_info_struct { struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ - struct bdev_handle *bdev_handle;/* open handle of the bdev */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ struct completion comp; /* seldom referenced */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 4b8936a81a64..792b8134cfb9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2711,10 +2711,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) exit_swap_address_space(p->type); inode = mapping->host; - if (p->bdev_handle) { - bdev_release(p->bdev_handle); - p->bdev_handle = NULL; - } inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; @@ -2947,14 +2943,7 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) "Swapping on block file over filesystem %s, file system operations may get bypassed unexpectedly and lead to data loss.\n", si->swap_file->f_inode->i_sb->s_id); #endif - si->bdev_handle = bdev_open_by_dev(inode->i_rdev, - BLK_OPEN_READ | BLK_OPEN_WRITE, si, NULL); - if (IS_ERR(si->bdev_handle)) { - int error = PTR_ERR(si->bdev_handle); - si->bdev_handle = NULL; - return error; - } - si->bdev = si->bdev_handle->bdev; + si->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not @@ -3228,7 +3217,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) name = NULL; goto bad_swap; } - swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); + swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; @@ -3384,10 +3373,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) bad_swap_unlock_inode: inode_unlock(inode); bad_swap: - if (si->bdev_handle) { - bdev_release(si->bdev_handle); - si->bdev_handle = NULL; - } kfree(si->global_cluster); si->global_cluster = NULL; inode = NULL; -- Gitee From 44ebfa02663cac35508d74aabc72b6503b7979d0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 18 Apr 2024 21:56:44 +0800 Subject: [PATCH 010/484] mm: swapfile: check usable swap device in __folio_throttle_swaprate() commit 80e75021486bd2f6463259c4569e2eb7668ae953 upstream Conflicts: minor, we don't have has_usable_swap Backport-reason: SWAP cleanup Skip blk_cgroup_congested() if there is no usable swap device since no swapin/out will occur, Thereby avoid taking swap_lock. 
The difference is shown below from perf date of CoW pagefault, perf report -g -i perf.data.swapon | egrep "blk_cgroup_congested|__folio_throttle_swaprate" 1.01% 0.16% page_fault2_pro [kernel.kallsyms] [k] __folio_throttle_swaprate 0.83% 0.80% page_fault2_pro [kernel.kallsyms] [k] blk_cgroup_congested perf report -g -i perf.data.swapoff | egrep "blk_cgroup_congested|__folio_throttle_swaprate" 0.15% 0.15% page_fault2_pro [kernel.kallsyms] [k] __folio_throttle_swaprate Link: https://lkml.kernel.org/r/20240418135644.2736748-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 792b8134cfb9..3ceeb54dbed7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3790,6 +3790,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +static bool __has_usable_swap(void) +{ + return !plist_head_empty(&swap_active_head); +} + void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; @@ -3798,6 +3803,9 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) if (!(gfp & __GFP_IO)) return; + if (!__has_usable_swap()) + return; + if (!blk_cgroup_congested()) return; -- Gitee From 126eba218aa47c45744fd9e21a6eb840d4e5dee6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:27 +0100 Subject: [PATCH 011/484] mm: move vma_policy() and anon_vma_name() decls to mm_types.h commit 3657fdc2451abf135c2d20949acf57d78cc50338 upstream Conflicts: none Backport-reason: MM updates and clean up to prepare for SWAP updates Patch series "Abstract vma_merge() and split_vma()", v4. The vma_merge() interface is very confusing and its implementation has led to numerous bugs as a result of that confusion. In addition there is duplication both in invocation of vma_merge(), but also in the common mprotect()-style pattern of attempting a merge, then if this fails, splitting the portion of a VMA about to have its attributes changed. This pattern has been copy/pasted around the kernel in each instance where such an operation has been required, each very slightly modified from the last to make it even harder to decipher what is going on. Simplify the whole thing by dividing the actual uses of vma_merge() and split_vma() into specific and abstracted functions and de-duplicate the vma_merge()/split_vma() pattern altogether. Doing so also opens the door to changing how vma_merge() is implemented - by knowing precisely what cases a caller is invoking rather than having a central interface where anything might happen we can untangle the brittle and confusing vma_merge() implementation into something more workable. For mprotect()-like cases we introduce vma_modify() which performs the vma_merge()/split_vma() pattern, returning a pointer to either the merged or split VMA or an ERR_PTR(err) if the splits fail. We provide a number of inline helper functions to make things even clearer:- * vma_modify_flags() - Prepare to modify the VMA's flags. * vma_modify_flags_name() - Prepare to modify the VMA's flags/anon_vma_name * vma_modify_policy() - Prepare to modify the VMA's mempolicy. * vma_modify_flags_uffd() - Prepare to modify the VMA's flags/uffd context. For cases where a new VMA is attempted to be merged with adjacent VMAs we add:- * vma_merge_new_vma() - Prepare to merge a new VMA. 
* vma_merge_extend() - Prepare to extend the end of a new VMA. This patch (of 5): The vma_policy() define is a helper specifically for a VMA field so it makes sense to host it in the memory management types header. The anon_vma_name(), anon_vma_name_alloc() and anon_vma_name_free() functions are a little out of place in mm_inline.h as they define external functions, and so it makes sense to locate them in mm_types.h. The purpose of these relocations is to make it possible to abstract static inline wrappers which invoke both of these helpers. Link: https://lkml.kernel.org/r/cover.1697043508.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/24bfc6c9e382fffbcb0ea8d424392c27d56cc8ca.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mempolicy.h | 4 ---- include/linux/mm_inline.h | 20 +------------------- include/linux/mm_types.h | 27 +++++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 06736d8bbe1e..9d60b739967f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -93,8 +93,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) return pol; } -#define vma_policy(vma) ((vma)->vm_policy) - static inline void mpol_get(struct mempolicy *pol) { if (pol) @@ -226,8 +224,6 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) return NULL; } -#define vma_policy(vma) NULL - static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 720f4fc63b92..e8d0b7224279 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -360,15 +361,6 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) } #ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling anon_vma_name(). Caller should - * either keep holding the lock while using the returned pointer or it should - * raise anon_vma_name refcount before releasing the lock. - */ -extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); -extern struct anon_vma_name *anon_vma_name_alloc(const char *name); -extern void anon_vma_name_free(struct kref *kref); - /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { @@ -423,16 +415,6 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, } #else /* CONFIG_ANON_VMA_NAME */ -static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) -{ - return NULL; -} - -static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) -{ - return NULL; -} - static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f81fc5074755..1e6c0837ed2e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -615,6 +615,27 @@ struct anon_vma_name { char name[]; }; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling anon_vma_name(). 
Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. + */ +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +struct anon_vma_name *anon_vma_name_alloc(const char *name); +void anon_vma_name_free(struct kref *kref); +#else /* CONFIG_ANON_VMA_NAME */ +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + return NULL; +} +#endif + struct vma_lock { struct rw_semaphore lock; }; @@ -766,6 +787,12 @@ struct vm_area_struct { KABI_RESERVE(4); } __randomize_layout; +#ifdef CONFIG_NUMA +#define vma_policy(vma) ((vma)->vm_policy) +#else +#define vma_policy(vma) NULL +#endif + #ifdef CONFIG_SCHED_MM_CID struct mm_cid { u64 time; -- Gitee From 0ab0c834804379b63292159ca463cf5995aa301a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:28 +0100 Subject: [PATCH 012/484] mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al. commit 94d7d923395129b9248777e575c877e40007f9dc upstream Conflicts: trivial due to missing d61ea1cb0095 Backport-reason: MM updates and clean up to prepare for SWAP updates mprotect() and other functions which change VMA parameters over a range each employ a pattern of:- 1. Attempt to merge the range with adjacent VMAs. 2. If this fails, and the range spans a subset of the VMA, split it accordingly. This is open-coded and duplicated in each case. Also in each case most of the parameters passed to vma_merge() remain the same. Create a new function, vma_modify(), which abstracts this operation, accepting only those parameters which can be changed. To avoid the mess of invoking each function call with unnecessary parameters, create inline wrapper functions for each of the modify operations, parameterised only by what is required to perform the action. We can also significantly simplify the logic - by returning the VMA if we split (or merged VMA if we do not) we no longer need specific handling for merge/split cases in any of the call sites. Note that the userfaultfd_release() case works even though it does not split VMAs - since start is set to vma->vm_start and end is set to vma->vm_end, the split logic does not trigger. In addition, since we calculate pgoff to be equal to vma->vm_pgoff + (start - vma->vm_start) >> PAGE_SHIFT, and start - vma->vm_start will be 0 in this instance, this invocation will remain unchanged. We eliminate a VM_WARN_ON() in mprotect_fixup() as this simply asserts that vma_merge() correctly ensures that flags remain the same, something that is already checked in is_mergeable_vma() and elsewhere, and in any case is not specific to mprotect(). Link: https://lkml.kernel.org/r/0dfa9368f37199a423674bf0ee312e8ea0619044.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. 
Howlett Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/userfaultfd.c | 70 ++++++++++++---------------------------------- include/linux/mm.h | 60 +++++++++++++++++++++++++++++++++++++++ mm/madvise.c | 26 +++-------------- mm/mempolicy.c | 26 ++--------------- mm/mlock.c | 25 +++-------------- mm/mmap.c | 48 +++++++++++++++++++++++++++++++ mm/mprotect.c | 29 +++---------------- 7 files changed, 141 insertions(+), 143 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5ceb1fa8eb11..1bcea1be152f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -925,20 +925,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) uffd_wp_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) { - vma = prev; - } else { - prev = vma; - } + vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, + vma->vm_end, new_flags, + NULL_VM_UFFD_CTX); vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + + prev = vma; } mmap_write_unlock(mm); mmput(mm); @@ -1328,7 +1323,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool basic_ioctls; unsigned long start, end, vma_end; struct vma_iterator vmi; - pgoff_t pgoff; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1481,28 +1475,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx }), - anon_vma_name(vma)); - if (prev) { - /* vma_merge() invalidated the mas */ - vma = prev; - goto next; - } - if (vma->vm_start < start) { - ret = split_vma(&vmi, vma, start, 1); - if (ret) - break; - } - if (vma->vm_end > end) { - ret = split_vma(&vmi, vma, end, 0); - if (ret) - break; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; } - next: + /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and @@ -1564,7 +1544,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; - pgoff_t pgoff; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1667,26 +1646,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) { - vma = prev; - goto next; - } - if (vma->vm_start < start) { - ret = split_vma(&vmi, vma, start, 1); - if (ret) - break; - } - if (vma->vm_end > end) { - ret = split_vma(&vmi, vma, end, 0); - if (ret) - break; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, NULL_VM_UFFD_CTX); + if 
(IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; } - next: + /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c0ba76ba17c..83422bebf1ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3288,6 +3288,66 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long vm_flags, + struct mempolicy *policy, + struct vm_userfaultfd_ctx uffd_ctx, + struct anon_vma_name *anon_name); + +/* We are about to modify the VMA's flags. */ +static inline struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); +} + +/* We are about to modify the VMA's flags and/or anon_name. */ +static inline struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); +} + +/* We are about to modify the VMA's memory policy. */ +static inline struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) +{ + return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, + new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); +} + +/* We are about to modify the VMA's flags and/or uffd context. 
*/ +static inline struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), new_ctx, anon_vma_name(vma)); +} static inline int check_data_rlimit(unsigned long rlim, unsigned long new, diff --git a/mm/madvise.c b/mm/madvise.c index 0b467314e32b..47fb2768f2c7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -141,7 +141,6 @@ static int madvise_update_vma(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; int error; - pgoff_t pgoff; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { @@ -149,30 +148,13 @@ static int madvise_update_vma(struct vm_area_struct *vma, return 0; } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_name); - if (*prev) { - vma = *prev; - goto success; - } + vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, + anon_name); + if (IS_ERR(vma)) + return PTR_ERR(vma); *prev = vma; - if (start != vma->vm_start) { - error = split_vma(&vmi, vma, start, 1); - if (error) - return error; - } - - if (end != vma->vm_end) { - error = split_vma(&vmi, vma, end, 0); - if (error) - return error; - } - -success: /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 41b864024999..6369acb62462 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -801,10 +801,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - struct vm_area_struct *merged; unsigned long vmstart, vmend; - pgoff_t pgoff; - int err; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { @@ -819,26 +816,9 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } - pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, new_pol, - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (merged) { - *prev = merged; - return vma_replace_policy(merged, new_pol); - } - - if (vma->vm_start != vmstart) { - err = split_vma(vmi, vma, vmstart, 1); - if (err) - return err; - } - - if (vma->vm_end != vmend) { - err = split_vma(vmi, vma, vmend, 0); - if (err) - return err; - } + vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); + if (IS_ERR(vma)) + return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); diff --git a/mm/mlock.c b/mm/mlock.c index ffb6375f01dd..3044340af784 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -480,7 +480,6 @@ int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - pgoff_t pgoff; int nr_pages; int ret = 0; vm_flags_t oldflags = vma->vm_flags; @@ -491,28 +490,12 @@ int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = 
vma_merge(vmi, mm, *prev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (*prev) { - vma = *prev; - goto success; - } - - if (start != vma->vm_start) { - ret = split_vma(vmi, vma, start, 1); - if (ret) - goto out; - } - - if (end != vma->vm_end) { - ret = split_vma(vmi, vma, end, 0); - if (ret) - goto out; + vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; } -success: /* * Keep track of amount of locked VM. */ diff --git a/mm/mmap.c b/mm/mmap.c index 35cd97e806bf..ddb6172c2598 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2482,6 +2482,54 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +/* + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd + * context and anonymous VMA name within the range [start, end). + * + * As a result, we might be able to merge the newly modified VMA range with an + * adjacent VMA with identical properties. + * + * If no merge is possible and the range does not span the entirety of the VMA, + * we then need to split the VMA to accommodate the change. + * + * The function returns either the merged VMA, the original VMA if a split was + * required instead, or an error if the split failed. + */ +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long vm_flags, + struct mempolicy *policy, + struct vm_userfaultfd_ctx uffd_ctx, + struct anon_vma_name *anon_name) +{ + pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + struct vm_area_struct *merged; + + merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags, + vma->anon_vma, vma->vm_file, pgoff, policy, + uffd_ctx, anon_name); + if (merged) + return merged; + + if (vma->vm_start < start) { + int err = split_vma(vmi, vma, start, 1); + + if (err) + return ERR_PTR(err); + } + + if (vma->vm_end > end) { + int err = split_vma(vmi, vma, end, 0); + + if (err) + return ERR_PTR(err); + } + + return vma; +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator diff --git a/mm/mprotect.c b/mm/mprotect.c index 027a2cb9ca56..0fd7083a7666 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -584,7 +584,6 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; - pgoff_t pgoff; int error; if (newflags == oldflags) { @@ -628,34 +627,14 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, } } - /* - * First try to merge with previous and/or next vma. 
- */ - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (*pprev) { - vma = *pprev; - VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); - goto success; + vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags); + if (IS_ERR(vma)) { + error = PTR_ERR(vma); + goto fail; } *pprev = vma; - if (start != vma->vm_start) { - error = split_vma(vmi, vma, start, 1); - if (error) - goto fail; - } - - if (end != vma->vm_end) { - error = split_vma(vmi, vma, end, 0); - if (error) - goto fail; - } - -success: /* * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. -- Gitee From 3d57a5bd5a520152a451bfd4eb307d14e158b266 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:29 +0100 Subject: [PATCH 013/484] mm: make vma_merge() and split_vma() internal commit adb20b0c785e9e8d5e4d4a07b9c3340d7de0e5dc upstream Conflicts: none Backport-reason: MM updates and clean up to prepare for SWAP updates Now the common pattern of - attempting a merge via vma_merge() and should this fail splitting VMAs via split_vma() - has been abstracted, the former can be placed into mm/internal.h and the latter made static. In addition, the split_vma() nommu variant also need not be exported. Link: https://lkml.kernel.org/r/405f2be10e20c4e9fbcc9fe6b2dfea105f6642e0.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mm.h | 9 --------- mm/internal.h | 9 +++++++++ mm/mmap.c | 8 ++++---- mm/nommu.c | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 83422bebf1ec..89e3e9ec68cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3272,16 +3272,7 @@ extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *next); extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); -extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, - struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, struct anon_vma *, - struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, - struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, - unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/internal.h b/mm/internal.h index 4d33106fb3d7..caf83d9b7e43 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1225,6 +1225,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +/* + * mm/mmap.c + */ +struct vm_area_struct *vma_merge(struct vma_iterator *vmi, + struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, struct anon_vma *, + struct file *, 
pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, + struct anon_vma_name *); + enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, diff --git a/mm/mmap.c b/mm/mmap.c index ddb6172c2598..5f7a88f906cc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2381,8 +2381,8 @@ static void unmap_region(struct mm_struct *mm, struct ma_state *mas, * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the end VMA. */ -int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; @@ -2469,8 +2469,8 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ -int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { #ifdef CONFIG_PID_NS if (vma->vm_mm->map_count >= task_active_pid_ns(current)->max_map_count) diff --git a/mm/nommu.c b/mm/nommu.c index a0edcf833e2d..3479cae48ee1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1311,8 +1311,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; -- Gitee From d6140310ef6f0208ab741974e18ddac29e780bb1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:30 +0100 Subject: [PATCH 014/484] mm: abstract merge for new VMAs into vma_merge_new_vma() commit 4b5f2d20169827add8875e78a352cf956ccdd55a upstream Conflicts: trivial, bdc136e2b05fa added some random space Backport-reason: MM updates and clean up to prepare for SWAP updates Only in mmap_region() and copy_vma() do we attempt to merge VMAs which occupy entirely new regions of virtual memory. We can abstract this logic and make the intent of this invocations of it completely explicit, rather than invoking vma_merge() with an inscrutable wall of parameters. This also paves the way for a simplification of the core vma_merge() implementation, as we seek to make it entirely an implementation detail. The VMA merge call in mmap_region() occurs only for file-backed mappings, where each of the parameters previously specified as NULL are defaulted to NULL in vma_init() (called by vm_area_alloc()). This matches the previous behaviour of specifying NULL for a number of fields, however note that prior to this call we pass the VMA to the file system driver via call_mmap(), which may in theory adjust fields that we pass in to vma_merge_new_vma(). Therefore we actually resolve an oversight here by allowing for the fact that the driver may have done this. Link: https://lkml.kernel.org/r/3dc71d17e307756a54781d4a4ce7315cf8b18bea.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. 
Howlett Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mmap.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5f7a88f906cc..82337d141987 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2530,6 +2530,20 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, return vma; } +/* + * Attempt to merge a newly mapped VMA with those adjacent to it. The caller + * must ensure that [start, end) does not overlap any existing VMA. + */ +static struct vm_area_struct +*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff) +{ + return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2890,11 +2904,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(&vmi, mm, prev, vma->vm_start, - vma->vm_end, vma->vm_flags, NULL, - vma->vm_file, vma->vm_pgoff, NULL, - NULL_VM_UFFD_CTX, NULL); - + merge = vma_merge_new_vma(&vmi, prev, vma, + vma->vm_start, vma->vm_end, + vma->vm_pgoff); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3500,9 +3512,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); if (new_vma) { /* * Source vma may have been merged into new_vma -- Gitee From 6258e4197dadf1a159dc7382692f456de7f04d6e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:31 +0100 Subject: [PATCH 015/484] mm: abstract VMA merge and extend into vma_merge_extend() helper commit 93bf5d4aa27d4ba528f71483ae51fbd70edb3ce8 upstream Conflicts: minor, due to missing e346b3813067 and its fixes Backport-reason: MM updates and clean up to prepare for SWAP updates mremap uses vma_merge() in the case where a VMA needs to be extended. This can be significantly simplified and abstracted. This makes it far easier to understand what the actual function is doing, avoids future mistakes in use of the confusing vma_merge() function and importantly allows us to make future changes to how vma_merge() is implemented by knowing explicitly which merge cases each invocation uses. Note that in the mremap() extend case, we perform this merge only when old_len == vma->vm_end - addr. The extension_start, i.e. the start of the extended portion of the VMA is equal to addr + old_len, i.e. vma->vm_end. With this refactoring, vma_merge() is no longer required anywhere except mm/mmap.c, so mark it static. Link: https://lkml.kernel.org/r/f16cbdc2e72d37a1a097c39dc7d1fee8919a1c93.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. 
Howlett Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/internal.h | 8 +++----- mm/mmap.c | 31 ++++++++++++++++++++++++------- mm/mremap.c | 30 +++++++++++++----------------- 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index caf83d9b7e43..1aed3aa88a21 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1228,11 +1228,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, /* * mm/mmap.c */ -struct vm_area_struct *vma_merge(struct vma_iterator *vmi, - struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, struct anon_vma *, - struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, - struct anon_vma_name *); +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); enum { /* mark page accessed */ diff --git a/mm/mmap.c b/mm/mmap.c index 82337d141987..17a57cd6dec8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -859,13 +859,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL */ -struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +static struct vm_area_struct +*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, unsigned long end, + unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; @@ -2544,6 +2544,23 @@ static struct vm_area_struct vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } +/* + * Expand vma by delta bytes, potentially merging with an immediately adjacent + * VMA with identical properties. + */ +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta) +{ + pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); + + /* vma is specified as prev, so case 1 or 2 will apply. */ + return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta, + vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator diff --git a/mm/mremap.c b/mm/mremap.c index 71c78c600c6b..23bf8fbfb86c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1044,14 +1044,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* old_len exactly to the end of the area.. */ if (old_len == vma->vm_end - addr) { + unsigned long delta = new_len - old_len; + /* can we just expand the current mapping? 
*/ - if (vma_expandable(vma, new_len - old_len)) { - long pages = (new_len - old_len) >> PAGE_SHIFT; - unsigned long extension_start = addr + old_len; - unsigned long extension_end = addr + new_len; - pgoff_t extension_pgoff = vma->vm_pgoff + - ((extension_start - vma->vm_start) >> PAGE_SHIFT); - VMA_ITERATOR(vmi, mm, extension_start); + if (vma_expandable(vma, delta)) { + long pages = delta >> PAGE_SHIFT; + VMA_ITERATOR(vmi, mm, vma->vm_end); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1061,17 +1059,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* - * Function vma_merge() is called on the extension we - * are adding to the already existing vma, vma_merge() - * will merge this extension with the already existing - * vma (expand operation itself) and possibly also with - * the next vma if it becomes adjacent to the expanded - * vma and otherwise compatible. + * Function vma_merge_extend() is called on the + * extension we are adding to the already existing vma, + * vma_merge_extend() will merge this extension with the + * already existing vma (expand operation itself) and + * possibly also with the next vma if it becomes + * adjacent to the expanded vma and otherwise + * compatible. */ - vma = vma_merge(&vmi, mm, vma, extension_start, - extension_end, vma->vm_flags, vma->anon_vma, - vma->vm_file, extension_pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma = vma_merge_extend(&vmi, vma, delta); if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; -- Gitee From 81a95d5f1ab18e906cfc391dd7270b9ab3d11c24 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Nov 2023 19:23:17 +0200 Subject: [PATCH 016/484] mm: list_lru: Update kernel documentation to follow the requirements commit 7679e14098c9c3c8118a7130d6e1e9cfe2565c04 upstream Conflicts: none Backport-reason: ZSWAP updates to be consistent with SWAP kernel-doc is not happy about documentation in list_lru.h: list_lru.h:90: warning: Function parameter or member 'lru' not described in 'list_lru_add' list_lru.h:90: warning: Excess function parameter 'list_lru' description in 'list_lru_add' list_lru.h:90: warning: No description found for return value of 'list_lru_add' list_lru.h:103: warning: Function parameter or member 'lru' not described in 'list_lru_del' list_lru.h:103: warning: Excess function parameter 'list_lru' description in 'list_lru_del' list_lru.h:103: warning: No description found for return value of 'list_lru_del' list_lru.h:116: warning: No description found for return value of 'list_lru_count_one' list_lru.h:168: warning: No description found for return value of 'list_lru_walk_one' list_lru.h:185: warning: No description found for return value of 'list_lru_walk_one_irq' Fix the documentation accordingly. While at it, fix the references to the parameters in functions inside the long descriptions, on which the above script is not complaining (yet?). 
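[Editorial illustration, not part of the original commit message.] To make the kernel-doc convention concrete: every @-tag must name an actual parameter of the prototype, and the return value goes under a literal "Return:" section. After this fix the list_lru_add() block has the shape below (middle text elided):

  /**
   * list_lru_add: add an element to the lru list's tail
   * @lru: the lru pointer
   * @item: the item to be added.
   *
   * ...
   *
   * Return: true if the list was updated, false otherwise
   */
  bool list_lru_add(struct list_lru *lru, struct list_head *item);

This is the form scripts/kernel-doc checks for, which silences the "parameter not described" and "no description found for return value" warnings quoted above.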
Link: https://lkml.kernel.org/r/20231123172320.2434780-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/list_lru.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index b35968ee9fb5..db86ad78d428 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -73,7 +73,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren /** * list_lru_add: add an element to the lru list's tail - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be added. * * If the element is already part of a list, this function returns doing @@ -83,22 +83,22 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * the caller organize itself in a way that elements can be in more than * one type of list, it is up to the caller to fully remove the item from * the previous list (with list_lru_del() for instance) before moving it - * to @list_lru + * to @lru. * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add(struct list_lru *lru, struct list_head *item); /** * list_lru_del: delete an element to the lru list - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be deleted. * - * This function works analogously as list_lru_add in terms of list + * This function works analogously as list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del. + * a list are also valid for list_lru_del(). * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_del(struct list_lru *lru, struct list_head *item); @@ -108,9 +108,11 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item); * @nid: the node id to count from. * @memcg: the cgroup to count from. * - * Always return a non-negative number, 0 for empty lists. There is no - * guarantee that the list is not updated while the count is being computed. - * Callers that want such a guarantee need to provide an outer lock. + * There is no guarantee that the list is not updated while the count is being + * computed. Callers that want such a guarantee need to provide an outer lock. + * + * Return: 0 for empty lists, otherwise the number of objects + * currently held by @lru. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); @@ -141,7 +143,7 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); /** - * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -150,24 +152,24 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. 
* - * This function will scan all elements in a particular list_lru, calling the + * This function will scan all elements in a particular @lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. The callback - * will return an enum lru_status telling the list_lru infrastructure what to + * will return an enum lru_status telling the @lru infrastructure what to * do with the object being scanned. * - * Please note that nr_to_walk does not mean how many objects will be freed, + * Please note that @nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * - * Return value: the number of objects effectively removed from the LRU. + * Return: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** - * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -176,7 +178,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * Same as @list_lru_walk_one except that the spinlock is acquired with + * Same as list_lru_walk_one() except that the spinlock is acquired with * spin_lock_irq(). */ unsigned long list_lru_walk_one_irq(struct list_lru *lru, -- Gitee From b68b1b421d3932d43c45f0abf4fcf8d34b2543a0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:15:09 -0700 Subject: [PATCH 017/484] hugetlbfs: drop shared NUMA mempolicy pretence commit 10969b5571387047cd461a9c701b8b9cd007f6c7 upstream Conflicts: none Backport-reason: Mempolicy updates for cleaning up shmem SWAP Patch series "mempolicy: cleanups leading to NUMA mpol without vma", v2. Mostly cleanups in mm/mempolicy.c, but finally removing the pseudo-vma from shmem folio allocation, and removing the mmap_lock around folio migration for mbind and migrate_pages syscalls. This patch (of 12): hugetlbfs_fallocate() goes through the motions of pasting a shared NUMA mempolicy onto its pseudo-vma, but how could there ever be a shared NUMA mempolicy for this file? hugetlb_vm_ops has never offered a set_policy method, and hugetlbfs_parse_param() has never supported any mpol options for a mount-wide default policy. It's just an illusion: clean it away so as not to confuse others, giving us more freedom to adjust shmem's set_policy/get_policy implementation. But hugetlbfs_inode_info is still required, just to accommodate seals. Yes, shared NUMA mempolicy support could be added to hugetlbfs, with a set_policy method and/or mpol mount option (Andi's first posting did include an admitted-unsatisfactory hugetlb_set_policy()); but it seems that nobody has bothered to add that in the nineteen years since v2.6.7 made it possible, and there is at least one company that has invested enough into hugetlbfs, that I guess they have learnt well enough how to manage its NUMA, without needing shared mempolicy. 
Remove linux/mempolicy.h from linux/hugetlb.h: include linux/pagemap.h in its place, because hugetlb.h's recently added use of filemap_lock_folio() requires that (although most .configs and .c's get it in some other way). Link: https://lkml.kernel.org/r/ebc0987e-beff-8bfb-9283-234c2cbd17c5@google.com Link: https://lkml.kernel.org/r/cae82d4b-904a-faaf-282a-34fcc188c81f@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/hugetlbfs/inode.c | 41 +---------------------------------------- include/linux/hugetlb.h | 3 +-- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 36ac4e536bcb..13338589ff67 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -83,29 +83,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { {} }; -#ifdef CONFIG_NUMA -static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) -{ - vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, - index); -} - -static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) -{ - mpol_cond_put(vma->vm_policy); -} -#else -static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) -{ -} - -static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) -{ -} -#endif - /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. @@ -864,8 +841,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* * Initialize a pseudo vma as this is required by the huge page - * allocation routines. If NUMA is configured, use page index - * as input to create an allocation policy. + * allocation routines. */ vma_init(&pseudo_vma, mm); vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); @@ -913,9 +889,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - hugetlb_set_vma_policy(&pseudo_vma, inode, index); folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); - hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); @@ -1294,18 +1268,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } - - /* - * Any time after allocation, hugetlbfs_destroy_inode can be called - * for the inode. mpol_free_shared_policy is unconditionally called - * as part of hugetlbfs_destroy_inode. So, initialize policy here - * in case of a quick call to destroy. - * - * Note that the policy is initialized even if we are creating a - * private inode. This simplifies hugetlbfs_destroy_inode. 
- */ - mpol_shared_policy_init(&p->policy, NULL); - return &p->vfs_inode; } @@ -1317,7 +1279,6 @@ static void hugetlbfs_free_inode(struct inode *inode) static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); - mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); } static const struct address_space_operations hugetlbfs_aops = { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cd7c15ddbc12..f63730c3e2a1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -30,7 +30,7 @@ void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE -#include +#include #include #include @@ -547,7 +547,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) } struct hugetlbfs_inode_info { - struct shared_policy policy; struct inode vfs_inode; unsigned int seals; }; -- Gitee From a7553f25166d853e0671502f47e0151edef9e05d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:16:29 -0700 Subject: [PATCH 018/484] kernfs: drop shared NUMA mempolicy hooks commit 4b981bc1aa73c204c2aa7f99b5f4f74d03b0e381 upstream Conflicts: none Backport-reason: Mempolicy updates for cleaning up shmem SWAP It seems strange that kernfs should be an outlier with a set_policy and get_policy in its kernfs_vm_ops. Ah, it dates back to v2.6.30's commit 095160aee954 ("sysfs: fix some bin_vm_ops errors"), when I had crashed on powerpc's pci_mmap_legacy_page_range() fallback to shmem_zero_setup(). Well, that was commendably thorough, to give sysfs-bin a set_policy and get_policy, just to avoid the way it was coded resulting in EINVAL from mmap when CONFIG_NUMA; but somehow feels a bit over-the-top to me now. It's easier to say that nobody should expect to manage a shmem object's shared NUMA mempolicy via some kernfs backdoor to that object: delete that code (and there's no longer an EINVAL from mmap in the NUMA case). This then leaves set_policy/get_policy as implemented only by shmem - though importantly also by SysV SHM, which has to interface with shmem which implements them, and with SHM_HUGETLB which does not. 
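[Editorial aside, not part of Hugh's message.] For orientation, the hooks deleted below are the optional NUMA members of struct vm_operations_struct, whose signatures can be read off the removed kernfs wrappers:

  int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
  struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                  unsigned long addr);

After this patch only mm/shmem.c still fills them in (via shmem_vm_ops under CONFIG_NUMA), which is what keeps the SysV SHM shared-policy interface described above working.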
Link: https://lkml.kernel.org/r/302164-a760-4a9e-879b-6870c9b4013@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/kernfs/file.c | 49 ------------------------------------------------ 1 file changed, 49 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 6b90fea6cca2..aa4b9da67437 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -429,60 +429,11 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, return ret; } -#ifdef CONFIG_NUMA -static int kernfs_vma_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!kernfs_get_active(of->kn)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - kernfs_put_active(of->kn); - return ret; -} - -static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!kernfs_get_active(of->kn)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr); - - kernfs_put_active(of->kn); - return pol; -} - -#endif - static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, -#ifdef CONFIG_NUMA - .set_policy = kernfs_vma_set_policy, - .get_policy = kernfs_vma_get_policy, -#endif }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) -- Gitee From a74d9ab72318d518f8b20f48fc35d5bb6b5b1177 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:19:00 -0700 Subject: [PATCH 019/484] mempolicy trivia: delete those ancient pr_debug()s commit 7f1ee4e2070883d18a431c761db8bb30a958b654 upstream Conflicts: none Backport-reason: Mempolicy updates for cleaning up shmem SWAP Delete those ancient pr_debug()s - PDprintk()s in Andi Kleen's original submission of core NUMA API, and useful when debugging shared mempolicy lifetime back then, but not used recently. Link: https://lkml.kernel.org/r/f25135-ffb2-40d8-9577-720772b333@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mempolicy.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6369acb62462..6843e9efdb1d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -268,9 +268,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, { struct mempolicy *policy; - pr_debug("setting mode %d flags %d nodes[0] %lx\n", - mode, flags, nodes ? 
nodes_addr(*nodes)[0] : NUMA_NO_NODE); - if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -771,11 +768,6 @@ static int vma_replace_policy(struct vm_area_struct *vma, vma_assert_write_locked(vma); - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); @@ -1279,10 +1271,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", - start, start + len, mode, mode_flags, - nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { @@ -2491,8 +2479,6 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, - new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ @@ -2631,7 +2617,6 @@ void mpol_put_task_policy(struct task_struct *task) static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); sp_free(n); } @@ -2788,12 +2773,6 @@ int mpol_set_shared_policy(struct shared_policy *info, struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", - vma->vm_pgoff, - sz, npol ? npol->mode : -1, - npol ? npol->flags : -1, - npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE); - if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); if (!new) -- Gitee From 0cb6c14525b1e8939dc058ce6bb971da2f25aa50 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:20:14 -0700 Subject: [PATCH 020/484] mempolicy trivia: slightly more consistent naming commit c36f6e6dff4d32ec8b6da8f553933727a57a7a4a upstream Conflicts: none Backport-reason: Mempolicy updates for cleaning up shmem SWAP Before getting down to work, do a little cleanup, mainly of inconsistent variable naming. I gave up trying to rationalize mpol versus pol versus policy, and node versus nid, but let's avoid p and nd. Remove a few superfluous blank lines, but add one; and here prefer vma->vm_policy to vma_policy(vma) - the latter being appropriate in other sources, which have to allow for !CONFIG_NUMA. That intriguing line about KERNEL_DS? should have gone in v2.6.15, when numa_policy_init() stopped using set_mempolicy(2)'s system call handler. 
Link: https://lkml.kernel.org/r/68287974-b6ae-7df-4ba-d19ddd69cbf@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mempolicy.h | 11 +++--- mm/mempolicy.c | 73 ++++++++++++++++++--------------------- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 9d60b739967f..f4188b54f7a6 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -128,10 +128,9 @@ struct shared_policy { int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, - struct mempolicy *new); -void mpol_free_shared_policy(struct shared_policy *p); +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *mpol); +void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx); @@ -195,7 +194,7 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) return true; } -static inline void mpol_put(struct mempolicy *p) +static inline void mpol_put(struct mempolicy *pol) { } @@ -214,7 +213,7 @@ static inline void mpol_shared_policy_init(struct shared_policy *sp, { } -static inline void mpol_free_shared_policy(struct shared_policy *p) +static inline void mpol_free_shared_policy(struct shared_policy *sp) { } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6843e9efdb1d..9172424de87a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -25,7 +25,7 @@ * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * - * preferred Try a specific node first before normal fallback. + * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default @@ -52,7 +52,7 @@ * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * - * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ @@ -295,6 +295,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); @@ -307,11 +308,11 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, } /* Slow path of a mpol destructor. 
*/ -void __mpol_put(struct mempolicy *p) +void __mpol_put(struct mempolicy *pol) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!atomic_dec_and_test(&pol->refcnt)) return; - kmem_cache_free(policy_cache, p); + kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) @@ -368,7 +369,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) * * Called with task's alloc_lock held. */ - void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); @@ -379,7 +379,6 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ - void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; @@ -760,7 +759,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, - struct mempolicy *pol) + struct mempolicy *pol) { int err; struct mempolicy *old; @@ -803,7 +802,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, vmstart = vma->vm_start; } - if (mpol_equal(vma_policy(vma), new_pol)) { + if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } @@ -858,18 +857,18 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, * * Called with task's alloc_lock held */ -static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) +static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); - if (p == &default_policy) + if (pol == &default_policy) return; - switch (p->mode) { + switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: - *nodes = p->nodes; + *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ @@ -1640,7 +1639,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, out_put: put_task_struct(task); goto out; - } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, @@ -1650,7 +1648,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } - /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, @@ -1833,10 +1830,10 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) * policy_node() is always coupled with policy_nodemask(), which * secures the nodemask limit for 'bind' and 'prefer-many' policy. 
*/ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) +static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) { if (policy->mode == MPOL_PREFERRED) { - nd = first_node(policy->nodes); + nid = first_node(policy->nodes); } else { /* * __GFP_THISNODE shouldn't even be used with the bind policy @@ -1851,19 +1848,18 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) policy->home_node != NUMA_NO_NODE) return policy->home_node; - return nd; + return nid; } /* Do dynamic interleaving for a process */ -static unsigned interleave_nodes(struct mempolicy *policy) +static unsigned int interleave_nodes(struct mempolicy *policy) { - unsigned next; - struct task_struct *me = current; + unsigned int nid; - next = next_node_in(me->il_prev, policy->nodes); - if (next < MAX_NUMNODES) - me->il_prev = next; - return next; + nid = next_node_in(current->il_prev, policy->nodes); + if (nid < MAX_NUMNODES) + current->il_prev = nid; + return nid; } /* @@ -2342,7 +2338,7 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { - struct mempolicy *pol = mpol_dup(vma_policy(src)); + struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); @@ -2766,40 +2762,40 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) } } -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, struct mempolicy *npol) +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - if (npol) { - new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (pol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } - err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ -void mpol_free_shared_policy(struct shared_policy *p) +void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; - if (!p->root.rb_node) + if (!sp->root.rb_node) return; - write_lock(&p->lock); - next = rb_first(&p->root); + write_lock(&sp->lock); + next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - sp_delete(p, n); + sp_delete(sp, n); } - write_unlock(&p->lock); + write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING @@ -2849,7 +2845,6 @@ static inline void __init check_numabalancing_enable(void) } #endif /* CONFIG_NUMA_BALANCING */ -/* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; @@ -2912,7 +2907,6 @@ void numa_default_policy(void) /* * Parse and format mempolicy from/to strings */ - static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -2923,7 +2917,6 @@ static const char * const policy_modes[] = [MPOL_PREFERRED_MANY] = "prefer (many)", }; - #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 
-- Gitee From 1f6761fe2e20854251ec9435948a8f6430da68a8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:21:34 -0700 Subject: [PATCH 021/484] mempolicy trivia: use pgoff_t in shared mempolicy tree commit 93397c3b7684555b7cec726cd13eef6742d191fe upstream Conflicts: none Backport-reason: Mempolicy updates for cleaning up shmem SWAP Prefer the more explicit "pgoff_t" to "unsigned long" when dealing with a shared mempolicy tree. Delete confusing comment about pseudo mm vmas. Link: https://lkml.kernel.org/r/5451157-3818-4af5-fd2c-5d26a5d1dc53@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mempolicy.h | 20 +++++++------------- mm/mempolicy.c | 12 ++++++------ 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index f4188b54f7a6..eca72a4bbacf 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -109,22 +109,16 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) /* * Tree of shared policies for a shared memory region. - * Maintain the policies in a pseudo mm that contains vmas. The vmas - * carry the policy. As a special twist the pseudo mm is indexed in pages, not - * bytes, so that we can work with shared memory segments bigger than - * unsigned long. */ - -struct sp_node { - struct rb_node nd; - unsigned long start, end; - struct mempolicy *policy; -}; - struct shared_policy { struct rb_root root; rwlock_t lock; }; +struct sp_node { + struct rb_node nd; + pgoff_t start, end; + struct mempolicy *policy; +}; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); @@ -132,7 +126,7 @@ int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, - unsigned long idx); + pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, @@ -218,7 +212,7 @@ static inline void mpol_free_shared_policy(struct shared_policy *sp) } static inline struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9172424de87a..18b34f2e2e64 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2423,8 +2423,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * lookup first element intersecting start-end. 
Caller holds sp->lock for * reading or for writing */ -static struct sp_node * -sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +static struct sp_node *sp_lookup(struct shared_policy *sp, + pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; @@ -2478,8 +2478,8 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } /* Find shared policy intersecting idx */ -struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, + pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; @@ -2647,8 +2647,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, } /* Replace a policy range. */ -static int shared_policy_replace(struct shared_policy *sp, unsigned long start, - unsigned long end, struct sp_node *new) +static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, + pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; -- Gitee From ce397e63a5c96750d061c98f6cb70ea9acaa0de7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:22:59 -0700 Subject: [PATCH 022/484] mempolicy: mpol_shared_policy_init() without pseudo-vma commit 35ec8fa0207b3c7f7c3c22337c9a507d7b291626 upstream Conflicts: none Backport-reason: Mempolicy updates for cleaning up shmem SWAP mpol_shared_policy_init() does not need to use a pseudo-vma: it can use sp_alloc() and sp_insert() directly, since the object's shared policy tree is empty and inaccessible (needing no lock) at get_inode() time. Link: https://lkml.kernel.org/r/3bef62d8-ae78-4c2-533-56a44ae425c@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mempolicy.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 18b34f2e2e64..9e8e1b716193 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2731,30 +2731,30 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) rwlock_init(&sp->lock); if (mpol) { - struct vm_area_struct pvma; - struct mempolicy *new; + struct sp_node *sn; + struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; - /* contextualize the tmpfs mount point mempolicy */ - new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) + + /* contextualize the tmpfs mount point mempolicy to this file */ + npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) - goto put_new; - - /* Create pseudo-vma that contains just the policy */ - vma_init(&pvma, NULL); - pvma.vm_end = TASK_SIZE; /* policy covers entire file */ - mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ - -put_new: - mpol_put(new); /* drop initial ref */ + goto put_npol; + + /* alloc node covering entire file; adds ref to file's npol */ + sn = sp_alloc(0, 
MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); + if (sn) + sp_insert(sp, sn); +put_npol: + mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: -- Gitee From 1dc898bc3118f55065332fd3db4c428f0b9289b9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:24:18 -0700 Subject: [PATCH 023/484] mempolicy: remove confusing MPOL_MF_LAZY dead code commit 2cafb582173f3870240af90de3f31d18b0728882 upstream Conflicts: none Backport-reason: Mempolicy updates for cleaning up shmem SWAP v3.8 commit b24f53a0bea3 ("mm: mempolicy: Add MPOL_MF_LAZY") introduced MPOL_MF_LAZY, and included it in the MPOL_MF_VALID flags; but a720094ded8 ("mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now") immediately removed it from MPOL_MF_VALID flags, pending further review. "This will need to be revisited", but it has not been reinstated. The present state is confusing: there is dead code in mm/mempolicy.c to handle MPOL_MF_LAZY cases which can never occur. Remove that: it can be resurrected later if necessary. But keep the definition of MPOL_MF_LAZY, which must remain in the UAPI, even though it always fails with EINVAL. https://lore.kernel.org/linux-mm/1553041659-46787-1-git-send-email-yang.shi@linux.alibaba.com/ links to a previous request to remove MPOL_MF_LAZY. Link: https://lkml.kernel.org/r/80c9665c-1c3f-17ba-21a3-f6115cebf7d@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Yang Shi Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/uapi/linux/mempolicy.h | 2 +- mm/mempolicy.c | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 046d0ccba4cd..a8963f7ef4c2 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -48,7 +48,7 @@ enum { #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to policy */ #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ -#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ +#define MPOL_MF_LAZY (1<<3) /* UNSUPPORTED FLAG: Lazy migrate on fault */ #define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ #define MPOL_MF_VALID (MPOL_MF_STRICT | \ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9e8e1b716193..21ad2bdd8ae1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -639,12 +639,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, return nr_updated; } -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, @@ -683,14 +677,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (endvma > end) endvma = end; - if (flags & MPOL_MF_LAZY) { - /* Similar to task_numa_work, skip inaccessible VMAs */ - if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && - !(vma->vm_flags & VM_MIXEDMAP)) - change_prot_numa(vma, start, endvma); - return 1; - } - /* * Check page nodes, and queue pages to move, in the current vma. 
* But if no moving, and no strict checking, the scan can be skipped. @@ -1260,9 +1246,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); - if (flags & MPOL_MF_LAZY) - new->flags |= MPOL_F_MOF; - /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1307,7 +1290,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (!err) { if (!list_empty(&pagelist)) { - WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed |= migrate_pages(&pagelist, new_folio, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } -- Gitee From 48cc3cfa02bbd871a1f2222d2318a2802b571970 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:25:38 -0700 Subject: [PATCH 024/484] shmem: shrink shmem_inode_info: dir_offsets in a union commit ee615d4585cfc305bf6c218a62123c3051f8b4a3 upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP Patch series "shmem,tmpfs: general maintenance". Mostly just cosmetic mods in mm/shmem.c, but the last two enforcing the "size=" limit better. 8/8 goes into percpu counter territory, and could stand alone. This patch (of 8): Shave 32 bytes off (the 64-bit) shmem_inode_info. There was a 4-byte pahole after stop_eviction, better filled by fsflags. And the 24-byte dir_offsets can only be used by directories, whereas shrinklist and swaplist only by shmem_mapping() inodes (regular files or long symlinks): so put those into a union. No change in mm/shmem.c is required for this. Link: https://lkml.kernel.org/r/c7441dc6-f3bb-dd60-c670-9f5cbd9f266@google.com Link: https://lkml.kernel.org/r/86ebb4b-c571-b9e8-27f5-cb82ec50357e@google.com Signed-off-by: Hugh Dickins Reviewed-by: Chuck Lever Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shmem_fs.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 134c686c8676..f0c6bf982832 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -23,18 +23,22 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ - pgoff_t fallocend; /* highest fallocate endindex */ - struct list_head shrinklist; /* shrinkable hpage inodes */ - struct list_head swaplist; /* chain of maybes on swap */ + union { + struct offset_ctx dir_offsets; /* stable directory offsets */ + struct { + struct list_head shrinklist; /* shrinkable hpage inodes */ + struct list_head swaplist; /* chain of maybes on swap */ + }; + }; + struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ + pgoff_t fallocend; /* highest fallocate endindex */ + unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ - struct timespec64 i_crtime; /* file creation time */ - unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ #ifdef CONFIG_TMPFS_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif - struct offset_ctx dir_offsets; /* stable entry offsets */ struct inode vfs_inode; }; -- Gitee From 0ebd44d7f37c222565120dc2e78b4cfd899b76de Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:26:53 -0700 Subject: [PATCH 025/484] shmem: remove vma arg from shmem_get_folio_gfp() commit e3e1a5067fd2f1b3f4f7c651f5b33082962d1aa1 upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP The vma is already there in vmf->vma, so no need for a separate arg. Link: https://lkml.kernel.org/r/d9ce6f65-a2ed-48f4-4299-fdb0544875c5@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 10cea325bef0..46290587c30b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1962,14 +1962,13 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vma, vmf, and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) + struct vm_fault *vmf, vm_fault_t *fault_type) { + struct vm_area_struct *vma = vmf ? 
vmf->vma : NULL; struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; @@ -2182,7 +2181,7 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, foliop, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } /* @@ -2266,7 +2265,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) } err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, - gfp, vma, vmf, &ret); + gfp, vmf, &ret); if (err) return vmf_error(err); if (folio) @@ -4934,7 +4933,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, BUG_ON(!shmem_mapping(mapping)); error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, - gfp, NULL, NULL, NULL); + gfp, NULL, NULL); if (error) return ERR_PTR(error); -- Gitee From e34f70d6dbda99295adcddf1b5af361a0969874c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:27:53 -0700 Subject: [PATCH 026/484] shmem: factor shmem_falloc_wait() out of shmem_fault() commit f0a9ad1d4d9ba3c694bca91d8d67be9a4a33b902 upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP That Trinity livelock shmem_falloc avoidance block is unlikely, and a distraction from the proper business of shmem_fault(): separate it out. (This used to help compilers save stack on the fault path too, but both gcc and clang nowadays seem to make better choices anyway.) Link: https://lkml.kernel.org/r/6fe379a4-6176-9225-9263-fe60d2633c0@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 126 +++++++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 46290587c30b..b966b2b748c2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2189,87 +2189,99 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } +/* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_rwsem. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_rwsem in fault, + * and bloating every shmem inode for this unlikely case would be sad. 
+ */ +static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) +{ + struct shmem_falloc *shmem_falloc; + struct file *fpin = NULL; + vm_fault_t ret = 0; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); + + ret = VM_FAULT_NOPAGE; + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + } + spin_unlock(&inode->i_lock); + if (fpin) { + fput(fpin); + ret = VM_FAULT_RETRY; + } + return ret; +} + static vm_fault_t shmem_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; + vm_fault_t ret = 0; int err; - vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can - * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_rwsem. So refrain from - * faulting pages into the hole while it's being punched. Although - * shmem_undo_range() does remove the additions, it may be unable to - * keep up, as each new page needs its own unmap_mapping_range() call, - * and the i_mmap tree grows ever slower to scan if new vmas are added. - * - * It does not matter if we sometimes reach this check just before the - * hole-punch begins, so that one fault then races with the punch: - * we just need to make racing faults a rare case. - * - * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_rwsem in fault, - * and bloating every shmem inode for this unlikely case would be sad. + * prevent the hole-punch from ever completing: noted in i_private. */ if (unlikely(inode->i_private)) { - struct shmem_falloc *shmem_falloc; - - spin_lock(&inode->i_lock); - shmem_falloc = inode->i_private; - if (shmem_falloc && - shmem_falloc->waitq && - vmf->pgoff >= shmem_falloc->start && - vmf->pgoff < shmem_falloc->next) { - struct file *fpin; - wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); - - ret = VM_FAULT_NOPAGE; - fpin = maybe_unlock_mmap_for_io(vmf, NULL); - if (fpin) - ret = VM_FAULT_RETRY; - - shmem_falloc_waitq = shmem_falloc->waitq; - prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - schedule(); - - /* - * shmem_falloc_waitq points into the shmem_fallocate() - * stack of the hole-punching task: shmem_falloc_waitq - * is usually invalid by the time we reach here, but - * finish_wait() does not dereference it in that case; - * though i_lock needed lest racing with wake_up_all(). 
- */ - spin_lock(&inode->i_lock); - finish_wait(shmem_falloc_waitq, &shmem_fault_wait); - spin_unlock(&inode->i_lock); - - if (fpin) - fput(fpin); + ret = shmem_falloc_wait(vmf, inode); + if (ret) return ret; - } - spin_unlock(&inode->i_lock); } + WARN_ON_ONCE(vmf->page != NULL); err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); - if (folio) + if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); + ret |= VM_FAULT_LOCKED; + } return ret; } -- Gitee From 4f84e7df1b869a53a82bf081f9db8fad6aeb6596 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:28:50 -0700 Subject: [PATCH 027/484] shmem: trivial tidyups, removing extra blank lines, etc commit 9be7d5b06648b808989e99c5d0bea1be47c5a384 upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP Mostly removing a few superfluous blank lines, joining short arglines, imposing some 80-column observance, correcting a couple of comments. None of it more interesting than deleting a repeated INIT_LIST_HEAD(). Link: https://lkml.kernel.org/r/b3983d28-5d3f-8649-36af-b819285d7a9e@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 56 ++++++++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b966b2b748c2..8121392af871 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -764,7 +764,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Like filemap_add_folio, but error if expected item has gone. + * Somewhat like filemap_add_folio, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, @@ -833,7 +833,7 @@ static int shmem_add_to_page_cache(struct folio *folio, } /* - * Like delete_from_page_cache, but substitutes swap for @folio. + * Somewhat like filemap_remove_folio, but substitutes swap for @folio. 
*/ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { @@ -895,7 +895,6 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, cond_resched_rcu(); } } - rcu_read_unlock(); return swapped << PAGE_SHIFT; @@ -1238,7 +1237,6 @@ static int shmem_setattr(struct mnt_idmap *idmap, if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); - if (error) return error; } @@ -2494,7 +2492,6 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, if (err) return ERR_PTR(err); - inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); @@ -2519,11 +2516,10 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, shmem_set_inode_flags(inode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); - INIT_LIST_HEAD(&info->swaplist); - if (sbinfo->noswap) - mapping_set_unevictable(inode->i_mapping); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); + if (sbinfo->noswap) + mapping_set_unevictable(inode->i_mapping); mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { @@ -2735,7 +2731,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, } ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); - if (ret) return ret; @@ -3267,8 +3262,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, error = simple_acl_create(dir, inode); if (error) goto out_iput; - error = security_inode_init_security(inode, dir, - &dentry->d_name, + error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; @@ -3297,14 +3291,11 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); - if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } - - error = security_inode_init_security(inode, dir, - NULL, + error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; @@ -3341,7 +3332,8 @@ static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, /* * Link a file.. 
*/ -static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +static int shmem_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret = 0; @@ -3372,7 +3364,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ - dget(dentry); /* Extra pinning count for the created dentry */ + dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: return ret; @@ -3392,7 +3384,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) inode_set_ctime_current(inode)); inode_inc_iversion(dir); drop_nlink(inode); - dput(dentry); /* Undo the count from "create" - this does all the work */ + dput(dentry); /* Undo the count from "create" - does all the work */ return 0; } @@ -3501,7 +3493,6 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); - if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3555,8 +3546,7 @@ static void shmem_put_link(void *arg) folio_put(arg); } -static const char *shmem_get_link(struct dentry *dentry, - struct inode *inode, +static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; @@ -3630,8 +3620,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap, * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, - const struct xattr *xattr_array, - void *fs_info) + const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -3815,7 +3804,6 @@ static struct dentry *shmem_find_alias(struct inode *inode) return alias ?: d_find_any_alias(inode); } - static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -4399,8 +4387,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) } #endif /* CONFIG_TMPFS_QUOTA */ - inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, - VM_NORESERVE); + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, + S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; @@ -4703,11 +4691,9 @@ static ssize_t shmem_enabled_show(struct kobject *kobj, for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, - shmem_huge == values[i] ? "%s[%s]" : "%s%s", - i ? " " : "", - shmem_format_huge(values[i])); + shmem_huge == values[i] ? "%s[%s]" : "%s%s", + i ? " " : "", shmem_format_huge(values[i])); } - len += sysfs_emit_at(buf, len, "\n"); return len; @@ -4804,8 +4790,9 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? 
inode : ERR_PTR(-ENOSPC); @@ -4815,8 +4802,8 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct supe /* common code */ -static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, - unsigned long flags, unsigned int i_flags) +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long flags, unsigned int i_flags) { struct inode *inode; struct file *res; @@ -4835,7 +4822,6 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); - if (IS_ERR(inode)) { shmem_unacct_size(flags, size); return ERR_CAST(inode); -- Gitee From bf9a75f85af0c570a05a279aec74255b11a2675f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:30:03 -0700 Subject: [PATCH 028/484] shmem: shmem_acct_blocks() and shmem_inode_acct_blocks() commit 4199f51a7eb2054d68964efbd8d39c68053a8714 upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP By historical accident, shmem_acct_block() and shmem_inode_acct_block() were never pluralized when the pages argument was added, despite their complements being shmem_unacct_blocks() and shmem_inode_unacct_blocks() all along. It has been an irritation: fix their naming at last. Link: https://lkml.kernel.org/r/9124094-e4ab-8be7-ef80-9a87bdc2e4fc@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8121392af871..daa70f5f08e8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -192,10 +192,10 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ -static inline int shmem_acct_block(unsigned long flags, long pages) +static inline int shmem_acct_blocks(unsigned long flags, long pages) { if (!(flags & VM_NORESERVE)) return 0; @@ -210,13 +210,13 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static int shmem_inode_acct_block(struct inode *inode, long pages) +static int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; - if (shmem_acct_block(info->flags, pages)) + if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ @@ -450,7 +450,7 @@ bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; - if (shmem_inode_acct_block(inode, pages)) + if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ @@ -1703,7 +1703,7 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, huge = false; nr = huge ? 
HPAGE_PMD_NR : 1; - err = shmem_inode_acct_block(inode, nr); + err = shmem_inode_acct_blocks(inode, nr); if (err) goto failed; @@ -2610,7 +2610,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, int ret; pgoff_t max_off; - if (shmem_inode_acct_block(inode, 1)) { + if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to -- Gitee From a84b8b03e34e8260c4f4f78137dcb0550ed7ab32 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:31:27 -0700 Subject: [PATCH 029/484] shmem: move memcg charge out of shmem_add_to_page_cache() commit 054a9f7ccd0a60607fb9bbe1e06ca671494971bf upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP Extract shmem's memcg charging out of shmem_add_to_page_cache(): it's misleading done there, because many calls are dealing with a swapcache page, whose memcg is nowadays always remembered while swapped out, then the charge re-levied when it's brought back into swapcache. Temporarily move it back up to the shmem_get_folio_gfp() level, where the memcg was charged before v5.8; but the next commit goes on to move it back down to a new home. In making this change, it becomes clear that shmem_swapin_folio() does not need to know the vma, just the fault mm (if any): call it fault_mm rather than charge_mm - let mem_cgroup_charge() decide whom to charge. Link: https://lkml.kernel.org/r/4b2143c5-bf32-64f0-841-81a81158dac@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 68 +++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index daa70f5f08e8..c2b2a3254dbe 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -149,9 +149,8 @@ static unsigned long shmem_default_max_inodes(void) #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - vm_fault_t *fault_type); + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct mm_struct *fault_mm, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -768,12 +767,10 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp, - struct mm_struct *charge_mm) + pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); long nr = folio_nr_pages(folio); - int error; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -784,16 +781,7 @@ static int shmem_add_to_page_cache(struct folio *folio, folio->mapping = mapping; folio->index = index; - if (!folio_test_swapcache(folio)) { - error = mem_cgroup_charge(folio, charge_mm, gfp); - if (error) { - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto error; - } - } + gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); do { @@ -821,15 +809,12 @@ static int shmem_add_to_page_cache(struct folio *folio, } while (xas_nomem(&xas, gfp)); if 
(xas_error(&xas)) { - error = xas_error(&xas); - goto error; + folio->mapping = NULL; + folio_ref_sub(folio, nr); + return xas_error(&xas); } return 0; -error: - folio->mapping = NULL; - folio_ref_sub(folio, nr); - return error; } /* @@ -1349,10 +1334,8 @@ static int shmem_unuse_swap_entries(struct inode *inode, if (!xa_is_value(folio)) continue; - error = shmem_swapin_folio(inode, indices[i], - &folio, SGP_CACHE, - mapping_gfp_mask(mapping), - NULL, NULL); + error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, + mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); @@ -1842,12 +1825,11 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, + gfp_t gfp, struct mm_struct *fault_mm, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; @@ -1878,7 +1860,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); + count_memcg_event_mm(fault_mm, PGMAJFAULT); } /* Here we actually start the io */ #ifdef CONFIG_CGROUP_SLI @@ -1921,8 +1903,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } error = shmem_add_to_page_cache(folio, mapping, index, - swp_to_radix_entry(swap), gfp, - charge_mm); + swp_to_radix_entry(swap), gfp); if (error) goto failed; @@ -1970,7 +1951,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; - struct mm_struct *charge_mm; + struct mm_struct *fault_mm; struct folio *folio; pgoff_t hindex; gfp_t huge_gfp; @@ -1987,7 +1968,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, } sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : NULL; + fault_mm = vma ? 
vma->vm_mm : NULL; folio = filemap_get_entry(mapping, index); if (folio && vma && userfaultfd_minor(vma)) { @@ -1999,7 +1980,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, vma, fault_type); + sgp, gfp, fault_mm, fault_type); if (error == -EEXIST) goto repeat; @@ -2085,9 +2066,16 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) __folio_set_referenced(folio); - error = shmem_add_to_page_cache(folio, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK, - charge_mm); + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (folio_test_pmd_mappable(folio)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto unacct; + } + + error = shmem_add_to_page_cache(folio, mapping, hindex, NULL, gfp); if (error) goto unacct; @@ -2682,8 +2670,10 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (unlikely(pgoff >= max_off)) goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm); + ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); + if (ret) + goto out_release; + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release; -- Gitee From bbd95fb23c91399ab349130fb201493c8626e780 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:32:40 -0700 Subject: [PATCH 030/484] shmem: _add_to_page_cache() before shmem_inode_acct_blocks() commit 3022fd7af9604d44ec43da8a4398872989599b18 upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP There has been a recurring problem, that when a tmpfs volume is being filled by racing threads, some fail with ENOSPC (or consequent SIGBUS or EFAULT) even though all allocations were within the permitted size. This was a problem since early days, but magnified and complicated by the addition of huge pages. We have often worked around it by adding some slop to the tmpfs size, but it's hard to say how much is needed, and some users prefer not to do that e.g. keeping sparse files in a tightly tailored tmpfs helps to prevent accidental writing to holes. This comes from the allocation sequence: 1. check page cache for existing folio 2. check and reserve from vm_enough_memory 3. check and account from size of tmpfs 4. if huge, check page cache for overlapping folio 5. allocate physical folio, huge or small 6. check and charge from mem cgroup limit 7. add to page cache (but maybe another folio already got in). Concurrent tasks allocating at the same position could deplete the size allowance and fail. Doing vm_enough_memory and size checks before the folio allocation was intentional (to limit the load on the page allocator from this source) and still has some virtue; but memory cgroup never did that, so I think it's better reordered to favour predictable behaviour. 1. check page cache for existing folio 2. if huge, check page cache for overlapping folio 3. allocate physical folio, huge or small 4. check and charge from mem cgroup limit 5. add to page cache (but maybe another folio already got in) 6. check and reserve from vm_enough_memory 7. check and account from size of tmpfs. 
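Condensed, the new ordering as implemented by shmem_alloc_and_add_folio() in the diff below looks like this (a sketch only: steps 1 and 2 happen in the caller and in the huge-page conflict check, and the shrink-and-retry on the final accounting failure is omitted):

	folio = shmem_alloc_folio(gfp, info, index);		/* 3 */
	__folio_set_locked(folio);
	__folio_set_swapbacked(folio);

	error = mem_cgroup_charge(folio, fault_mm, gfp);	/* 4 */
	if (error)
		goto unlock;
	error = shmem_add_to_page_cache(folio, mapping,
					index, NULL, gfp);	/* 5 */
	if (error)
		goto unlock;
	error = shmem_inode_acct_blocks(inode, pages);		/* 6 and 7 */
	if (error) {
		/* still locked and !uptodate: safe to delete again */
		filemap_remove_folio(folio);
		goto unlock;
	}
	shmem_recalc_inode(inode, pages, 0);
	folio_add_lru(folio);
	return folio;
unlock:
	folio_unlock(folio);
	folio_put(folio);
	return ERR_PTR(error);
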
The folio lock held from allocation onwards ensures that the !uptodate folio cannot be used by others, and can safely be deleted from the cache if checks 6 or 7 subsequently fail (and those waiting on folio lock already check that the folio was not truncated once they get the lock); and the early addition to page cache ensures that racers find it before they try to duplicate the accounting. Seize the opportunity to tidy up shmem_get_folio_gfp()'s ENOSPC retrying, which can be combined inside the new shmem_alloc_and_add_folio(): doing 2 splits twice (once huge, once nonhuge) is not exactly equivalent to trying 5 splits (and giving up early on huge), but let's keep it simple unless more complication proves necessary. Userfaultfd is a foreign country: they do things differently there, and for good reason - to avoid mmap_lock deadlock. Leave ordering in shmem_mfill_atomic_pte() untouched for now, but I would rather like to mesh it better with shmem_get_folio_gfp() in the future. Link: https://lkml.kernel.org/r/22ddd06-d919-33b-1219-56335c1bf28e@google.com Signed-off-by: Hugh Dickins Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Jan Kara Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 229 +++++++++++++++++++++++++++-------------------------- 1 file changed, 118 insertions(+), 111 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c2b2a3254dbe..84d7fe650fc6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -797,13 +797,11 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_ALLOC); + if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - } - mapping->nrpages += nr; __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -1644,25 +1642,17 @@ static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct address_space *mapping = info->vfs_inode.i_mapping; - pgoff_t hindex; struct folio *folio; - hindex = round_down(index, HPAGE_PMD_NR); - if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, - XA_PRESENT)) - return NULL; - - shmem_pseudo_vma_init(&pvma, info, hindex); + shmem_pseudo_vma_init(&pvma, info, index); folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); - if (!folio) - count_vm_event(THP_FILE_FALLBACK); + return folio; } static struct folio *shmem_alloc_folio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; struct folio *folio; @@ -1674,36 +1664,101 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, return folio; } -static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, - pgoff_t index, bool huge) +static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, + struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, bool huge) { + struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct folio *folio; - int nr; - int err; + long pages; + int error; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; - nr = huge ? 
HPAGE_PMD_NR : 1; - err = shmem_inode_acct_blocks(inode, nr); - if (err) - goto failed; + if (huge) { + pages = HPAGE_PMD_NR; + index = round_down(index, HPAGE_PMD_NR); + + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ + if (xa_find(&mapping->i_pages, &index, + index + HPAGE_PMD_NR - 1, XA_PRESENT)) + return ERR_PTR(-E2BIG); - if (huge) folio = shmem_alloc_hugefolio(gfp, info, index); - else + if (!folio) + count_vm_event(THP_FILE_FALLBACK); + } else { + pages = 1; folio = shmem_alloc_folio(gfp, info, index); - if (folio) { - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - return folio; + } + if (!folio) + return ERR_PTR(-ENOMEM); + + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + gfp &= GFP_RECLAIM_MASK; + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (xa_find(&mapping->i_pages, &index, + index + pages - 1, XA_PRESENT)) { + error = -EEXIST; + } else if (huge) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto unlock; } - err = -ENOMEM; - shmem_inode_unacct_blocks(inode, nr); -failed: - return ERR_PTR(err); + error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); + if (error) + goto unlock; + + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + long freed; + /* + * Try to reclaim some space by splitting a few + * large folios beyond i_size on the filesystem. + */ + shmem_unused_huge_shrink(sbinfo, NULL, 2); + /* + * And do a shmem_recalc_inode() to account for freed pages: + * except our folio is there in cache, so not quite balanced. + */ + spin_lock(&info->lock); + freed = pages + info->alloced - info->swapped - + READ_ONCE(mapping->nrpages); + if (freed > 0) + info->alloced -= freed; + spin_unlock(&info->lock); + if (freed > 0) + shmem_inode_unacct_blocks(inode, freed); + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + filemap_remove_folio(folio); + goto unlock; + } + } + + shmem_recalc_inode(inode, pages, 0); + folio_add_lru(folio); + return folio; + +unlock: + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(error); } /* @@ -1948,29 +2003,22 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; - struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo; struct mm_struct *fault_mm; struct folio *folio; - pgoff_t hindex; - gfp_t huge_gfp; int error; - int once = 0; - int alloced = 0; + bool alloced; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && - ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; - } - sbinfo = SHMEM_SB(inode->i_sb); + alloced = false; fault_mm = vma ? 
vma->vm_mm : NULL; - folio = filemap_get_entry(mapping, index); + folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); @@ -1992,7 +2040,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, folio_lock(folio); /* Has the folio been truncated or swapped out? */ - if (unlikely(folio->mapping != mapping)) { + if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; @@ -2027,65 +2075,38 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return 0; } - if (!shmem_is_huge(inode, index, false, - vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) - goto alloc_nohuge; + if (shmem_is_huge(inode, index, false, fault_mm, + vma ? vma->vm_flags : 0)) { + gfp_t huge_gfp; - huge_gfp = vma_thp_gfp_mask(vma); - huge_gfp = limit_gfp_mask(huge_gfp, gfp); - folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); - if (IS_ERR(folio)) { -alloc_nohuge: - folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + folio = shmem_alloc_and_add_folio(huge_gfp, + inode, index, fault_mm, true); + if (!IS_ERR(folio)) { + count_vm_event(THP_FILE_ALLOC); + goto alloced; + } + if (PTR_ERR(folio) == -EEXIST) + goto repeat; } - if (IS_ERR(folio)) { - int retry = 5; + folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false); + if (IS_ERR(folio)) { error = PTR_ERR(folio); + if (error == -EEXIST) + goto repeat; folio = NULL; - if (error != -ENOSPC) - goto unlock; - /* - * Try to reclaim some space by splitting a large folio - * beyond i_size on the filesystem. - */ - while (retry--) { - int ret; - - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } goto unlock; } - hindex = round_down(index, folio_nr_pages(folio)); - - if (sgp == SGP_WRITE) - __folio_set_referenced(folio); - - error = mem_cgroup_charge(folio, fault_mm, gfp); - if (error) { - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto unacct; - } - - error = shmem_add_to_page_cache(folio, mapping, hindex, NULL, gfp); - if (error) - goto unacct; - - folio_add_lru(folio); - shmem_recalc_inode(inode, folio_nr_pages(folio), 0); +alloced: alloced = true; - if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio) - 1) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. @@ -2103,6 +2124,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, spin_unlock(&sbinfo->shrinklist_lock); } + if (sgp == SGP_WRITE) + folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ @@ -2126,11 +2149,6 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - if (alloced) { - folio_clear_dirty(folio); - filemap_remove_folio(folio); - shmem_recalc_inode(inode, 0, 0); - } error = -EINVAL; goto unlock; } @@ -2141,25 +2159,14 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, /* * Error recovery. 
*/ -unacct: - shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); - - if (folio_test_large(folio)) { - folio_unlock(folio); - folio_put(folio); - goto alloc_nohuge; - } unlock: + if (alloced) + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } - if (error == -ENOSPC && !once++) { - shmem_recalc_inode(inode, 0, 0); - goto repeat; - } - if (error == -EEXIST) - goto repeat; return error; } -- Gitee From 66bc63fdda230b54a9c0db959bea756af6a44684 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:42:45 -0700 Subject: [PATCH 031/484] shmem,percpu_counter: add _limited_add(fbc, limit, amount) commit beb9868628445306958fd7b2da1cd369a4a381cc upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP Percpu counter's compare and add are separate functions: without locking around them (which would defeat their purpose), it has been possible to overflow the intended limit. Imagine all the other CPUs fallocating tmpfs huge pages to the limit, in between this CPU's compare and its add. I have not seen reports of that happening; but tmpfs's recent addition of dquot_alloc_block_nodirty() in between the compare and the add makes it even more likely, and I'd be uncomfortable to leave it unfixed. Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it. I believe this implementation is correct, and slightly more efficient than the combination of compare and add (taking the lock once rather than twice when nearing full - the last 128MiB of a tmpfs volume on a machine with 128 CPUs and 4KiB pages); but it does beg for a better design - when nearing full, there is no new batching, but the costly percpu counter sum across CPUs still has to be done, while locked. Follow __percpu_counter_sum()'s example, including cpu_dying_mask as well as cpu_online_mask: but shouldn't __percpu_counter_compare() and __percpu_counter_limited_add() then be adding a num_dying_cpus() to num_online_cpus(), when they calculate the maximum which could be held across CPUs? But the times when it matters would be vanishingly rare. Link: https://lkml.kernel.org/r/bb817848-2d19-bcc8-39ca-ea179af0f0b4@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Tim Chen Cc: Dave Chinner Cc: Darrick J. 
Wong Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/percpu_counter.h | 23 +++++++++++++++ lib/percpu_counter.c | 53 ++++++++++++++++++++++++++++++++++ mm/shmem.c | 10 +++---- 3 files changed, 81 insertions(+), 5 deletions(-) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index d01351b1526f..8cb7c071bd5c 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); +bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, + s64 amount, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) @@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } +static inline bool +percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) +{ + return __percpu_counter_limited_add(fbc, limit, amount, + percpu_counter_batch); +} + /* * With percpu_counter_add_local() and percpu_counter_sub_local(), counts * are accumulated in local per cpu counter and not in fbc->count until @@ -185,6 +194,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) local_irq_restore(flags); } +static inline bool +percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) +{ + unsigned long flags; + s64 count; + + local_irq_save(flags); + count = fbc->count + amount; + if (count <= limit) + fbc->count = count; + local_irq_restore(flags); + return count <= limit; +} + /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ static inline void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 9073430dc865..58a3392f471b 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -278,6 +278,59 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) } EXPORT_SYMBOL(__percpu_counter_compare); +/* + * Compare counter, and add amount if the total is within limit. + * Return true if amount was added, false if it would exceed limit. 
+ */ +bool __percpu_counter_limited_add(struct percpu_counter *fbc, + s64 limit, s64 amount, s32 batch) +{ + s64 count; + s64 unknown; + unsigned long flags; + bool good; + + if (amount > limit) + return false; + + local_irq_save(flags); + unknown = batch * num_online_cpus(); + count = __this_cpu_read(*fbc->counters); + + /* Skip taking the lock when safe */ + if (abs(count + amount) <= batch && + fbc->count + unknown <= limit) { + this_cpu_add(*fbc->counters, amount); + local_irq_restore(flags); + return true; + } + + raw_spin_lock(&fbc->lock); + count = fbc->count + amount; + + /* Skip percpu_counter_sum() when safe */ + if (count + unknown > limit) { + s32 *pcount; + int cpu; + + for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { + pcount = per_cpu_ptr(fbc->counters, cpu); + count += *pcount; + } + } + + good = count <= limit; + if (good) { + count = __this_cpu_read(*fbc->counters); + fbc->count += count + amount; + __this_cpu_sub(*fbc->counters, count); + } + + raw_spin_unlock(&fbc->lock); + local_irq_restore(flags); + return good; +} + static int __init percpu_counter_startup(void) { int ret; diff --git a/mm/shmem.c b/mm/shmem.c index 84d7fe650fc6..d38f24f251ae 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -220,15 +220,15 @@ static int shmem_inode_acct_blocks(struct inode *inode, long pages) might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) + if (!percpu_counter_limited_add(&sbinfo->used_blocks, + sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); - if (err) + if (err) { + percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; - - percpu_counter_add(&sbinfo->used_blocks, pages); + } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) -- Gitee From f20ae298800a5d26d7f211d62b055544f463e7a4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 Oct 2023 21:40:09 -0700 Subject: [PATCH 032/484] percpu_counter: extend _limited_add() to negative amounts commit 1431996bf9088ee59f8017637ab9a7f89909ae63 upstream Conflicts: none Backport-reason: Shmem updates to prepare for SWAP Though tmpfs does not need it, percpu_counter_limited_add() can be twice as useful if it works sensibly with negative amounts (subs) - typically decrements towards a limit of 0 or nearby: as suggested by Dave Chinner. And in the course of that reworking, skip the percpu counter sum if it is already obvious that the limit would be passed: as suggested by Tim Chen. Extend the comment above __percpu_counter_limited_add(), defining the behaviour with positive and negative amounts, allowing negative limits, but not bothering about overflow beyond S64_MAX. Link: https://lkml.kernel.org/r/8f86083b-c452-95d4-365b-f16a2e4ebcd4@google.com Signed-off-by: Hugh Dickins Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Jan Kara Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/percpu_counter.h | 11 +++++-- lib/percpu_counter.c | 54 +++++++++++++++++++++++++--------- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 8cb7c071bd5c..3a44dd1e33d2 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -198,14 +198,21 @@ static inline bool percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) { unsigned long flags; + bool good = false; s64 count; + if (amount == 0) + return true; + local_irq_save(flags); count = fbc->count + amount; - if (count <= limit) + if ((amount > 0 && count <= limit) || + (amount < 0 && count >= limit)) { fbc->count = count; + good = true; + } local_irq_restore(flags); - return count <= limit; + return good; } /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 58a3392f471b..44dd133594d4 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -279,8 +279,16 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) EXPORT_SYMBOL(__percpu_counter_compare); /* - * Compare counter, and add amount if the total is within limit. - * Return true if amount was added, false if it would exceed limit. + * Compare counter, and add amount if total is: less than or equal to limit if + * amount is positive, or greater than or equal to limit if amount is negative. + * Return true if amount is added, or false if total would be beyond the limit. + * + * Negative limit is allowed, but unusual. + * When negative amounts (subs) are given to percpu_counter_limited_add(), + * the limit would most naturally be 0 - but other limits are also allowed. + * + * Overflow beyond S64_MAX is not allowed for: counter, limit and amount + * are all assumed to be sane (far from S64_MIN and S64_MAX). 
*/ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch) @@ -288,10 +296,10 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 count; s64 unknown; unsigned long flags; - bool good; + bool good = false; - if (amount > limit) - return false; + if (amount == 0) + return true; local_irq_save(flags); unknown = batch * num_online_cpus(); @@ -299,7 +307,8 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc, /* Skip taking the lock when safe */ if (abs(count + amount) <= batch && - fbc->count + unknown <= limit) { + ((amount > 0 && fbc->count + unknown <= limit) || + (amount < 0 && fbc->count - unknown >= limit))) { this_cpu_add(*fbc->counters, amount); local_irq_restore(flags); return true; @@ -309,7 +318,19 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc, count = fbc->count + amount; /* Skip percpu_counter_sum() when safe */ - if (count + unknown > limit) { + if (amount > 0) { + if (count - unknown > limit) + goto out; + if (count + unknown <= limit) + good = true; + } else { + if (count + unknown < limit) + goto out; + if (count - unknown >= limit) + good = true; + } + + if (!good) { s32 *pcount; int cpu; @@ -317,15 +338,20 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc, pcount = per_cpu_ptr(fbc->counters, cpu); count += *pcount; } + if (amount > 0) { + if (count > limit) + goto out; + } else { + if (count < limit) + goto out; + } + good = true; } - good = count <= limit; - if (good) { - count = __this_cpu_read(*fbc->counters); - fbc->count += count + amount; - __this_cpu_sub(*fbc->counters, count); - } - + count = __this_cpu_read(*fbc->counters); + fbc->count += count + amount; + __this_cpu_sub(*fbc->counters, count); +out: raw_spin_unlock(&fbc->lock); local_irq_restore(flags); return good; -- Gitee From 1f96d62502d89ff1c48f98c48aa30ec02931c394 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 Oct 2023 13:39:08 -0700 Subject: [PATCH 033/484] mempolicy: alloc_pages_mpol() for NUMA policy without vma commit ddc1a5cbc05dc62743a2f409b96faa5cf95ba064 upstream Conflicts: minor, due to SLI and backported 78524b05f1a3 Backport-reason: Mempolicy updates for cleaning up shmem SWAP Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. 
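To make that concrete, a per-vma caller (vma_alloc_folio() being the obvious one) now computes ilx once in get_vma_policy() and hands it straight down to the allocator; roughly, condensed from the diff below and not a literal quote of the final wrapper:

	struct mempolicy *pol;
	pgoff_t ilx;

	pol = get_vma_policy(vma, addr, order, &ilx);
	/*
	 * For MPOL_INTERLEAVE, get_vma_policy() has already folded the
	 * vma offset into the index:
	 *   ilx += vma->vm_pgoff >> order;
	 *   ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
	 */
	page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx,
				numa_node_id());
	mpol_cond_put(pol);

whereas the vma-less alloc_pages() path passes ilx = NO_INTERLEAVE_INDEX, letting interleave_nodes() pick the next node dynamically from the task's il_prev.
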
The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huang Ying Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Cc: Domenico Cerasuolo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/proc/task_mmu.c | 5 +- include/linux/gfp.h | 10 +- include/linux/mempolicy.h | 20 +- include/linux/mm.h | 2 +- ipc/shm.c | 21 +-- mm/mempolicy.c | 379 ++++++++++++++++---------------------- mm/shmem.c | 92 ++++----- mm/swap.h | 9 +- mm/swap_state.c | 86 +++++---- mm/zswap.c | 7 +- 10 files changed, 308 insertions(+), 323 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b3d2ee1bc4e5..bf032a97e2de 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1966,8 +1966,9 @@ static int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mempolicy *pol; char buffer[64]; + struct mempolicy *pol; + pgoff_t ilx; int nid; if (!mm) @@ -1976,7 +1977,7 @@ static int show_numa_map(struct seq_file *m, void *v) /* Ensure we start with an empty set of numa_maps statistics. 
*/ memset(md, 0, sizeof(*md)); - pol = __get_vma_policy(vma, vma->vm_start); + pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6ceb4bd9ae07..3e4c0c536a3d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -8,6 +8,7 @@ #include struct vm_area_struct; +struct mempolicy; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); -struct folio *folio_alloc(gfp_t gfp, unsigned order); +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); +struct folio *folio_alloc(gfp_t gfp, unsigned int order); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } +static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return alloc_pages(gfp, order); +} static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index eca72a4bbacf..59b58cb0d651 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -17,6 +17,8 @@ struct mm_struct; +#define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ + #ifdef CONFIG_NUMA /* @@ -130,7 +132,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr); + unsigned long addr, pgoff_t *ilx); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); @@ -144,8 +148,6 @@ extern int huge_node(struct vm_area_struct *vma, extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); -extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -183,6 +185,11 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); struct mempolicy {}; +static inline struct mempolicy *get_task_policy(struct task_struct *p) +{ + return NULL; +} + static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; @@ -217,6 +224,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) return NULL; } +static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} + static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 89e3e9ec68cc..4d20c51baa3f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -629,7 +629,7 @@ struct vm_operations_struct { * policy. 
*/ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr); + unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the diff --git a/ipc/shm.c b/ipc/shm.c index c4845689b7f6..bb017dd760f4 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -565,30 +565,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma) } #ifdef CONFIG_NUMA -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); + struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) - err = sfd->vm_ops->set_policy(vma, new); + err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); - struct mempolicy *pol = NULL; + struct shm_file_data *sfd = shm_file_data(vma->vm_file); + struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) - pol = sfd->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy) - pol = vma->vm_policy; - - return pol; + mpol = sfd->vm_ops->get_policy(vma, addr, ilx); + return mpol; } #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 21ad2bdd8ae1..4e8f2fddd16b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -901,6 +901,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } if (flags & MPOL_F_ADDR) { + pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We @@ -912,10 +913,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, mmap_read_unlock(mm); return -EFAULT; } - if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); - else - pol = vma->vm_policy; + pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; @@ -1176,6 +1174,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start) break; } + /* + * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL + * when the page can no longer be located in a vma: that is not ideal + * (migrate_pages() will give up early, presuming ENOMEM), but good + * enough to avoid a crash by syzkaller or concurrent holepunch. + */ + if (!vma) + return NULL; + if (folio_test_hugetlb(src)) { return alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); @@ -1184,9 +1191,6 @@ static struct folio *new_folio(struct folio *src, unsigned long start) if (folio_test_large(src)) gfp = GFP_TRANSHUGE; - /* - * if !vma, vma_alloc_folio() will use task or system default policy - */ return vma_alloc_folio(gfp, folio_order(src), vma, address, folio_test_large(src)); } @@ -1696,34 +1700,19 @@ bool vma_migratable(struct vm_area_struct *vma) } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct mempolicy *pol = NULL; - - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr); - } else if (vma->vm_policy) { - pol = vma->vm_policy; - - /* - * shmem_alloc_page() passes MPOL_F_SHARED policy with - * a pseudo vma whose vma->vm_ops=NULL. 
Take a reference - * count on these policies which will be dropped by - * mpol_cond_put() later - */ - if (mpol_needs_cond_ref(pol)) - mpol_get(pol); - } - } - - return pol; + *ilx = 0; + return (vma->vm_ops && vma->vm_ops->get_policy) ? + vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* - * get_vma_policy(@vma, @addr) + * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup + * @order: 0, or appropriate huge_page_order for interleaving + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. @@ -1732,14 +1721,18 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) { - struct mempolicy *pol = __get_vma_policy(vma, addr); + struct mempolicy *pol; + pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - + if (pol->mode == MPOL_INTERLEAVE) { + *ilx += vma->vm_pgoff >> order; + *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + } return pol; } @@ -1749,8 +1742,9 @@ bool vma_policy_mof(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; + pgoff_t ilx; /* ignored here */ - pol = vma->vm_ops->get_policy(vma, vma->vm_start); + pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); @@ -1785,54 +1779,6 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } -/* - * Return a nodemask representing a mempolicy for filtering nodes for - * page allocation - */ -nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) -{ - int mode = policy->mode; - - /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) - return &policy->nodes; - - if (mode == MPOL_PREFERRED_MANY) - return &policy->nodes; - - return NULL; -} - -/* - * Return the preferred node id for 'prefer' mempolicy, and return - * the given id for all other policies. - * - * policy_node() is always coupled with policy_nodemask(), which - * secures the nodemask limit for 'bind' and 'prefer-many' policy. - */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) -{ - if (policy->mode == MPOL_PREFERRED) { - nid = first_node(policy->nodes); - } else { - /* - * __GFP_THISNODE shouldn't even be used with the bind policy - * because we might easily break the expectation to stay on the - * requested node and not break the policy. - */ - WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); - } - - if ((policy->mode == MPOL_BIND || - policy->mode == MPOL_PREFERRED_MANY) && - policy->home_node != NUMA_NO_NODE) - return policy->home_node; - - return nid; -} - /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { @@ -1892,11 +1838,11 @@ unsigned int mempolicy_slab_node(void) } /* - * Do static interleaving for a VMA with known offset @n. 
Returns the n'th - * node in pol->nodes (starting from n=0), wrapping around if n exceeds the - * number of present nodes. + * Do static interleaving for interleave index @ilx. Returns the ilx'th + * node in pol->nodes (starting from ilx=0), wrapping around if ilx + * exceeds the number of present nodes. */ -static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) +static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask = pol->nodes; unsigned int target, nnodes; @@ -1914,33 +1860,54 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); - target = (unsigned int)n % nnodes; + target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } -/* Determine a node number for interleave */ -static inline unsigned interleave_nid(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long addr, int shift) +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation, together with preferred node id (or the input node id). + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid) { - if (vma) { - unsigned long off; + nodemask_t *nodemask = NULL; + switch (pol->mode) { + case MPOL_PREFERRED: + /* Override input node id */ + *nid = first_node(pol->nodes); + break; + case MPOL_PREFERRED_MANY: + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; + break; + case MPOL_BIND: + /* Restrict to nodemask (but not on lower zones) */ + if (apply_policy_zone(pol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&pol->nodes)) + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; /* - * for small pages, there is no difference between - * shift and PAGE_SHIFT, so the bit-shift is safe. - * for huge pages, since vm_pgoff is in units of small - * pages, we need to shift off the always 0 bits to get - * a useful offset. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - BUG_ON(shift < PAGE_SHIFT); - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); - off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, off); - } else - return interleave_nodes(pol); + WARN_ON_ONCE(gfp & __GFP_THISNODE); + break; + case MPOL_INTERLEAVE: + /* Override input node id */ + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + interleave_nodes(pol) : interleave_nid(pol, ilx); + break; + } + + return nodemask; } #ifdef CONFIG_HUGETLBFS @@ -1956,27 +1923,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. 
- * - * Must be protected by read_mems_allowed_begin() */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, - struct mempolicy **mpol, nodemask_t **nodemask) + struct mempolicy **mpol, nodemask_t **nodemask) { + pgoff_t ilx; int nid; - int mode; - *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; - mode = (*mpol)->mode; - - if (unlikely(mode == MPOL_INTERLEAVE)) { - nid = interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))); - } else { - nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) - *nodemask = &(*mpol)->nodes; - } + nid = numa_node_id(); + *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); + *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } @@ -2054,27 +2010,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk, return ret; } -/* Allocate a page in interleaved policy. - Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - unsigned nid) -{ - struct page *page; - - page = __alloc_pages(gfp, order, nid, NULL); - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ - if (!static_branch_likely(&vm_numa_stat_key)) - return page; - if (page && page_to_nid(page) == nid) { - preempt_disable(); - __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); - preempt_enable(); - } - return page; -} - static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - int nid, struct mempolicy *pol) + int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; @@ -2087,7 +2024,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + page = __alloc_pages(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages(gfp, order, nid, NULL); @@ -2095,55 +2032,29 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, } /** - * vma_alloc_folio - Allocate a folio for a VMA. + * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. - * @order: Order of the folio. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: For hugepages try only the preferred node if possible. - * - * Allocate a folio for a specific address in @vma, using the appropriate - * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock - * of the mm_struct of the VMA to prevent it from going away. Should be - * used for all allocations for folios that will be mapped into user space. + * @order: Order of the page allocation. + * @pol: Pointer to the NUMA mempolicy. + * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). + * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * - * Return: The folio on success or NULL if allocation fails. + * Return: The page on success or NULL if allocation fails. 
*/ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) { - struct mempolicy *pol; - int node = numa_node_id(); - struct folio *folio; - int preferred_nid; - nodemask_t *nmask; - - pol = get_vma_policy(vma, addr); - - if (pol->mode == MPOL_INTERLEAVE) { - struct page *page; - unsigned nid; - - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - mpol_cond_put(pol); - gfp |= __GFP_COMP; - page = alloc_page_interleave(gfp, order, nid); - return page_rmappable_folio(page); - } - - if (pol->mode == MPOL_PREFERRED_MANY) { - struct page *page; + nodemask_t *nodemask; + struct page *page; - node = policy_node(gfp, pol, node); - gfp |= __GFP_COMP; - page = alloc_pages_preferred_many(gfp, order, node, pol); - mpol_cond_put(pol); - return page_rmappable_folio(page); - } + nodemask = policy_nodemask(gfp, pol, ilx, &nid); - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_preferred_many(gfp, order, nid, nodemask); + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + /* filter "hugepage" allocation, unless from alloc_pages() */ + order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred @@ -2154,39 +2065,68 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ - if (pol->mode == MPOL_PREFERRED) - hpage_node = first_node(pol->nodes); - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); + if (pol->mode != MPOL_INTERLEAVE && + (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - folio = __folio_alloc_node(gfp | __GFP_THISNODE | - __GFP_NORETRY, order, hpage_node); - + page = __alloc_pages_node(nid, + gfp | __GFP_THISNODE | __GFP_NORETRY, order); + if (page || !(gfp & __GFP_DIRECT_RECLAIM)) + return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ - if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) - folio = __folio_alloc(gfp, order, hpage_node, - nmask); + } + } - goto out; + page = __alloc_pages(gfp, order, nid, nodemask); + + if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { + /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ + if (static_branch_likely(&vm_numa_stat_key) && + page_to_nid(page) == nid) { + preempt_disable(); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); + preempt_enable(); } } - nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); - folio = __folio_alloc(gfp, order, preferred_nid, nmask); + return page; +} + +/** + * vma_alloc_folio - Allocate a folio for a VMA. + * @gfp: GFP flags. + * @order: Order of the folio. + * @vma: Pointer to VMA. + * @addr: Virtual address of the allocation. Must be inside @vma. + * @hugepage: Unused (was: For hugepages try only preferred node if possible). + * + * Allocate a folio for a specific address in @vma, using the appropriate + * NUMA policy. 
The caller must hold the mmap_lock of the mm_struct of the + * VMA to prevent it from going away. Should be used for all allocations + * for folios that will be mapped into user space, excepting hugetlbfs, and + * excepting where direct use of alloc_pages_mpol() is more appropriate. + * + * Return: The folio on success or NULL if allocation fails. + */ +struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, bool hugepage) +{ + struct mempolicy *pol; + pgoff_t ilx; + struct page *page; + + pol = get_vma_policy(vma, addr, order, &ilx); + page = alloc_pages_mpol(gfp | __GFP_COMP, order, + pol, ilx, numa_node_id()); mpol_cond_put(pol); -out: - return folio; + return page_rmappable_folio(page); } EXPORT_SYMBOL(vma_alloc_folio); @@ -2204,33 +2144,23 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages(gfp_t gfp, unsigned order) +struct page *alloc_pages(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; - struct page *page; - - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); /* * No reference counting needed for current->mempolicy * nor system default_policy */ - if (pol->mode == MPOL_INTERLEAVE) - page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); - else if (pol->mode == MPOL_PREFERRED_MANY) - page = alloc_pages_preferred_many(gfp, order, - policy_node(gfp, pol, numa_node_id()), pol); - else - page = __alloc_pages(gfp, order, - policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); - return page; + return alloc_pages_mpol(gfp, order, + pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages); -struct folio *folio_alloc(gfp_t gfp, unsigned order) +struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } @@ -2301,6 +2231,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; + nodemask_t *nodemask; + int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); @@ -2313,9 +2245,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); - return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol), nr_pages, NULL, - page_array); + nid = numa_node_id(); + nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); + return __alloc_pages_bulk(gfp, nid, nodemask, + nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -2502,23 +2435,21 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; + pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); - unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; - pol = get_vma_policy(vma, addr); + pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, pgoff); + polnid = interleave_nid(pol, ilx); break; case 
MPOL_PREFERRED: diff --git a/mm/shmem.c b/mm/shmem.c index d38f24f251ae..9d80ad42a5a6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1576,38 +1576,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ -#ifndef CONFIG_NUMA -#define vm_policy vm_private_data -#endif - -static void shmem_pseudo_vma_init(struct vm_area_struct *vma, - struct shmem_inode_info *info, pgoff_t index) -{ - /* Create a pseudo vma that just contains the policy */ - vma_init(vma, NULL); - /* Bias interleave by inode number to distribute better across nodes */ - vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); -} -static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) -{ - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(vma->vm_policy); -} +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx); -static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; + struct mempolicy *mpol; + pgoff_t ilx; struct page *page; - struct vm_fault vmf = { - .vma = &pvma, - }; - shmem_pseudo_vma_init(&pvma, info, index); - page = swap_cluster_readahead(swap, gfp, &vmf); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + page = swap_cluster_readahead(swap, gfp, mpol, ilx); + mpol_cond_put(mpol); if (!page) return NULL; @@ -1641,27 +1623,29 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct folio *folio; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); + page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - return folio; + return page_rmappable_folio(page); } static struct folio *shmem_alloc_folio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct folio *folio; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - return folio; + return (struct folio *)page; } static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, @@ -1921,7 +1905,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, #ifdef CONFIG_CGROUP_SLI sli_memlat_stat_start(&start); #endif - folio = shmem_swapin(swap, gfp, info, index); + folio = shmem_swapin_cluster(swap, gfp, info, index); #ifdef CONFIG_CGROUP_SLI sli_memlat_stat_end(MEM_LAT_DIRECT_SWAPIN, start); #endif @@ -2375,15 +2359,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; + /* + * 
Bias interleave by inode number to distribute better across nodes; + * but this interface is independent of which page order is used, so + * supplies only that bias, letting caller apply the offset (adjusted + * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). + */ + *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } -#endif + +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + struct mempolicy *mpol; + + /* Bias interleave by inode number to distribute better across nodes */ + *ilx = info->vfs_inode.i_ino + (index >> order); + + mpol = mpol_shared_policy_lookup(&info->policy, index); + return mpol ? mpol : get_task_policy(current); +} +#else +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} +#endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { diff --git a/mm/swap.h b/mm/swap.h index d83b12395691..2d2662815abb 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,8 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +struct mempolicy; + #ifdef CONFIG_SWAP #include /* for bio_end_io_t */ @@ -48,11 +50,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, unsigned long addr, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); + struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); @@ -80,7 +81,7 @@ static inline void show_swap_cache_info(void) } static inline struct page *swap_cluster_readahead(swp_entry_t entry, - gfp_t gfp_mask, struct vm_fault *vmf) + gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 9ee92eb359d4..0e74567d60aa 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -361,8 +362,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated) + struct mempolicy *mpol, pgoff_t ilx, + bool *new_page_allocated) { struct swap_info_struct *si = swp_swap_info(entry); struct folio *folio; @@ -396,7 +397,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. 
*/ - folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, + mpol, ilx, numa_node_id()); if (!folio) goto fail_put_swap; @@ -470,20 +472,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, unsigned long addr, struct swap_iocb **plug) { struct swap_info_struct *si; - bool page_was_allocated; + bool page_allocated; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; si = get_swap_device(entry); if (!si) return NULL; - struct page *retpage = __read_swap_cache_async(entry, gfp_mask, - vma, addr, &page_was_allocated); + mpol = get_vma_policy(vma, addr, 0, &ilx); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + mpol_cond_put(mpol); - if (page_was_allocated) - swap_readpage(retpage, false, plug); + if (page_allocated) + swap_readpage(page, false, plug); put_swap_device(si); - return retpage; + return page; } EXPORT_SYMBOL(read_swap_cache_async); @@ -552,7 +559,8 @@ static unsigned long swapin_nr_pages(unsigned long offset) * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct page for entry and addr, after queueing swapin. * @@ -561,13 +569,12 @@ static unsigned long swapin_nr_pages(unsigned long offset) * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold read mmap_lock if vmf->vma is not NULL. + * Note: it is intentional that the same NUMA policy and interleave index + * are used for every page of the readahead: neighbouring pages on swap + * are fairly likely to have been swapped out from the same node. 
*/ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_fault *vmf) + struct mempolicy *mpol, pgoff_t ilx) { struct page *page; unsigned long entry_offset = swp_offset(entry); @@ -578,8 +585,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; - struct vm_area_struct *vma = vmf->vma; - unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -597,8 +602,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, &page_allocated); + swp_entry(swp_type(entry), offset), + gfp_mask, mpol, ilx, &page_allocated); if (!page) continue; if (page_allocated) { @@ -612,11 +617,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, } blk_finish_plug(&plug); swap_read_unplug(splug); - lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages) @@ -719,8 +727,10 @@ static void swap_ra_info(struct vm_fault *vmf, /** * swap_vma_readahead - swap in pages in hope we need them soon - * @fentry: swap entry of this memory + * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. @@ -731,16 +741,17 @@ static void swap_ra_info(struct vm_fault *vmf, * Caller must hold read mmap_lock if vmf->vma is not NULL. 
* */ -static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, +static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct vm_area_struct *vma = vmf->vma; struct page *page; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; + pgoff_t ilx; unsigned int i; bool page_allocated; struct vma_swap_readahead ra_info = { @@ -752,9 +763,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, goto skip; addr = vmf->address - (ra_info.offset * PAGE_SIZE); + ilx = targ_ilx - ra_info.offset; blk_start_plug(&plug); - for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { + for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -768,8 +780,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - page = __read_swap_cache_async(entry, gfp_mask, vma, - addr, &page_allocated); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); if (!page) continue; if (page_allocated) { @@ -788,8 +800,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, lru_add_drain(); skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - NULL); + page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } /** @@ -807,9 +822,16 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { - return swap_use_vma_readahead() ? - swap_vma_readahead(entry, gfp_mask, vmf) : - swap_cluster_readahead(entry, gfp_mask, vmf); + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; + + mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); + page = swap_use_vma_readahead() ? 
+ swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : + swap_cluster_readahead(entry, gfp_mask, mpol, ilx); + mpol_cond_put(mpol); + return page; } #ifdef CONFIG_SYSFS diff --git a/mm/zswap.c b/mm/zswap.c index 78c1bf0e5fe4..5cecf3689c70 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1057,6 +1058,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, { swp_entry_t swpentry = entry->swpentry; struct page *page; + struct mempolicy *mpol; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct zpool *pool = zswap_find_zpool(entry); @@ -1080,8 +1082,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry, if (!si) return -EEXIST; - page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, - &page_was_allocated); + mpol = get_task_policy(current); + page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &page_was_allocated); put_swap_device(si); if (!page) { ret = -ENOMEM; -- Gitee From 108e6bddefa80b34e0cfbc5244c7c1803325bbd4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:27:47 -0700 Subject: [PATCH 034/484] mempolicy: mmap_lock is not needed while migrating folios commit 72e315f7a750281b4410ac30d8930f735459e72d upstream Conflicts: minor, seems some merge order issue Backport-reason: Mempolicy updates for cleaning up shmem SWAP mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. 
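For reference, the shape of the new allocator callback, condensed from the mm/mempolicy.c hunk below (a simplified sketch only: the hugetlb branch and the interleave-index refinement of the next patch are left out, and the hunk itself is authoritative):

	static struct folio *alloc_migration_target_by_mpol(struct folio *src,
							    unsigned long private)
	{
		struct mempolicy *pol = (struct mempolicy *)private;
		pgoff_t ilx = src->index >> folio_order(src);	/* improved later */
		gfp_t gfp = folio_test_large(src) ? GFP_TRANSHUGE :
			GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

		/* No vma, no address: only the refcounted policy and an index,
		 * so migrate_pages() no longer needs mmap_lock to be held. */
		return page_rmappable_folio(alloc_pages_mpol(gfp, folio_order(src),
							     pol, ilx, numa_node_id()));
	}
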
Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/hugetlb.h | 9 ----- mm/hugetlb.c | 38 ++++++++++--------- mm/mempolicy.c | 83 +++++++++++++++++++++-------------------- 3 files changed, 63 insertions(+), 67 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f63730c3e2a1..74d96dba8656 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -753,8 +753,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, @@ -1089,13 +1087,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, - struct vm_area_struct *vma, - unsigned long address) -{ - return NULL; -} - static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2c63b09ae0d3..87d0f5c29c0c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2484,24 +2484,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } -/* mempolicy aware migration callback */ -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address) -{ - struct mempolicy *mpol; - nodemask_t *nodemask; - struct folio *folio; - gfp_t gfp_mask; - int node; - - gfp_mask = htlb_alloc_mask(h); - node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); - mpol_cond_put(mpol); - - return folio; -} - static nodemask_t *policy_mbind_nodemask(gfp_t gfp) { #ifdef CONFIG_NUMA @@ -6325,6 +6307,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } #ifdef CONFIG_USERFAULTFD +/* + * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). + */ +static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct folio *folio; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + mpol_cond_put(mpol); + + return folio; +} + /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4e8f2fddd16b..c7a4e873b36f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -419,6 +419,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { @@ -1023,6 +1025,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + + mmap_read_lock(mm); vma = find_vma(mm, 0); if (unlikely(!vma)) { mmap_read_unlock(mm); @@ -1037,6 +1041,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, @@ -1065,8 +1070,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, lru_cache_disable(); - mmap_read_lock(mm); - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1146,7 +1149,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (err < 0) break; } - mmap_read_unlock(mm); lru_cache_enable(); if (err < 0) @@ -1155,44 +1157,38 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, } /* - * Allocate a new page for page migration based on vma policy. - * Start by assuming the page is mapped by the same vma as contains @start. - * Search forward from there, if not. N.B., this assumes that the - * list of pages handed to migrate_pages()--which is how we get here-- - * is in virtual address order. + * Allocate a new folio for page migration, according to NUMA mempolicy. */ -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { - struct vm_area_struct *vma; - unsigned long address; - VMA_ITERATOR(vmi, current->mm, start); - gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - - for_each_vma(vmi, vma) { - address = page_address_in_vma(&src->page, vma); - if (address != -EFAULT) - break; - } + struct mempolicy *pol = (struct mempolicy *)private; + pgoff_t ilx = 0; /* improve on this later */ + struct page *page; + unsigned int order; + int nid = numa_node_id(); + gfp_t gfp; - /* - * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL - * when the page can no longer be located in a vma: that is not ideal - * (migrate_pages() will give up early, presuming ENOMEM), but good - * enough to avoid a crash by syzkaller or concurrent holepunch. 
- */ - if (!vma) - return NULL; + order = folio_order(src); + ilx += src->index >> order; if (folio_test_hugetlb(src)) { - return alloc_hugetlb_folio_vma(folio_hstate(src), - vma, address); + nodemask_t *nodemask; + struct hstate *h; + + h = folio_hstate(src); + gfp = htlb_alloc_mask(h); + nodemask = policy_nodemask(gfp, pol, ilx, &nid); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; + else + gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; - return vma_alloc_folio(gfp, folio_order(src), vma, address, - folio_test_large(src)); + page = alloc_pages_mpol(gfp, order, pol, ilx, nid); + return page_rmappable_folio(page); } #else @@ -1208,7 +1204,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { return NULL; } @@ -1282,6 +1279,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (nr_failed < 0) { err = nr_failed; + nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); @@ -1292,19 +1290,24 @@ static long do_mbind(unsigned long start, unsigned long len, } } - if (!err) { - if (!list_empty(&pagelist)) { - nr_failed |= migrate_pages(&pagelist, new_folio, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); + mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { + /* Convert MPOL_DEFAULT's NULL to task or default policy */ + if (!new) { + new = get_task_policy(current); + mpol_get(new); } - if (nr_failed && (flags & MPOL_MF_STRICT)) - err = -EIO; + nr_failed |= migrate_pages(&pagelist, + alloc_migration_target_by_mpol, NULL, + (unsigned long)new, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND, NULL); } + if (nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - - mmap_write_unlock(mm); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) -- Gitee From 5a676fa704350746dfce3d6a4ddfb54cd43c09ea Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:29:00 -0700 Subject: [PATCH 035/484] mempolicy: migration attempt to match interleave nodes commit 88c91dc58582f1d0c6f5f501051b0262d06ec4ed upstream Conflicts: none Backport-reason: Mempolicy updates for cleaning up shmem SWAP Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. 
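The core of that effort is a small base-index calculation in do_mbind(), sketched here in simplified form (the search for the first non-KSM page on the list, and for the vma containing it, is omitted; see the hunk below for the real code):

	if (new->mode == MPOL_INTERLEAVE && addr != -EFAULT) {
		order = compound_order(page);
		/* We already know the pol, but not the ilx */
		mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx));
		/*
		 * Set the base from which to increment by index:
		 * alloc_migration_target_by_mpol() adds each page's own
		 * index back on, so the first page is targeted exactly.
		 */
		mmpol.ilx -= page->index >> order;
	}
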
[hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mempolicy.c | 55 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c7a4e873b36f..1fe8f837f9cd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -432,6 +432,11 @@ static bool strictly_unmovable(unsigned long flags) MPOL_MF_STRICT; } +struct migration_mpol { /* for alloc_migration_target_by_mpol() */ + struct mempolicy *pol; + pgoff_t ilx; +}; + struct queue_pages { struct list_head *pagelist; unsigned long flags; @@ -1162,8 +1167,9 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { - struct mempolicy *pol = (struct mempolicy *)private; - pgoff_t ilx = 0; /* improve on this later */ + struct migration_mpol *mmpol = (struct migration_mpol *)private; + struct mempolicy *pol = mmpol->pol; + pgoff_t ilx = mmpol->ilx; struct page *page; unsigned int order; int nid = numa_node_id(); @@ -1218,6 +1224,7 @@ static long do_mbind(unsigned long start, unsigned long len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; + struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; @@ -1290,17 +1297,55 @@ static long do_mbind(unsigned long start, unsigned long len, } } - mmap_write_unlock(mm); - if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } + mmpol.pol = new; + mmpol.ilx = 0; + + /* + * In the interleaved case, attempt to allocate on exactly the + * targeted nodes, for the first VMA to be migrated; for later + * VMAs, the nodes will still be interleaved from the targeted + * nodemask, but one by one may be selected differently. 
+ */ + if (new->mode == MPOL_INTERLEAVE) { + struct page *page; + unsigned int order; + unsigned long addr = -EFAULT; + + list_for_each_entry(page, &pagelist, lru) { + if (!PageKsm(page)) + break; + } + if (!list_entry_is_head(page, &pagelist, lru)) { + vma_iter_init(&vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + addr = page_address_in_vma(page, vma); + if (addr != -EFAULT) + break; + } + } + if (addr != -EFAULT) { + order = compound_order(page); + /* We already know the pol, but not the ilx */ + mpol_cond_put(get_vma_policy(vma, addr, order, + &mmpol.ilx)); + /* Set base from which to increment by index */ + mmpol.ilx -= page->index >> order; + } + } + } + + mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, - (unsigned long)new, MIGRATE_SYNC, + (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } -- Gitee From 83742e079d36a38e98b43cb4285fb7805c40e8db Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:14 +0800 Subject: [PATCH 036/484] mm: move some shrinker-related function declarations to mm/internal.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3ee0aa9f06756e959633ffda37856c6741d948ed upstream Conflicts: none Backport-reason: Shrinker cleanup Patch series "cleanups for lockless slab shrink", v4. This series is some cleanups for lockless slab shrink. This patch (of 4): The following functions are only used inside the mm subsystem, so it's better to move their declarations to the mm/internal.h file. 1. shrinker_debugfs_add() 2. shrinker_debugfs_detach() 3. shrinker_debugfs_remove() Link: https://lkml.kernel.org/r/20230911092517.64141-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911092517.64141-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Daniel Vetter Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. 
Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shrinker.h | 19 ------------------- mm/internal.h | 26 ++++++++++++++++++++++++++ mm/shrinker_debug.c | 2 ++ 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index a5402e1d6326..1b0389a4d5ac 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -110,28 +110,9 @@ extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG -extern int shrinker_debugfs_add(struct shrinker *shrinker); -extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, - int *debugfs_id); -extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, - int debugfs_id); extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ -static inline int shrinker_debugfs_add(struct shrinker *shrinker) -{ - return 0; -} -static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, - int *debugfs_id) -{ - *debugfs_id = -1; - return NULL; -} -static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, - int debugfs_id) -{ -} static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { diff --git a/mm/internal.h b/mm/internal.h index 1aed3aa88a21..a57a587e80a3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1443,4 +1443,30 @@ struct vma_prepare { struct vm_area_struct *remove; struct vm_area_struct *remove2; }; + +/* shrinker related functions */ + +#ifdef CONFIG_SHRINKER_DEBUG +extern int shrinker_debugfs_add(struct shrinker *shrinker); +extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id); +extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id); +#else /* CONFIG_SHRINKER_DEBUG */ +static inline int shrinker_debugfs_add(struct shrinker *shrinker) +{ + return 0; +} +static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id) +{ + *debugfs_id = -1; + return NULL; +} +static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id) +{ +} +#endif /* CONFIG_SHRINKER_DEBUG */ + #endif /* __MM_INTERNAL_H */ diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 3ab53fad8876..ee0cddb4530f 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -6,6 +6,8 @@ #include #include +#include "internal.h" + /* defined in vmscan.c */ extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -- Gitee From 26d874833e19c48af1366a82797e3c35d14051b1 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:15 +0800 Subject: [PATCH 037/484] mm: vmscan: move shrinker-related code into a separate file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 96f7b2b9bbb1d680fab1fa96a3958b2d759b881f upstream Conflicts: trivial, shrink_slab is static in upstream Backport-reason: Shrinker cleanup The mm/vmscan.c file is too large, so separate the shrinker-related code from it 
into a separate file. No functional changes. Link: https://lkml.kernel.org/r/20230911092517.64141-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/Makefile | 4 +- mm/internal.h | 2 + mm/shrinker.c | 709 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 701 ------------------------------------------------- 4 files changed, 713 insertions(+), 703 deletions(-) create mode 100644 mm/shrinker.c diff --git a/mm/Makefile b/mm/Makefile index f1258837b9ab..3ddb4127a204 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -48,8 +48,8 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o folio-compat.o \ - readahead.o swap.o truncate.o vmscan.o shmem.o \ - util.o mmzone.o vmstat.o backing-dev.o \ + readahead.o swap.o truncate.o vmscan.o shrinker.o \ + shmem.o util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ compaction.o show_mem.o shmem_quota.o\ interval_tree.o list_lru.o workingset.o \ diff --git a/mm/internal.h b/mm/internal.h index a57a587e80a3..98cd14a88361 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1445,6 +1445,8 @@ struct vma_prepare { }; /* shrinker related functions */ +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); diff --git a/mm/shrinker.c b/mm/shrinker.c new file mode 100644 index 000000000000..043c87ccfab4 --- /dev/null +++ b/mm/shrinker.c @@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "internal.h" + +LIST_HEAD(shrinker_list); +DECLARE_RWSEM(shrinker_rwsem); + +#ifdef CONFIG_MEMCG +static int shrinker_nr_max; + +/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ +static inline int shrinker_map_size(int nr_items) +{ + return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); +} + +static inline int shrinker_defer_size(int nr_items) +{ + return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); +} + +void free_shrinker_info(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct shrinker_info *info; + int 
nid; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + info = rcu_dereference_protected(pn->shrinker_info, true); + kvfree(info); + rcu_assign_pointer(pn->shrinker_info, NULL); + } +} + +int alloc_shrinker_info(struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + int nid, size, ret = 0; + int map_size, defer_size = 0; + + down_write(&shrinker_rwsem); + map_size = shrinker_map_size(shrinker_nr_max); + defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; + for_each_node(nid) { + info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); + if (!info) { + free_shrinker_info(memcg); + ret = -ENOMEM; + break; + } + info->nr_deferred = (atomic_long_t *)(info + 1); + info->map = (void *)info->nr_deferred + defer_size; + info->map_nr_max = shrinker_nr_max; + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); + } + up_write(&shrinker_rwsem); + + return ret; +} + +static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, + int nid) +{ + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); +} + +static int expand_one_shrinker_info(struct mem_cgroup *memcg, + int map_size, int defer_size, + int old_map_size, int old_defer_size, + int new_nr_max) +{ + struct shrinker_info *new, *old; + struct mem_cgroup_per_node *pn; + int nid; + int size = map_size + defer_size; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + old = shrinker_info_protected(memcg, nid); + /* Not yet online memcg */ + if (!old) + return 0; + + /* Already expanded this shrinker_info */ + if (new_nr_max <= old->map_nr_max) + continue; + + new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + if (!new) + return -ENOMEM; + + new->nr_deferred = (atomic_long_t *)(new + 1); + new->map = (void *)new->nr_deferred + defer_size; + new->map_nr_max = new_nr_max; + + /* map: set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_map_size); + memset((void *)new->map + old_map_size, 0, map_size - old_map_size); + /* nr_deferred: copy old values, clear all new values */ + memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); + memset((void *)new->nr_deferred + old_defer_size, 0, + defer_size - old_defer_size); + + rcu_assign_pointer(pn->shrinker_info, new); + kvfree_rcu(old, rcu); + } + + return 0; +} + +static int expand_shrinker_info(int new_id) +{ + int ret = 0; + int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); + int map_size, defer_size = 0; + int old_map_size, old_defer_size = 0; + struct mem_cgroup *memcg; + + if (!root_mem_cgroup) + goto out; + + lockdep_assert_held(&shrinker_rwsem); + + map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + ret = expand_one_shrinker_info(memcg, map_size, defer_size, + old_map_size, old_defer_size, + new_nr_max); + if (ret) { + mem_cgroup_iter_break(NULL, memcg); + goto out; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +out: + if (!ret) + shrinker_nr_max = new_nr_max; + + return ret; +} + +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct shrinker_info *info; + + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { + /* Pairs with smp mb 
in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, info->map); + } + rcu_read_unlock(); + } +} + +static DEFINE_IDR(shrinker_idr); + +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + int id, ret = -ENOMEM; + + if (mem_cgroup_disabled()) + return -ENOSYS; + + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); + if (id < 0) + goto unlock; + + if (id >= shrinker_nr_max) { + if (expand_shrinker_info(id)) { + idr_remove(&shrinker_idr, id); + goto unlock; + } + } + shrinker->id = id; + ret = 0; +unlock: + up_write(&shrinker_rwsem); + return ret; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ + int id = shrinker->id; + + BUG_ON(id < 0); + + lockdep_assert_held(&shrinker_rwsem); + + idr_remove(&shrinker_idr, id); +} + +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); +} + +void reparent_shrinker_deferred(struct mem_cgroup *memcg) +{ + int i, nid; + long nr; + struct mem_cgroup *parent; + struct shrinker_info *child_info, *parent_info; + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* Prevent from concurrent shrinker_info expand */ + down_read(&shrinker_rwsem); + for_each_node(nid) { + child_info = shrinker_info_protected(memcg, nid); + parent_info = shrinker_info_protected(parent, nid); + for (i = 0; i < child_info->map_nr_max; i++) { + nr = atomic_long_read(&child_info->nr_deferred[i]); + atomic_long_add(nr, &parent_info->nr_deferred[i]); + } + } + up_read(&shrinker_rwsem); +} +#else +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return -ENOSYS; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} + +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} +#endif /* CONFIG_MEMCG */ + +static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + +#define SHRINK_BATCH 128 + +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, int priority) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long freeable; + long nr; + long new_nr; + long batch_size = shrinker->batch ? 
shrinker->batch + : SHRINK_BATCH; + long scanned = 0, next_deferred; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0 || freeable == SHRINK_EMPTY) + return freeable; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = xchg_nr_deferred(shrinker, shrinkctl); + + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } + + total_scan = nr >> priority; + total_scan += delta; + total_scan = min(total_scan, (2 * freeable)); + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + freeable, delta, total_scan, priority); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; + scanned += shrinkctl->nr_scanned; + + cond_resched(); + } + + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. + */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. 
+ */ + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); + + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); + return freed; +} + +#ifdef CONFIG_MEMCG +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + struct shrinker_info *info; + unsigned long ret, freed = 0; + int i; + + if (!mem_cgroup_online(memcg)) + return 0; + + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); + if (unlikely(!info)) + goto unlock; + + for_each_set_bit(i, info->map, info->map_nr_max) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + + shrinker = idr_find(&shrinker_idr, i); + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { + if (!shrinker) + clear_bit(i, info->map); + continue; + } + + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_online() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(i, info->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. + * The memory barrier here pairs with the barrier in + * set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + set_shrinker_bit(memcg, nid, i); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } +unlock: + up_read(&shrinker_rwsem); + return freed; +} +#else /* !CONFIG_MEMCG */ +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + return 0; +} +#endif /* CONFIG_MEMCG */ + +/** + * shrink_slab - shrink slab caches + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target + * @priority: the reclaim priority + * + * Call the shrink functions to age shrinkable caches. + * + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. + * + * @memcg specifies the memory cgroup to target. Unaware shrinkers + * are called only if it is the root cgroup. + * + * @priority is sc->priority, we take the number of objects and >> by priority + * in order to get the scan target. + * + * Returns the number of reclaimed slab objects. + */ +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority) +{ + unsigned long ret, freed = 0; + struct shrinker *shrinker; + + /* + * The root memcg might be allocated even though memcg is disabled + * via "cgroup_disable=memory" boot parameter. This could make + * mem_cgroup_is_root() return false, then just run memcg slab + * shrink, but skip global shrink. This may result in premature + * oom. 
+ */ + if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) + return shrink_slab_memcg(gfp_mask, nid, memcg, priority); + + if (!down_read_trylock(&shrinker_rwsem)) + goto out; + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + freed += ret; + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } + + up_read(&shrinker_rwsem); +out: + cond_resched(); + return freed; +} + +/* + * Add a shrinker callback to be called from the vm. + */ +static int __prealloc_shrinker(struct shrinker *shrinker) +{ + unsigned int size; + int err; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err != -ENOSYS) + return err; + + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + } + + size = sizeof(*shrinker->nr_deferred); + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + + return 0; +} + +#ifdef CONFIG_SHRINKER_DEBUG +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __prealloc_shrinker(shrinker); + if (err) { + kfree_const(shrinker->name); + shrinker->name = NULL; + } + + return err; +} +#else +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + return __prealloc_shrinker(shrinker); +} +#endif + +void free_prealloced_shrinker(struct shrinker *shrinker) +{ +#ifdef CONFIG_SHRINKER_DEBUG + kfree_const(shrinker->name); + shrinker->name = NULL; +#endif + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); + unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + return; + } + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; +} + +void register_shrinker_prepared(struct shrinker *shrinker) +{ + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + shrinker->flags |= SHRINKER_REGISTERED; + shrinker_debugfs_add(shrinker); + up_write(&shrinker_rwsem); +} + +static int __register_shrinker(struct shrinker *shrinker) +{ + int err = __prealloc_shrinker(shrinker); + + if (err) + return err; + register_shrinker_prepared(shrinker); + return 0; +} + +#ifdef CONFIG_SHRINKER_DEBUG +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __register_shrinker(shrinker); + if (err) { + kfree_const(shrinker->name); + shrinker->name = NULL; + } + return err; +} +#else +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
+{ + return __register_shrinker(shrinker); +} +#endif +EXPORT_SYMBOL(register_shrinker); + +/* + * Remove one + */ +void unregister_shrinker(struct shrinker *shrinker) +{ + struct dentry *debugfs_entry; + int debugfs_id; + + if (!(shrinker->flags & SHRINKER_REGISTERED)) + return; + + down_write(&shrinker_rwsem); + list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); + up_write(&shrinker_rwsem); + + shrinker_debugfs_remove(debugfs_entry, debugfs_id); + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; +} +EXPORT_SYMBOL(unregister_shrinker); + +/** + * synchronize_shrinkers - Wait for all running shrinkers to complete. + * + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. + */ +void synchronize_shrinkers(void) +{ + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL(synchronize_shrinkers); diff --git a/mm/vmscan.c b/mm/vmscan.c index ad0ab4489605..4659cb111b78 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -229,246 +228,7 @@ unsigned int sysctl_vm_swapcache_fastfree __read_mostly; #define sysctl_vm_swapcache_fastfree 0 #endif -LIST_HEAD(shrinker_list); -DECLARE_RWSEM(shrinker_rwsem); - #ifdef CONFIG_MEMCG -static int shrinker_nr_max; - -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ -static inline int shrinker_map_size(int nr_items) -{ - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); -} - -static inline int shrinker_defer_size(int nr_items) -{ - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); -} - -static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, - int nid) -{ - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); -} - -static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int map_size, int defer_size, - int old_map_size, int old_defer_size, - int new_nr_max) -{ - struct shrinker_info *new, *old; - struct mem_cgroup_per_node *pn; - int nid; - int size = map_size + defer_size; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - old = shrinker_info_protected(memcg, nid); - /* Not yet online memcg */ - if (!old) - return 0; - - /* Already expanded this shrinker_info */ - if (new_nr_max <= old->map_nr_max) - continue; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - new->nr_deferred = (atomic_long_t *)(new + 1); - new->map = (void *)new->nr_deferred + defer_size; - new->map_nr_max = new_nr_max; - - /* map: set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_map_size); - memset((void *)new->map + old_map_size, 0, map_size - old_map_size); - /* nr_deferred: copy old values, clear all new values */ - memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); - memset((void *)new->nr_deferred + old_defer_size, 0, - defer_size - old_defer_size); - - rcu_assign_pointer(pn->shrinker_info, new); - kvfree_rcu(old, rcu); - } - - return 0; -} - -void free_shrinker_info(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct shrinker_info *info; - int nid; - - for_each_node(nid) { - pn = 
memcg->nodeinfo[nid]; - info = rcu_dereference_protected(pn->shrinker_info, true); - kvfree(info); - rcu_assign_pointer(pn->shrinker_info, NULL); - } -} - -int alloc_shrinker_info(struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - int nid, size, ret = 0; - int map_size, defer_size = 0; - - down_write(&shrinker_rwsem); - map_size = shrinker_map_size(shrinker_nr_max); - defer_size = shrinker_defer_size(shrinker_nr_max); - size = map_size + defer_size; - for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); - if (!info) { - free_shrinker_info(memcg); - ret = -ENOMEM; - break; - } - info->nr_deferred = (atomic_long_t *)(info + 1); - info->map = (void *)info->nr_deferred + defer_size; - info->map_nr_max = shrinker_nr_max; - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); - } - up_write(&shrinker_rwsem); - - return ret; -} - -static int expand_shrinker_info(int new_id) -{ - int ret = 0; - int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); - int map_size, defer_size = 0; - int old_map_size, old_defer_size = 0; - struct mem_cgroup *memcg; - - if (!root_mem_cgroup) - goto out; - - lockdep_assert_held(&shrinker_rwsem); - - map_size = shrinker_map_size(new_nr_max); - defer_size = shrinker_defer_size(new_nr_max); - old_map_size = shrinker_map_size(shrinker_nr_max); - old_defer_size = shrinker_defer_size(shrinker_nr_max); - - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - ret = expand_one_shrinker_info(memcg, map_size, defer_size, - old_map_size, old_defer_size, - new_nr_max); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto out; - } - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); -out: - if (!ret) - shrinker_nr_max = new_nr_max; - - return ret; -} - -void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct shrinker_info *info; - - rcu_read_lock(); - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); - if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, info->map); - } - rcu_read_unlock(); - } -} - -static DEFINE_IDR(shrinker_idr); - -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - int id, ret = -ENOMEM; - - if (mem_cgroup_disabled()) - return -ENOSYS; - - down_write(&shrinker_rwsem); - /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); - if (id < 0) - goto unlock; - - if (id >= shrinker_nr_max) { - if (expand_shrinker_info(id)) { - idr_remove(&shrinker_idr, id); - goto unlock; - } - } - shrinker->id = id; - ret = 0; -unlock: - up_write(&shrinker_rwsem); - return ret; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ - int id = shrinker->id; - - BUG_ON(id < 0); - - lockdep_assert_held(&shrinker_rwsem); - - idr_remove(&shrinker_idr, id); -} - -static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - - info = shrinker_info_protected(memcg, nid); - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); -} - -static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - - info = shrinker_info_protected(memcg, nid); - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); -} - -void reparent_shrinker_deferred(struct mem_cgroup 
*memcg) -{ - int i, nid; - long nr; - struct mem_cgroup *parent; - struct shrinker_info *child_info, *parent_info; - - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - /* Prevent from concurrent shrinker_info expand */ - down_read(&shrinker_rwsem); - for_each_node(nid) { - child_info = shrinker_info_protected(memcg, nid); - parent_info = shrinker_info_protected(parent, nid); - for (i = 0; i < child_info->map_nr_max; i++) { - nr = atomic_long_read(&child_info->nr_deferred[i]); - atomic_long_add(nr, &parent_info->nr_deferred[i]); - } - } - up_read(&shrinker_rwsem); -} /* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) @@ -527,27 +287,6 @@ static bool writeback_throttling_sane(struct scan_control *sc) iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - return -ENOSYS; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ -} - -static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - return 0; -} - -static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - return 0; -} - static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -616,39 +355,6 @@ static void flush_reclaim_state(struct scan_control *sc) } } -static long xchg_nr_deferred(struct shrinker *shrinker, - struct shrink_control *sc) -{ - int nid = sc->nid; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - - if (sc->memcg && - (shrinker->flags & SHRINKER_MEMCG_AWARE)) - return xchg_nr_deferred_memcg(nid, shrinker, - sc->memcg); - - return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); -} - - -static long add_nr_deferred(long nr, struct shrinker *shrinker, - struct shrink_control *sc) -{ - int nid = sc->nid; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - - if (sc->memcg && - (shrinker->flags & SHRINKER_MEMCG_AWARE)) - return add_nr_deferred_memcg(nr, nid, shrinker, - sc->memcg); - - return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); -} - static bool can_demote(int nid, struct scan_control *sc) { if (!numa_demotion_enabled) @@ -737,413 +443,6 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, return size; } -/* - * Add a shrinker callback to be called from the vm. - */ -static int __prealloc_shrinker(struct shrinker *shrinker) -{ - unsigned int size; - int err; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); - if (err != -ENOSYS) - return err; - - shrinker->flags &= ~SHRINKER_MEMCG_AWARE; - } - - size = sizeof(*shrinker->nr_deferred); - if (shrinker->flags & SHRINKER_NUMA_AWARE) - size *= nr_node_ids; - - shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); - if (!shrinker->nr_deferred) - return -ENOMEM; - - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __prealloc_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - - return err; -} -#else -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
-{ - return __prealloc_shrinker(shrinker); -} -#endif - -void free_prealloced_shrinker(struct shrinker *shrinker) -{ -#ifdef CONFIG_SHRINKER_DEBUG - kfree_const(shrinker->name); - shrinker->name = NULL; -#endif - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - down_write(&shrinker_rwsem); - unregister_memcg_shrinker(shrinker); - up_write(&shrinker_rwsem); - return; - } - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} - -void register_shrinker_prepared(struct shrinker *shrinker) -{ - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - shrinker->flags |= SHRINKER_REGISTERED; - shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); -} - -static int __register_shrinker(struct shrinker *shrinker) -{ - int err = __prealloc_shrinker(shrinker); - - if (err) - return err; - register_shrinker_prepared(shrinker); - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __register_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - return err; -} -#else -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __register_shrinker(shrinker); -} -#endif -EXPORT_SYMBOL(register_shrinker); - -/* - * Remove one - */ -void unregister_shrinker(struct shrinker *shrinker) -{ - struct dentry *debugfs_entry; - int debugfs_id; - - if (!(shrinker->flags & SHRINKER_REGISTERED)) - return; - - down_write(&shrinker_rwsem); - list_del(&shrinker->list); - shrinker->flags &= ~SHRINKER_REGISTERED; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - up_write(&shrinker_rwsem); - - shrinker_debugfs_remove(debugfs_entry, debugfs_id); - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} -EXPORT_SYMBOL(unregister_shrinker); - -/** - * synchronize_shrinkers - Wait for all running shrinkers to complete. - * - * This is equivalent to calling unregister_shrink() and register_shrinker(), - * but atomically and with less overhead. This is useful to guarantee that all - * shrinker invocations have seen an update, before freeing memory, similar to - * rcu. - */ -void synchronize_shrinkers(void) -{ - down_write(&shrinker_rwsem); - up_write(&shrinker_rwsem); -} -EXPORT_SYMBOL(synchronize_shrinkers); - -#define SHRINK_BATCH 128 - -static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, int priority) -{ - unsigned long freed = 0; - unsigned long long delta; - long total_scan; - long freeable; - long nr; - long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; - long scanned = 0, next_deferred; - - freeable = shrinker->count_objects(shrinker, shrinkctl); - if (freeable == 0 || freeable == SHRINK_EMPTY) - return freeable; - - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. - */ - nr = xchg_nr_deferred(shrinker, shrinkctl); - - if (shrinker->seeks) { - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); - } else { - /* - * These objects don't require any IO to create. 
Trim - * them aggressively under memory pressure to keep - * them from causing refetches in the IO caches. - */ - delta = freeable / 2; - } - - total_scan = nr >> priority; - total_scan += delta; - total_scan = min(total_scan, (2 * freeable)); - - trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); - - /* - * Normally, we should not scan less than batch_size objects in one - * pass to avoid too frequent shrinker calls, but if the slab has less - * than batch_size objects in total and we are really tight on memory, - * we will try to reclaim all available objects, otherwise we can end - * up failing allocations although there are plenty of reclaimable - * objects spread over several slabs with usage less than the - * batch_size. - * - * We detect the "tight on memory" situations by looking at the total - * number of objects we want to scan (total_scan). If it is greater - * than the total number of objects on slab (freeable), we must be - * scanning at high prio and therefore should try to reclaim as much as - * possible. - */ - while (total_scan >= batch_size || - total_scan >= freeable) { - unsigned long ret; - unsigned long nr_to_scan = min(batch_size, total_scan); - - shrinkctl->nr_to_scan = nr_to_scan; - shrinkctl->nr_scanned = nr_to_scan; - ret = shrinker->scan_objects(shrinker, shrinkctl); - if (ret == SHRINK_STOP) - break; - freed += ret; - - count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); - total_scan -= shrinkctl->nr_scanned; - scanned += shrinkctl->nr_scanned; - - cond_resched(); - } - - /* - * The deferred work is increased by any new work (delta) that wasn't - * done, decreased by old deferred work that was done now. - * - * And it is capped to two times of the freeable items. - */ - next_deferred = max_t(long, (nr + delta - scanned), 0); - next_deferred = min(next_deferred, (2 * freeable)); - - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. - */ - new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); - return freed; -} - -#ifdef CONFIG_MEMCG -static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) -{ - struct shrinker_info *info; - unsigned long ret, freed = 0; - int i; - - if (!mem_cgroup_online(memcg)) - return 0; - - if (!down_read_trylock(&shrinker_rwsem)) - return 0; - - info = shrinker_info_protected(memcg, nid); - if (unlikely(!info)) - goto unlock; - - for_each_set_bit(i, info->map, info->map_nr_max) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - struct shrinker *shrinker; - - shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(i, info->map); - continue; - } - - /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) - continue; - - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) { - clear_bit(i, info->map); - /* - * After the shrinker reported that it had no objects to - * free, but before we cleared the corresponding bit in - * the memcg shrinker map, a new object might have been - * added. To make sure, we have the bit set in this - * case, we invoke the shrinker one more time and reset - * the bit if it reports that it is not empty anymore. 
- * The memory barrier here pairs with the barrier in - * set_shrinker_bit(): - * - * list_lru_add() shrink_slab_memcg() - * list_add_tail() clear_bit() - * - * set_bit() do_shrink_slab() - */ - smp_mb__after_atomic(); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - else - set_shrinker_bit(memcg, nid, i); - } - freed += ret; - - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } - } -unlock: - up_read(&shrinker_rwsem); - return freed; -} -#else /* CONFIG_MEMCG */ -static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) -{ - return 0; -} -#endif /* CONFIG_MEMCG */ - -/** - * shrink_slab - shrink slab caches - * @gfp_mask: allocation context - * @nid: node whose slab caches to target - * @memcg: memory cgroup whose slab caches to target - * @priority: the reclaim priority - * - * Call the shrink functions to age shrinkable caches. - * - * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, - * unaware shrinkers will receive a node id of 0 instead. - * - * @memcg specifies the memory cgroup to target. Unaware shrinkers - * are called only if it is the root cgroup. - * - * @priority is sc->priority, we take the number of objects and >> by priority - * in order to get the scan target. - * - * Returns the number of reclaimed slab objects. - */ -unsigned long shrink_slab(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, - int priority) -{ - unsigned long ret, freed = 0; - struct shrinker *shrinker; - - /* - * The root memcg might be allocated even though memcg is disabled - * via "cgroup_disable=memory" boot parameter. This could make - * mem_cgroup_is_root() return false, then just run memcg slab - * shrink, but skip global shrink. This may result in premature - * oom. - */ - if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) - return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - - if (!down_read_trylock(&shrinker_rwsem)) - goto out; - - list_for_each_entry(shrinker, &shrinker_list, list) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - freed += ret; - /* - * Bail out if someone want to register a new shrinker to - * prevent the registration from being stalled for long periods - * by parallel ongoing shrinking. - */ - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } - } - - up_read(&shrinker_rwsem); -out: - cond_resched(); - return freed; -} - static unsigned long drop_slab_node(int nid) { unsigned long freed = 0; -- Gitee From 216cfad5a615bb37a612fa7259ad3414e4a60e9f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:16 +0800 Subject: [PATCH 038/484] mm: shrinker: remove redundant shrinker_rwsem in debugfs operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1dd49e58f966b1eecd935dc28458a8369ae94ad1 upstream Conflicts: none Backport-reason: Shrinker cleanup debugfs_remove_recursive() will wait for debugfs_file_put() to return, so the shrinker will not be freed when doing debugfs operations (such as shrinker_debugfs_count_show() and shrinker_debugfs_scan_write()), so there is no need to hold shrinker_rwsem during debugfs operations. 
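For context, the guarantee relied on here is generic debugfs behaviour rather than anything shrinker specific: while a debugfs file operation runs it holds a file reference, and debugfs_remove_recursive() does not return until that reference is dropped via debugfs_file_put(). A minimal, hypothetical sketch of the resulting remove-then-free teardown pattern (the foo_* names are illustrative and not part of this series):

```c
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

struct foo {
	unsigned long nr_objects;
	struct dentry *dir;
};

static int foo_count_show(struct seq_file *m, void *v)
{
	struct foo *foo = m->private;

	/*
	 * No lock against foo_destroy() is needed here: the debugfs core
	 * holds a file reference for the duration of this callback.
	 */
	seq_printf(m, "%lu\n", foo->nr_objects);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(foo_count);

static struct foo *foo_create(void)
{
	struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

	if (!foo)
		return NULL;
	foo->dir = debugfs_create_dir("foo", NULL);
	debugfs_create_file("count", 0400, foo->dir, foo, &foo_count_fops);
	return foo;
}

static void foo_destroy(struct foo *foo)
{
	/*
	 * Waits for any in-flight foo_count_show() to complete (i.e. for
	 * debugfs_file_put()), so the kfree() below cannot race with it.
	 */
	debugfs_remove_recursive(foo->dir);
	kfree(foo);
}
```

The shrinker debugfs files follow the same pattern, which is why holding shrinker_rwsem in the count/scan handlers adds nothing.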
Link: https://lkml.kernel.org/r/20230911092517.64141-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shrinker_debug.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index ee0cddb4530f..e4ce509f619e 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -51,17 +51,12 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret, nid; + int ret = 0, nid; count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - ret = down_read_killable(&shrinker_rwsem); - if (ret) { - kfree(count_per_node); - return ret; - } rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -94,7 +89,6 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); rcu_read_unlock(); - up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -119,7 +113,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, struct mem_cgroup *memcg = NULL; int nid; char kbuf[72]; - ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? 
size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -148,12 +141,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - ret = down_read_killable(&shrinker_rwsem); - if (ret) { - mem_cgroup_put(memcg); - return ret; - } - sc.nid = nid; sc.memcg = memcg; sc.nr_to_scan = nr_to_scan; @@ -161,7 +148,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; -- Gitee From 4f956b6c2f30cd261d34c822ecd40aee0d6a1496 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:17 +0800 Subject: [PATCH 039/484] drm/ttm: introduce pool_shrink_rwsem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0b2f5ea1aa39c0ed34bdadb53faf519e3d84ac4a upstream Conflicts: none Backport-reason: Shrinker cleanup Currently, synchronize_shrinkers() is only used by TTM pool. It only requires that no shrinkers run in parallel. After we use RCU+refcount method to implement the lockless slab shrink, we can not use shrinker_rwsem or synchronize_rcu() to guarantee that all shrinker invocations have seen an update before freeing memory. So we introduce a new pool_shrink_rwsem to implement a private ttm_pool_synchronize_shrinkers(), so as to achieve the same purpose. Link: https://lkml.kernel.org/r/20230911092517.64141-5-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Christian König Acked-by: Daniel Vetter Cc: Christian Brauner Cc: Chuck Lever Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. 
Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/gpu/drm/ttm/ttm_pool.c | 17 ++++++++++++++++- include/linux/shrinker.h | 1 - mm/shrinker.c | 15 --------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index ae7900256759..08073d3bd5c2 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -74,6 +74,7 @@ static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS]; static spinlock_t shrinker_lock; static struct list_head shrinker_list; static struct shrinker mm_shrinker; +static DECLARE_RWSEM(pool_shrink_rwsem); /* Allocate pages of size 1 << order with the given gfp_flags */ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags, @@ -323,6 +324,7 @@ static unsigned int ttm_pool_shrink(void) unsigned int num_pages; struct page *p; + down_read(&pool_shrink_rwsem); spin_lock(&shrinker_lock); pt = list_first_entry(&shrinker_list, typeof(*pt), shrinker_list); list_move_tail(&pt->shrinker_list, &shrinker_list); @@ -335,6 +337,7 @@ static unsigned int ttm_pool_shrink(void) } else { num_pages = 0; } + up_read(&pool_shrink_rwsem); return num_pages; } @@ -584,6 +587,18 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, } EXPORT_SYMBOL(ttm_pool_init); +/** + * ttm_pool_synchronize_shrinkers - Wait for all running shrinkers to complete. + * + * This is useful to guarantee that all shrinker invocations have seen an + * update, before freeing memory, similar to rcu. + */ +static void ttm_pool_synchronize_shrinkers(void) +{ + down_write(&pool_shrink_rwsem); + up_write(&pool_shrink_rwsem); +} + /** * ttm_pool_fini - Cleanup a pool * @@ -611,7 +626,7 @@ void ttm_pool_fini(struct ttm_pool *pool) /* We removed the pool types from the LRU, but we need to also make sure * that no shrinker is concurrently freeing pages from the pool. */ - synchronize_shrinkers(); + ttm_pool_synchronize_shrinkers(); } EXPORT_SYMBOL(ttm_pool_fini); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 1b0389a4d5ac..63476df0d64d 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -107,7 +107,6 @@ extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, const char *fmt, ...); extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); -extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, diff --git a/mm/shrinker.c b/mm/shrinker.c index 043c87ccfab4..a16cd448b924 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -692,18 +692,3 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = NULL; } EXPORT_SYMBOL(unregister_shrinker); - -/** - * synchronize_shrinkers - Wait for all running shrinkers to complete. - * - * This is equivalent to calling unregister_shrink() and register_shrinker(), - * but atomically and with less overhead. This is useful to guarantee that all - * shrinker invocations have seen an update, before freeing memory, similar to - * rcu. 
- */ -void synchronize_shrinkers(void) -{ - down_write(&shrinker_rwsem); - up_write(&shrinker_rwsem); -} -EXPORT_SYMBOL(synchronize_shrinkers); -- Gitee From dfc57291ce31121bdcdd180b938e4370bfe7b3b8 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:00 +0800 Subject: [PATCH 040/484] mm: shrinker: add infrastructure for dynamically allocating shrinker commit c42d50aefd17a6bad3ed617769edbbb579137545 upstream Conflicts: none Backport-reason: Shrinker lockless Patch series "use refcount+RCU method to implement lockless slab shrink", v6. 1. Background ============= We used to implement the lockless slab shrink with SRCU [1], but then kernel test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test case [2], so we reverted it [3]. This patch series aims to re-implement the lockless slab shrink using the refcount+RCU method proposed by Dave Chinner [4]. [1]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/ [2]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [3]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/ [4]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ 2. Implementation ================= Currently, the shrinker instances can be divided into the following three types: a) global shrinker instance statically defined in the kernel, such as workingset_shadow_shrinker. b) global shrinker instance statically defined in the kernel modules, such as mmu_shrinker in x86. c) shrinker instance embedded in other structures. For case a, the memory of shrinker instance is never freed. For case b, the memory of shrinker instance will be freed after synchronize_rcu() when the module is unloaded. For case c, the memory of shrinker instance will be freed along with the structure it is embedded in. In preparation for implementing lockless slab shrink, we need to dynamically allocate those shrinker instances in case c, then the memory can be dynamically freed alone by calling kfree_rcu(). This patchset adds the following new APIs for dynamically allocating shrinker, and add a private_data field to struct shrinker to record and get the original embedded structure. 1. shrinker_alloc() 2. shrinker_register() 3. shrinker_free() In order to simplify shrinker-related APIs and make shrinker more independent of other kernel mechanisms, this patchset uses the above APIs to convert all shrinkers (including case a and b) to dynamically allocated, and then remove all existing APIs. This will also have another advantage mentioned by Dave Chinner: ``` The other advantage of this is that it will break all the existing out of tree code and third party modules using the old API and will no longer work with a kernel using lockless slab shrinkers. They need to break (both at the source and binary levels) to stop bad things from happening due to using uncoverted shrinkers in the new setup. ``` Then we free the shrinker by calling call_rcu(), and use rcu_read_{lock,unlock}() to ensure that the shrinker instance is valid. And the shrinker::refcount mechanism ensures that the shrinker instance will not be run again after unregistration. So the structure that records the pointer of shrinker instance can be safely freed without waiting for the RCU read-side critical section. In this way, while we implement the lockless slab shrink, we don't need to be blocked in unregister_shrinker() to wait RCU read-side critical section. 
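For reference, the intended calling convention for the three new APIs (as used by the conversions later in this series) looks roughly like the sketch below; the foo_* identifiers and the foo_cache_*() helpers are illustrative only:

```c
static struct shrinker *foo_shrinker;

static unsigned long foo_count(struct shrinker *shrink,
			       struct shrink_control *sc)
{
	return foo_cache_count();		/* hypothetical helper */
}

static unsigned long foo_scan(struct shrinker *shrink,
			      struct shrink_control *sc)
{
	return foo_cache_trim(sc->nr_to_scan);	/* hypothetical helper */
}

static int __init foo_init(void)
{
	foo_shrinker = shrinker_alloc(0, "foo-cache");
	if (!foo_shrinker)
		return -ENOMEM;

	foo_shrinker->count_objects = foo_count;
	foo_shrinker->scan_objects = foo_scan;
	foo_shrinker->seeks = DEFAULT_SEEKS;

	/* Makes the shrinker visible to reclaim. */
	shrinker_register(foo_shrinker);
	return 0;
}

static void __exit foo_exit(void)
{
	/* Unregisters (if still registered) and frees the shrinker. */
	shrinker_free(foo_shrinker);
}
```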
PATCH 1: introduce new APIs PATCH 2~38: convert all shrinnkers to use new APIs PATCH 39: remove old APIs PATCH 40~41: some cleanups and preparations PATCH 42-43: implement the lockless slab shrink PATCH 44~45: convert shrinker_rwsem to mutex 3. Testing ========== 3.1 slab shrink stress test --------------------------- We can reproduce the down_read_trylock() hotspot through the following script: ``` DIR="/root/shrinker/memcg/mnt" do_create() { mkdir -p /sys/fs/cgroup/memory/test echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes for i in `seq 0 $1`; do mkdir -p /sys/fs/cgroup/memory/test/$i; echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; mkdir -p $DIR/$i; done } do_mount() { for i in `seq $1 $2`; do mount -t tmpfs $i $DIR/$i; done } do_touch() { for i in `seq $1 $2`; do echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & done } case "$1" in touch) do_touch $2 $3 ;; test) do_create 4000 do_mount 0 4000 do_touch 0 3000 ;; *) exit 1 ;; esac ``` Save the above script, then run test and touch commands. Then we can use the following perf command to view hotspots: perf top -U -F 999 1) Before applying this patchset: 33.15% [kernel] [k] down_read_trylock 25.38% [kernel] [k] shrink_slab 21.75% [kernel] [k] up_read 4.45% [kernel] [k] _find_next_bit 2.27% [kernel] [k] do_shrink_slab 1.80% [kernel] [k] intel_idle_irq 1.79% [kernel] [k] shrink_lruvec 0.67% [kernel] [k] xas_descend 0.41% [kernel] [k] mem_cgroup_iter 0.40% [kernel] [k] shrink_node 0.38% [kernel] [k] list_lru_count_one 2) After applying this patchset: 64.56% [kernel] [k] shrink_slab 12.18% [kernel] [k] do_shrink_slab 3.30% [kernel] [k] __rcu_read_unlock 2.61% [kernel] [k] shrink_lruvec 2.49% [kernel] [k] __rcu_read_lock 1.93% [kernel] [k] intel_idle_irq 0.89% [kernel] [k] shrink_node 0.81% [kernel] [k] mem_cgroup_iter 0.77% [kernel] [k] mem_cgroup_calculate_protection 0.66% [kernel] [k] list_lru_count_one We can see that the first perf hotspot becomes shrink_slab, which is what we expect. 3.2 registration and unregistration stress test ----------------------------------------------- Run the command below to test: stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 & 1) Before applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 473062 60.00 8.00 279.13 7884.12 1647.59 for a 60.01s run time: 1440.34s available CPU time 7.99s user time ( 0.55%) 279.13s system time ( 19.38%) 287.12s total time ( 19.93%) load average: 7.12 2.99 1.15 successful run completed in 60.01s (1 min, 0.01 secs) 2) After applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 477165 60.00 8.13 281.34 7952.55 1648.40 for a 60.01s run time: 1440.33s available CPU time 8.12s user time ( 0.56%) 281.34s system time ( 19.53%) 289.46s total time ( 20.10%) load average: 6.98 3.03 1.19 successful run completed in 60.01s (1 min, 0.01 secs) We can see that the ops/s has hardly changed. This patch (of 45): Currently, the shrinker instances can be divided into the following three types: a) global shrinker instance statically defined in the kernel, such as workingset_shadow_shrinker. b) global shrinker instance statically defined in the kernel modules, such as mmu_shrinker in x86. 
c) shrinker instance embedded in other structures. For case a, the memory of shrinker instance is never freed. For case b, the memory of shrinker instance will be freed after synchronize_rcu() when the module is unloaded. For case c, the memory of shrinker instance will be freed along with the structure it is embedded in. In preparation for implementing lockless slab shrink, we need to dynamically allocate those shrinker instances in case c, then the memory can be dynamically freed alone by calling kfree_rcu(). So this commit adds the following new APIs for dynamically allocating shrinker, and add a private_data field to struct shrinker to record and get the original embedded structure. 1. shrinker_alloc() Used to allocate shrinker instance itself and related memory, it will return a pointer to the shrinker instance on success and NULL on failure. 2. shrinker_register() Used to register the shrinker instance, which is same as the current register_shrinker_prepared(). 3. shrinker_free() Used to unregister (if needed) and free the shrinker instance. In order to simplify shrinker-related APIs and make shrinker more independent of other kernel mechanisms, subsequent submissions will use the above API to convert all shrinkers (including case a and b) to dynamically allocated, and then remove all existing APIs. This will also have another advantage mentioned by Dave Chinner: ``` The other advantage of this is that it will break all the existing out of tree code and third party modules using the old API and will no longer work with a kernel using lockless slab shrinkers. They need to break (both at the source and binary levels) to stop bad things from happening due to using unconverted shrinkers in the new setup. ``` [zhengqi.arch@bytedance.com: mm: shrinker: some cleanup] Link: https://lkml.kernel.org/r/20230919024607.65463-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Koenig Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. 
Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shrinker.h | 19 +++++-- mm/internal.h | 22 ++++++++ mm/shrinker.c | 106 +++++++++++++++++++++++++++++++++++++++ mm/shrinker_debug.c | 3 -- 4 files changed, 142 insertions(+), 8 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 63476df0d64d..484724cabaae 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -71,6 +71,8 @@ struct shrinker { int seeks; /* seeks to recreate an obj */ unsigned flags; + void *private_data; + /* These are for internal use */ struct list_head list; #ifdef CONFIG_MEMCG @@ -90,15 +92,22 @@ struct shrinker { }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ -/* Flags */ -#define SHRINKER_REGISTERED (1 << 0) -#define SHRINKER_NUMA_AWARE (1 << 1) -#define SHRINKER_MEMCG_AWARE (1 << 2) +/* Internal flags */ +#define SHRINKER_REGISTERED BIT(0) +#define SHRINKER_ALLOCATED BIT(1) + +/* Flags for users to use */ +#define SHRINKER_NUMA_AWARE BIT(2) +#define SHRINKER_MEMCG_AWARE BIT(3) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 3) +#define SHRINKER_NONSLAB BIT(4) + +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); +void shrinker_register(struct shrinker *shrinker); +void shrinker_free(struct shrinker *shrinker); extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...); diff --git a/mm/internal.h b/mm/internal.h index 98cd14a88361..54d02bc1c44b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1449,6 +1449,20 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG +static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, + const char *fmt, va_list ap) +{ + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + + return shrinker->name ? 0 : -ENOMEM; +} + +static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) +{ + kfree_const(shrinker->name); + shrinker->name = NULL; +} + extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); @@ -1459,6 +1473,14 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } +static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, + const char *fmt, va_list ap) +{ + return 0; +} +static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) +{ +} static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { diff --git a/mm/shrinker.c b/mm/shrinker.c index a16cd448b924..d1032a4d5684 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -550,6 +550,112 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, return freed; } +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) 
+{ + struct shrinker *shrinker; + unsigned int size; + va_list ap; + int err; + + shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL); + if (!shrinker) + return NULL; + + va_start(ap, fmt); + err = shrinker_debugfs_name_alloc(shrinker, fmt, ap); + va_end(ap); + if (err) + goto err_name; + + shrinker->flags = flags | SHRINKER_ALLOCATED; + shrinker->seeks = DEFAULT_SEEKS; + + if (flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err == -ENOSYS) { + /* Memcg is not supported, fallback to non-memcg-aware shrinker. */ + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + goto non_memcg; + } + + if (err) + goto err_flags; + + return shrinker; + } + +non_memcg: + /* + * The nr_deferred is available on per memcg level for memcg aware + * shrinkers, so only allocate nr_deferred in the following cases: + * - non-memcg-aware shrinkers + * - !CONFIG_MEMCG + * - memcg is disabled by kernel command line + */ + size = sizeof(*shrinker->nr_deferred); + if (flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + goto err_flags; + + return shrinker; + +err_flags: + shrinker_debugfs_name_free(shrinker); +err_name: + kfree(shrinker); + return NULL; +} +EXPORT_SYMBOL_GPL(shrinker_alloc); + +void shrinker_register(struct shrinker *shrinker) +{ + if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) { + pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker"); + return; + } + + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + shrinker->flags |= SHRINKER_REGISTERED; + shrinker_debugfs_add(shrinker); + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL_GPL(shrinker_register); + +void shrinker_free(struct shrinker *shrinker) +{ + struct dentry *debugfs_entry = NULL; + int debugfs_id; + + if (!shrinker) + return; + + down_write(&shrinker_rwsem); + if (shrinker->flags & SHRINKER_REGISTERED) { + list_del(&shrinker->list); + debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); + shrinker->flags &= ~SHRINKER_REGISTERED; + } + + shrinker_debugfs_name_free(shrinker); + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + + if (debugfs_entry) + shrinker_debugfs_remove(debugfs_entry, debugfs_id); + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; + + kfree(shrinker); +} +EXPORT_SYMBOL_GPL(shrinker_free); + /* * Add a shrinker callback to be called from the vm. */ diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index e4ce509f619e..24aebe7c24cc 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -241,9 +241,6 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, lockdep_assert_held(&shrinker_rwsem); - kfree_const(shrinker->name); - shrinker->name = NULL; - *debugfs_id = entry ? shrinker->debugfs_id : -1; shrinker->debugfs_entry = NULL; -- Gitee From 19d479287dcb4261e6673fad737bbad28b9363d9 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:01 +0800 Subject: [PATCH 041/484] kvm: mmu: dynamically allocate the x86-mmu shrinker commit e5985c40987640717e3985e2ae02bf707c12f74a upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the x86-mmu shrinker. 
Link: https://lkml.kernel.org/r/20230911094444.68966-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- arch/x86/kvm/mmu/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1abee3f522d2..ab89ad346b4b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6796,11 +6796,7 @@ static unsigned long mmu_shrink_count(struct shrinker *shrink, return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } -static struct shrinker mmu_shrinker = { - .count_objects = mmu_shrink_count, - .scan_objects = mmu_shrink_scan, - .seeks = DEFAULT_SEEKS * 10, -}; +static struct shrinker *mmu_shrinker; static void mmu_destroy_caches(void) { @@ -6933,10 +6929,16 @@ int kvm_mmu_vendor_module_init(void) if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto out; - ret = register_shrinker(&mmu_shrinker, "x86-mmu"); - if (ret) + mmu_shrinker = shrinker_alloc(0, "x86-mmu"); + if (!mmu_shrinker) goto out_shrinker; + mmu_shrinker->count_objects = mmu_shrink_count; + mmu_shrinker->scan_objects = mmu_shrink_scan; + mmu_shrinker->seeks = DEFAULT_SEEKS * 10; + + shrinker_register(mmu_shrinker); + return 0; out_shrinker: @@ -6958,7 +6960,7 @@ void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); - unregister_shrinker(&mmu_shrinker); + shrinker_free(mmu_shrinker); } /* -- Gitee From 8b2ddd771f5eb39915d12d4c2c7ba12b08b20363 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:02 +0800 Subject: [PATCH 042/484] binder: dynamically allocate the android-binder shrinker commit 95a542da5322ef2db193a546d2d888cf4dbaa219 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the android-binder shrinker. 
Link: https://lkml.kernel.org/r/20230911094444.68966-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Carlos Llamas Cc: Greg Kroah-Hartman Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/android/binder_alloc.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 34c27223cb7d..a56cbfd9ba44 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1055,11 +1055,7 @@ binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) NULL, sc->nr_to_scan); } -static struct shrinker binder_shrinker = { - .count_objects = binder_shrink_count, - .scan_objects = binder_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *binder_shrinker; /** * binder_alloc_init() - called by binder_open() for per-proc initialization @@ -1079,19 +1075,29 @@ void binder_alloc_init(struct binder_alloc *alloc) int binder_alloc_shrinker_init(void) { - int ret = list_lru_init(&binder_alloc_lru); + int ret; - if (ret == 0) { - ret = register_shrinker(&binder_shrinker, "android-binder"); - if (ret) - list_lru_destroy(&binder_alloc_lru); + ret = list_lru_init(&binder_alloc_lru); + if (ret) + return ret; + + binder_shrinker = shrinker_alloc(0, "android-binder"); + if (!binder_shrinker) { + list_lru_destroy(&binder_alloc_lru); + return -ENOMEM; } - return ret; + + binder_shrinker->count_objects = binder_shrink_count; + binder_shrinker->scan_objects = binder_shrink_scan; + + shrinker_register(binder_shrinker); + + return 0; } void binder_alloc_shrinker_exit(void) { - unregister_shrinker(&binder_shrinker); + shrinker_free(binder_shrinker); list_lru_destroy(&binder_alloc_lru); } -- Gitee From e477cd3ee7fd68847d3c3f56a2d6568d1157954b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:03 +0800 Subject: [PATCH 043/484] drm/ttm: dynamically allocate the drm-ttm_pool shrinker commit d35b5c98c1f1ea44d4652a78163ce6d0e6ec2b78 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the drm-ttm_pool shrinker. 
Link: https://lkml.kernel.org/r/20230911094444.68966-5-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Daniel Vetter Cc: Christian Koenig Cc: Huang Rui Cc: David Airlie Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/gpu/drm/ttm/ttm_pool.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 08073d3bd5c2..6e1fd6985ffc 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -73,7 +73,7 @@ static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS]; static spinlock_t shrinker_lock; static struct list_head shrinker_list; -static struct shrinker mm_shrinker; +static struct shrinker *mm_shrinker; static DECLARE_RWSEM(pool_shrink_rwsem); /* Allocate pages of size 1 << order with the given gfp_flags */ @@ -767,8 +767,8 @@ static int ttm_pool_debugfs_shrink_show(struct seq_file *m, void *data) struct shrink_control sc = { .gfp_mask = GFP_NOFS }; fs_reclaim_acquire(GFP_KERNEL); - seq_printf(m, "%lu/%lu\n", ttm_pool_shrinker_count(&mm_shrinker, &sc), - ttm_pool_shrinker_scan(&mm_shrinker, &sc)); + seq_printf(m, "%lu/%lu\n", ttm_pool_shrinker_count(mm_shrinker, &sc), + ttm_pool_shrinker_scan(mm_shrinker, &sc)); fs_reclaim_release(GFP_KERNEL); return 0; @@ -812,10 +812,17 @@ int ttm_pool_mgr_init(unsigned long num_pages) &ttm_pool_debugfs_shrink_fops); #endif - mm_shrinker.count_objects = ttm_pool_shrinker_count; - mm_shrinker.scan_objects = ttm_pool_shrinker_scan; - mm_shrinker.seeks = 1; - return register_shrinker(&mm_shrinker, "drm-ttm_pool"); + mm_shrinker = shrinker_alloc(0, "drm-ttm_pool"); + if (!mm_shrinker) + return -ENOMEM; + + mm_shrinker->count_objects = ttm_pool_shrinker_count; + mm_shrinker->scan_objects = ttm_pool_shrinker_scan; + mm_shrinker->seeks = 1; + + shrinker_register(mm_shrinker); + + return 0; } /** @@ -835,6 +842,6 @@ void ttm_pool_mgr_fini(void) ttm_pool_type_fini(&global_dma32_uncached[i]); } - unregister_shrinker(&mm_shrinker); + shrinker_free(mm_shrinker); WARN_ON(!list_empty(&shrinker_list)); } -- Gitee From 7d5e4622fa71ecb4581483367e31c260f88c58d2 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:04 +0800 Subject: [PATCH 044/484] 
xenbus/backend: dynamically allocate the xen-backend shrinker commit 1ec016bfa10f2873c125018c4efcf09c141645ff upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the xen-backend shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-6-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Juergen Gross Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/xen/xenbus/xenbus_probe_backend.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index da96c260e26b..5ebb7233076f 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -284,13 +284,9 @@ static unsigned long backend_shrink_memory_count(struct shrinker *shrinker, return 0; } -static struct shrinker backend_memory_shrinker = { - .count_objects = backend_shrink_memory_count, - .seeks = DEFAULT_SEEKS, -}; - static int __init xenbus_probe_backend_init(void) { + struct shrinker *backend_memory_shrinker; static struct notifier_block xenstore_notifier = { .notifier_call = backend_probe_and_watch }; @@ -305,8 +301,15 @@ static int __init xenbus_probe_backend_init(void) register_xenstore_notifier(&xenstore_notifier); - if (register_shrinker(&backend_memory_shrinker, "xen-backend")) - pr_warn("shrinker registration failed\n"); + backend_memory_shrinker = shrinker_alloc(0, "xen-backend"); + if (!backend_memory_shrinker) { + pr_warn("shrinker allocation failed\n"); + return 0; + } + + backend_memory_shrinker->count_objects = backend_shrink_memory_count; + + shrinker_register(backend_memory_shrinker); return 0; } -- Gitee From 9024ba4273359db08d48d0ac5bfd208ab9c7da6a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:05 +0800 Subject: [PATCH 045/484] erofs: dynamically allocate the erofs-shrinker commit 557936ee8dc6bd0d5e078f415441d9f2a3fcca23 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the erofs-shrinker. 
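Two failure conventions show up around this point in the series. Where the caller treats the shrinker as required (erofs here, and most of the filesystems that follow), a NULL return from shrinker_alloc() becomes -ENOMEM and is propagated; where the shrinker is best effort (the xen-backend patch above, the RCU lazy/kfree patches later), the failure is logged and initialization carries on without it. A sketch of the second, optional style, with bar_ names as placeholders and the callbacks assumed to exist as in the earlier foo_ sketch:

static struct shrinker *bar_shrinker;

/* Callbacks as in the earlier sketch, defined elsewhere. */
static unsigned long bar_count(struct shrinker *s, struct shrink_control *sc);
static unsigned long bar_scan(struct shrinker *s, struct shrink_control *sc);

static int __init bar_init(void)
{
        bar_shrinker = shrinker_alloc(0, "bar");
        if (!bar_shrinker) {
                /* Not fatal: we only lose the memory-pressure feedback. */
                pr_warn("bar: shrinker allocation failed\n");
                return 0;
        }

        bar_shrinker->count_objects = bar_count;
        bar_shrinker->scan_objects = bar_scan;

        shrinker_register(bar_shrinker);
        return 0;
}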
Link: https://lkml.kernel.org/r/20230911094444.68966-7-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Chao Yu Reviewed-by: Gao Xiang Cc: Yue Hu Cc: Jeffle Xu Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/erofs/utils.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 4256a85719a1..5dea308764b4 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -264,19 +264,24 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, return freed; } -static struct shrinker erofs_shrinker_info = { - .scan_objects = erofs_shrink_scan, - .count_objects = erofs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *erofs_shrinker_info; int __init erofs_init_shrinker(void) { - return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); + erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); + if (!erofs_shrinker_info) + return -ENOMEM; + + erofs_shrinker_info->count_objects = erofs_shrink_count; + erofs_shrinker_info->scan_objects = erofs_shrink_scan; + + shrinker_register(erofs_shrinker_info); + + return 0; } void erofs_exit_shrinker(void) { - unregister_shrinker(&erofs_shrinker_info); + shrinker_free(erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */ -- Gitee From ae82162ca1c316912dddd68147500602694244da Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:06 +0800 Subject: [PATCH 046/484] f2fs: dynamically allocate the f2fs-shrinker commit bfcba5ba39cbce8957b4908c9dd570dd6097fff3 upstream Conflicts: resolved Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the f2fs-shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-8-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Chao Yu Cc: Jaegeuk Kim Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/f2fs/super.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 986178be8ca9..723e0e6431ab 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -93,11 +93,26 @@ int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, #endif /* f2fs-wide shrinker description */ -static struct shrinker f2fs_shrinker_info = { - .scan_objects = f2fs_shrink_scan, - .count_objects = f2fs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *f2fs_shrinker_info; + +static int __init f2fs_init_shrinker(void) +{ + f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker"); + if (!f2fs_shrinker_info) + return -ENOMEM; + + f2fs_shrinker_info->count_objects = f2fs_shrink_count; + f2fs_shrinker_info->scan_objects = f2fs_shrink_scan; + + shrinker_register(f2fs_shrinker_info); + + return 0; +} + +static void f2fs_exit_shrinker(void) +{ + shrinker_free(f2fs_shrinker_info); +} enum { Opt_gc_background, @@ -4987,7 +5002,7 @@ static int __init init_f2fs_fs(void) err = f2fs_init_sysfs(); if (err) goto free_garbage_collection_cache; - err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); + err = f2fs_init_shrinker(); if (err) goto free_sysfs; f2fs_create_root_stats(); @@ -5032,7 +5047,7 @@ static int __init init_f2fs_fs(void) f2fs_destroy_post_read_processing(); free_root_stats: f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_shrinker(); free_sysfs: f2fs_exit_sysfs(); free_garbage_collection_cache: @@ -5064,7 +5079,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_shrinker(); f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); -- Gitee From 0cfa833ee97a8b6cca336a4ff0470438d8f26c56 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:07 +0800 Subject: [PATCH 047/484] gfs2: dynamically allocate the gfs2-glock shrinker commit a304c23cd6c96ecdf5f96df66e809993d5e96bf0 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the gfs2-glock shrinker. 
Link: https://lkml.kernel.org/r/20230911094444.68966-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Bob Peterson Cc: Andreas Gruenbacher Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/gfs2/glock.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 687670075d22..60c27913ce3c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2085,11 +2085,7 @@ static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(atomic_read(&lru_count)); } -static struct shrinker glock_shrinker = { - .seeks = DEFAULT_SEEKS, - .count_objects = gfs2_glock_shrink_count, - .scan_objects = gfs2_glock_shrink_scan, -}; +static struct shrinker *glock_shrinker; /** * glock_hash_walk - Call a function for glock in a hash bucket @@ -2514,13 +2510,18 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - ret = register_shrinker(&glock_shrinker, "gfs2-glock"); - if (ret) { + glock_shrinker = shrinker_alloc(0, "gfs2-glock"); + if (!glock_shrinker) { destroy_workqueue(glock_workqueue); rhashtable_destroy(&gl_hash_table); - return ret; + return -ENOMEM; } + glock_shrinker->count_objects = gfs2_glock_shrink_count; + glock_shrinker->scan_objects = gfs2_glock_shrink_scan; + + shrinker_register(glock_shrinker); + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) init_waitqueue_head(glock_wait_table + i); @@ -2529,7 +2530,7 @@ int __init gfs2_glock_init(void) void gfs2_glock_exit(void) { - unregister_shrinker(&glock_shrinker); + shrinker_free(glock_shrinker); rhashtable_destroy(&gl_hash_table); destroy_workqueue(glock_workqueue); } -- Gitee From d6dd7af6afa04fc2cd202170d1be9c6c2dcb4950 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:08 +0800 Subject: [PATCH 048/484] gfs2: dynamically allocate the gfs2-qd shrinker commit 8ee0fd9c1085d952ea62c1879620eaa01e4c6332 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the gfs2-qd shrinker. 
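The other thing this patch shows is that behaviour flags, formerly a .flags initializer in the static struct, are now the first argument to shrinker_alloc(): SHRINKER_NUMA_AWARE here, SHRINKER_MEMCG_AWARE in the NFS xattr and mm-shadow patches, and combined flags for the THP deferred-split shrinker. A sketch with qux_ placeholders, assuming the callbacks wrap the list_lru helpers the way gfs2 does below:

static struct shrinker *qux_shrinker;

/* Assumed to wrap list_lru_shrink_count()/list_lru_shrink_walk(). */
static unsigned long qux_count(struct shrinker *s, struct shrink_control *sc);
static unsigned long qux_scan(struct shrinker *s, struct shrink_control *sc);

static int __init qux_shrinker_init(void)
{
        /*
         * With SHRINKER_NUMA_AWARE, reclaim invokes the callbacks once per
         * node and passes the node in sc->nid; the list_lru helpers take
         * that straight from the shrink_control.
         */
        qux_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "qux");
        if (!qux_shrinker)
                return -ENOMEM;

        qux_shrinker->count_objects = qux_count;
        qux_shrinker->scan_objects = qux_scan;

        shrinker_register(qux_shrinker);
        return 0;
}

The patch also wraps this in gfs2_qd_shrinker_init()/gfs2_qd_shrinker_exit() and exports those from quota.h in place of the old extern struct shrinker declaration, since the object no longer exists at compile time.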
Link: https://lkml.kernel.org/r/20230911094444.68966-10-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Bob Peterson Cc: Andreas Gruenbacher Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/gfs2/main.c | 6 +++--- fs/gfs2/quota.c | 25 +++++++++++++++++++------ fs/gfs2/quota.h | 3 ++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 66eb98b690a2..79be0cdc730c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_trans_cachep) goto fail_cachep8; - error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); + error = gfs2_qd_shrinker_init(); if (error) goto fail_shrinker; @@ -196,7 +196,7 @@ static int __init init_gfs2_fs(void) fail_wq2: destroy_workqueue(gfs2_recovery_wq); fail_wq1: - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); fail_cachep8: @@ -229,7 +229,7 @@ static int __init init_gfs2_fs(void) static void __exit exit_gfs2_fs(void) { - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c537e1d02cf3..ae317d1951e7 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -193,13 +193,26 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } -struct shrinker gfs2_qd_shrinker = { - .count_objects = gfs2_qd_shrink_count, - .scan_objects = gfs2_qd_shrink_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, -}; +static struct shrinker *gfs2_qd_shrinker; + +int __init gfs2_qd_shrinker_init(void) +{ + gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd"); + if (!gfs2_qd_shrinker) + return -ENOMEM; + + gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count; + gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan; + + shrinker_register(gfs2_qd_shrinker); + return 0; +} + +void gfs2_qd_shrinker_exit(void) +{ + shrinker_free(gfs2_qd_shrinker); +} static u64 qd2index(struct gfs2_quota_data *qd) { diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 
e4a2fdb552cd..b98164c94a34 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -60,7 +60,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, } extern const struct quotactl_ops gfs2_quotactl_ops; -extern struct shrinker gfs2_qd_shrinker; +int __init gfs2_qd_shrinker_init(void); +void gfs2_qd_shrinker_exit(void); extern struct list_lru gfs2_qd_lru; void __init gfs2_quota_hash_init(void); -- Gitee From 174ca9a2eeb92c1e7c0b5f909ce64d1cd3939a9f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:09 +0800 Subject: [PATCH 049/484] NFSv4.2: dynamically allocate the nfs-xattr shrinkers commit d5dad4929fb42e7f610cda5e1ea51cd76be998d4 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the nfs-xattr shrinkers. Link: https://lkml.kernel.org/r/20230911094444.68966-11-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Trond Myklebust Cc: Anna Schumaker Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/nfs/nfs42xattr.c | 87 +++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 911f634ba3da..2ad66a8922f4 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -796,28 +796,9 @@ static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc); -static struct shrinker nfs4_xattr_cache_shrinker = { - .count_objects = nfs4_xattr_cache_count, - .scan_objects = nfs4_xattr_cache_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = DEFAULT_SEEKS, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_large_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = 1, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; +static struct shrinker *nfs4_xattr_cache_shrinker; +static struct shrinker *nfs4_xattr_entry_shrinker; +static struct shrinker *nfs4_xattr_large_entry_shrinker; static enum lru_status cache_lru_isolate(struct list_head *item, @@ -943,7 +924,7 @@ nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) struct nfs4_xattr_entry *entry; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); @@ -971,7 +952,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? 
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; count = list_lru_shrink_count(lru, sc); @@ -991,18 +972,34 @@ static void nfs4_xattr_cache_init_once(void *p) INIT_LIST_HEAD(&cache->dispose); } -static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, - struct list_lru *lru, const char *name) +typedef unsigned long (*count_objects_cb)(struct shrinker *s, + struct shrink_control *sc); +typedef unsigned long (*scan_objects_cb)(struct shrinker *s, + struct shrink_control *sc); + +static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker, + struct list_lru *lru, const char *name, + count_objects_cb count, + scan_objects_cb scan, long batch, int seeks) { - int ret = 0; + int ret; - ret = register_shrinker(shrinker, name); - if (ret) + *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name); + if (!*shrinker) + return -ENOMEM; + + ret = list_lru_init_memcg(lru, *shrinker); + if (ret) { + shrinker_free(*shrinker); return ret; + } - ret = list_lru_init_memcg(lru, shrinker); - if (ret) - unregister_shrinker(shrinker); + (*shrinker)->count_objects = count; + (*shrinker)->scan_objects = scan; + (*shrinker)->batch = batch; + (*shrinker)->seeks = seeks; + + shrinker_register(*shrinker); return ret; } @@ -1010,7 +1007,7 @@ static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker, struct list_lru *lru) { - unregister_shrinker(shrinker); + shrinker_free(shrinker); list_lru_destroy(lru); } @@ -1026,27 +1023,31 @@ int __init nfs4_xattr_cache_init(void) return -ENOMEM; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker, - &nfs4_xattr_cache_lru, - "nfs-xattr_cache"); + &nfs4_xattr_cache_lru, "nfs-xattr_cache", + nfs4_xattr_cache_count, + nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS); if (ret) goto out1; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker, - &nfs4_xattr_entry_lru, - "nfs-xattr_entry"); + &nfs4_xattr_entry_lru, "nfs-xattr_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS); if (ret) goto out2; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru, - "nfs-xattr_large_entry"); + "nfs-xattr_large_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, 1); if (!ret) return 0; - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); out2: - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); out1: kmem_cache_destroy(nfs4_xattr_cache_cachep); @@ -1056,11 +1057,11 @@ int __init nfs4_xattr_cache_init(void) void nfs4_xattr_cache_exit(void) { - nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); kmem_cache_destroy(nfs4_xattr_cache_cachep); } -- Gitee From e9f61a329398b23b7cc9706d70f62c3f70250572 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:10 +0800 Subject: [PATCH 050/484] nfs: dynamically allocate the nfs-acl shrinker commit 777fc8f1b4b9a0c5a558f1ab49ddfa5137bfaeb3 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the 
nfs-acl shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-12-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Trond Myklebust Cc: Anna Schumaker Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/nfs/super.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4e72ee57fc8f..9c02e6b14108 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -130,11 +130,7 @@ static void nfs_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static struct shrinker acl_shrinker = { - .count_objects = nfs_access_cache_count, - .scan_objects = nfs_access_cache_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *acl_shrinker; /* * Register the NFS filesystems @@ -154,9 +150,18 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker, "nfs-acl"); - if (ret < 0) + + acl_shrinker = shrinker_alloc(0, "nfs-acl"); + if (!acl_shrinker) { + ret = -ENOMEM; goto error_3; + } + + acl_shrinker->count_objects = nfs_access_cache_count; + acl_shrinker->scan_objects = nfs_access_cache_scan; + + shrinker_register(acl_shrinker); + #ifdef CONFIG_NFS_V4_2 nfs_ssc_register_ops(); #endif @@ -176,7 +181,7 @@ int __init register_nfs_fs(void) */ void __exit unregister_nfs_fs(void) { - unregister_shrinker(&acl_shrinker); + shrinker_free(acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); #ifdef CONFIG_NFS_V4_2 -- Gitee From 734190085664978d4ffaeed3d9af701a88120595 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:11 +0800 Subject: [PATCH 051/484] nfsd: dynamically allocate the nfsd-filecache shrinker commit 856e594965414a1cc63d0f3e709ecb6869bc685e upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the nfsd-filecache shrinker. 
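A detail worth noting here: shrinker_free() is the one teardown call for every state the shrinker can be in. The NFS xattr patch above calls it on a shrinker that is allocated but not yet registered (when list_lru_init_memcg() fails), and the error and shutdown paths below call it on one that is already registered. That keeps error unwinding a plain goto chain; schematically, with baz_ placeholders and baz_register_notifier() standing in for whatever later init step can still fail:

static struct shrinker *baz_shrinker;

/* Callbacks and the later init step are assumed to exist elsewhere. */
static unsigned long baz_count(struct shrinker *s, struct shrink_control *sc);
static unsigned long baz_scan(struct shrinker *s, struct shrink_control *sc);
int baz_register_notifier(void);

static int __init baz_init(void)
{
        int ret;

        baz_shrinker = shrinker_alloc(0, "baz");
        if (!baz_shrinker)
                return -ENOMEM;

        baz_shrinker->count_objects = baz_count;
        baz_shrinker->scan_objects = baz_scan;
        shrinker_register(baz_shrinker);

        ret = baz_register_notifier();
        if (ret)
                goto out_shrinker;

        return 0;

out_shrinker:
        /* Fine on a registered shrinker: unregisters and frees it. */
        shrinker_free(baz_shrinker);
        return ret;
}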
Link: https://lkml.kernel.org/r/20230911094444.68966-13-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Jeff Layton Cc: Neil Brown Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/nfsd/filecache.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 0ff07d53931f..2419cf330210 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -523,11 +523,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) return ret; } -static struct shrinker nfsd_file_shrinker = { - .scan_objects = nfsd_file_lru_scan, - .count_objects = nfsd_file_lru_count, - .seeks = 1, -}; +static struct shrinker *nfsd_file_shrinker; /** * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file @@ -748,12 +744,19 @@ nfsd_file_cache_init(void) goto out_err; } - ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache"); - if (ret) { - pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache"); + if (!nfsd_file_shrinker) { + ret = -ENOMEM; + pr_err("nfsd: failed to allocate nfsd_file_shrinker\n"); goto out_lru; } + nfsd_file_shrinker->count_objects = nfsd_file_lru_count; + nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan; + nfsd_file_shrinker->seeks = 1; + + shrinker_register(nfsd_file_shrinker); + ret = lease_register_notifier(&nfsd_file_lease_notifier); if (ret) { pr_err("nfsd: unable to register lease notifier: %d\n", ret); @@ -778,7 +781,7 @@ nfsd_file_cache_init(void) out_notifier: lease_unregister_notifier(&nfsd_file_lease_notifier); out_shrinker: - unregister_shrinker(&nfsd_file_shrinker); + shrinker_free(nfsd_file_shrinker); out_lru: list_lru_destroy(&nfsd_file_lru); out_err: @@ -895,7 +898,7 @@ nfsd_file_cache_shutdown(void) return; lease_unregister_notifier(&nfsd_file_lease_notifier); - unregister_shrinker(&nfsd_file_shrinker); + shrinker_free(nfsd_file_shrinker); /* * make sure all callers of nfsd_file_lru_cb are done before * calling nfsd_file_cache_purge -- Gitee From f3df526d234bf28fae4004ce779898ea869f86be Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:12 +0800 Subject: [PATCH 052/484] quota: 
dynamically allocate the dquota-cache shrinker commit eab477e883b52a82bd7b4437111bbc6264694349 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the dquota-cache shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-14-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Jan Kara Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/quota/dquot.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 67562c78e57d..cd6dec2f9834 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -807,12 +807,6 @@ dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } -static struct shrinker dqcache_shrinker = { - .count_objects = dqcache_shrink_count, - .scan_objects = dqcache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - /* * Safely release dquot and put reference to dquot. */ @@ -3010,6 +3004,7 @@ static int __init dquot_init(void) { int i, ret; unsigned long nr_hash, order; + struct shrinker *dqcache_shrinker; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); @@ -3044,8 +3039,14 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - if (register_shrinker(&dqcache_shrinker, "dquota-cache")) - panic("Cannot register dquot shrinker"); + dqcache_shrinker = shrinker_alloc(0, "dquota-cache"); + if (!dqcache_shrinker) + panic("Cannot allocate dquot shrinker"); + + dqcache_shrinker->count_objects = dqcache_shrink_count; + dqcache_shrinker->scan_objects = dqcache_shrink_scan; + + shrinker_register(dqcache_shrinker); return 0; } -- Gitee From e391e7c746c10a78a0f86e6a421389e8bd7d19f0 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:13 +0800 Subject: [PATCH 053/484] ubifs: dynamically allocate the ubifs-slab shrinker commit 827a34f90e5a27b83f188b914512d81e36688dec upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the ubifs-slab shrinker. 
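Another plain conversion. For reference, the only members touched between shrinker_alloc() and shrinker_register() anywhere in this series are the two callbacks, ->seeks, ->batch and (for i915) ->private_data. A composite fragment of the tuning values used elsewhere (seeks = 1 as in nfsd above, batch = 512 as in the NFS xattr patch), with quux_ names as placeholders and the callbacks assumed defined as in the earlier sketch:

        quux_shrinker = shrinker_alloc(0, "quux");
        if (!quux_shrinker)
                return -ENOMEM;

        quux_shrinker->count_objects = quux_count;
        quux_shrinker->scan_objects = quux_scan;
        /*
         * Lower than DEFAULT_SEEKS: objects are cheap to recreate, so
         * reclaim presses on this cache harder.
         */
        quux_shrinker->seeks = 1;
        /* Scan in chunks of 512 instead of the default SHRINK_BATCH (128). */
        quux_shrinker->batch = 512;

        shrinker_register(quux_shrinker);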
Link: https://lkml.kernel.org/r/20230911094444.68966-15-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Richard Weinberger Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/ubifs/super.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 3409488d39ba..8a83421cddba 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -54,11 +54,7 @@ module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_vers static struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ -static struct shrinker ubifs_shrinker_info = { - .scan_objects = ubifs_shrink_scan, - .count_objects = ubifs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *ubifs_shrinker_info; /** * validate_inode - validate inode. 
@@ -2373,7 +2369,7 @@ static void inode_slab_ctor(void *obj) static int __init ubifs_init(void) { - int err; + int err = -ENOMEM; BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); @@ -2439,10 +2435,15 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab"); - if (err) + ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab"); + if (!ubifs_shrinker_info) goto out_slab; + ubifs_shrinker_info->count_objects = ubifs_shrink_count; + ubifs_shrinker_info->scan_objects = ubifs_shrink_scan; + + shrinker_register(ubifs_shrinker_info); + err = ubifs_compressors_init(); if (err) goto out_shrinker; @@ -2467,7 +2468,7 @@ static int __init ubifs_init(void) dbg_debugfs_exit(); ubifs_compressors_exit(); out_shrinker: - unregister_shrinker(&ubifs_shrinker_info); + shrinker_free(ubifs_shrinker_info); out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; @@ -2483,7 +2484,7 @@ static void __exit ubifs_exit(void) dbg_debugfs_exit(); ubifs_sysfs_exit(); ubifs_compressors_exit(); - unregister_shrinker(&ubifs_shrinker_info); + shrinker_free(ubifs_shrinker_info); /* * Make sure all delayed rcu free inodes are flushed before we -- Gitee From 76b11a0a7a7a7a818fe194be2029c6e3e5a7c06f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:14 +0800 Subject: [PATCH 054/484] rcu: dynamically allocate the rcu-lazy shrinker commit 2fbacff0cbf58530fa459c3f28ba9125df33ad86 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the rcu-lazy shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-16-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Joel Fernandes (Google) Acked-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- kernel/rcu/tree_nocb.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 8993b2322be2..85c3efb5392e 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1384,13 +1384,6 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return count ? 
count : SHRINK_STOP; } - -static struct shrinker lazy_rcu_shrinker = { - .count_objects = lazy_rcu_shrink_count, - .scan_objects = lazy_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; #endif // #ifdef CONFIG_RCU_LAZY void __init rcu_init_nohz(void) @@ -1398,6 +1391,7 @@ void __init rcu_init_nohz(void) int cpu; struct rcu_data *rdp; const struct cpumask *cpumask = NULL; + struct shrinker * __maybe_unused lazy_rcu_shrinker; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) @@ -1424,8 +1418,15 @@ void __init rcu_init_nohz(void) return; #ifdef CONFIG_RCU_LAZY - if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy")) - pr_err("Failed to register lazy_rcu shrinker!\n"); + lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy"); + if (!lazy_rcu_shrinker) { + pr_err("Failed to allocate lazy_rcu shrinker!\n"); + } else { + lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count; + lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan; + + shrinker_register(lazy_rcu_shrinker); + } #endif // #ifdef CONFIG_RCU_LAZY if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { -- Gitee From a99509df26206a3571eda453ce40b2eb2b33d405 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:15 +0800 Subject: [PATCH 055/484] rcu: dynamically allocate the rcu-kfree shrinker commit 21e0b932fb5d72bb6cefdf56e22bfddea968cf32 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the rcu-kfree shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-17-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Joel Fernandes (Google) Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- kernel/rcu/tree.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 536acebf22b0..cd42311e30ee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3537,13 +3537,6 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return freed == 0 ? 
SHRINK_STOP : freed; } -static struct shrinker kfree_rcu_shrinker = { - .count_objects = kfree_rcu_shrink_count, - .scan_objects = kfree_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; - void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -5034,6 +5027,7 @@ static void __init kfree_rcu_batch_init(void) { int cpu; int i, j; + struct shrinker *kfree_rcu_shrinker; /* Clamp it to [0:100] seconds interval. */ if (rcu_delay_page_cache_fill_msec < 0 || @@ -5065,8 +5059,17 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) - pr_err("Failed to register kfree_rcu() shrinker!\n"); + + kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree"); + if (!kfree_rcu_shrinker) { + pr_err("Failed to allocate kfree_rcu() shrinker!\n"); + return; + } + + kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; + kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; + + shrinker_register(kfree_rcu_shrinker); } void __init rcu_init(void) -- Gitee From ac3298c114226df72d3bb22a80634674b9dfcebb Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:16 +0800 Subject: [PATCH 056/484] mm: thp: dynamically allocate the thp-related shrinkers commit 54d917295b8366545e6ee940c93dffe1452e6480 upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the thp-zero and thp-deferred_split shrinkers. Link: https://lkml.kernel.org/r/20230911094444.68966-18-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 67 +++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1db33db4a2b9..0b026daa1c93 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -66,7 +66,11 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<count_objects = shrink_huge_zero_page_count; + huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; + shrinker_register(huge_zero_page_shrinker); + + deferred_split_shrinker->count_objects = deferred_split_count; + deferred_split_shrinker->scan_objects = deferred_split_scan; + shrinker_register(deferred_split_shrinker); + + return 0; +} + +static void __init thp_shrinker_exit(void) +{ + shrinker_free(huge_zero_page_shrinker); + shrinker_free(deferred_split_shrinker); +} + static int __init hugepage_init(void) { int err; @@ -751,12 +783,9 @@ static int __init hugepage_init(void) if (err) goto err_slab; - err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); - if (err) - goto err_hzp_shrinker; - err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); + err = thp_shrinker_init(); if (err) - goto err_split_shrinker; + goto err_shrinker; /* * By default disable transparent hugepages on smaller systems, @@ -774,10 +803,8 @@ static int __init hugepage_init(void) return 0; err_khugepaged: - unregister_shrinker(&deferred_split_shrinker); -err_split_shrinker: - unregister_shrinker(&huge_zero_page_shrinker); -err_hzp_shrinker: + thp_shrinker_exit(); +err_shrinker: khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); @@ -3165,7 +3192,7 @@ void deferred_split_folio(struct folio *folio) #ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), - deferred_split_shrinker.id); + deferred_split_shrinker->id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); @@ -3239,14 +3266,6 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, return split; } -static struct shrinker deferred_split_shrinker = { - .count_objects = deferred_split_count, - .scan_objects = deferred_split_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | - SHRINKER_NONSLAB, -}; - #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { -- Gitee From 488ec72d8dfe53afba3cfafa0f8bc889c4adcb92 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:17 +0800 Subject: [PATCH 057/484] sunrpc: dynamically allocate the sunrpc_cred shrinker commit abe0c269f9820c637223ad88d77d174e8d85c58f upstream Conflicts: none Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the sunrpc_cred shrinker. 
Link: https://lkml.kernel.org/r/20230911094444.68966-19-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Jeff Layton Cc: Neil Brown Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Trond Myklebust Cc: Anna Schumaker Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- net/sunrpc/auth.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index ec41b26af76e..04534ea537c8 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -863,11 +863,7 @@ rpcauth_uptodatecred(struct rpc_task *task) test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; } -static struct shrinker rpc_cred_shrinker = { - .count_objects = rpcauth_cache_shrink_count, - .scan_objects = rpcauth_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *rpc_cred_shrinker; int __init rpcauth_init_module(void) { @@ -876,9 +872,17 @@ int __init rpcauth_init_module(void) err = rpc_init_authunix(); if (err < 0) goto out1; - err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred"); - if (err < 0) + rpc_cred_shrinker = shrinker_alloc(0, "sunrpc_cred"); + if (!rpc_cred_shrinker) { + err = -ENOMEM; goto out2; + } + + rpc_cred_shrinker->count_objects = rpcauth_cache_shrink_count; + rpc_cred_shrinker->scan_objects = rpcauth_cache_shrink_scan; + + shrinker_register(rpc_cred_shrinker); + return 0; out2: rpc_destroy_authunix(); @@ -889,5 +893,5 @@ int __init rpcauth_init_module(void) void rpcauth_remove_module(void) { rpc_destroy_authunix(); - unregister_shrinker(&rpc_cred_shrinker); + shrinker_free(rpc_cred_shrinker); } -- Gitee From b026f17b3a8984b317f7b0e0b48c42f339675fc1 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:18 +0800 Subject: [PATCH 058/484] mm: workingset: dynamically allocate the mm-shadow shrinker commit 219c666eb2854bb9ca94d04a53ceb14aaf81cb28 upstream Conflicts: resolved Backport-reason: Shrinker lockless Use new APIs to dynamically allocate the mm-shadow shrinker. 
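This conversion also shows how the old two-step prealloc_shrinker() + register_shrinker_prepared() maps onto the new API: allocate first, because the memcg-aware list_lru has to be initialised against the shrinker (so that adding an object to a per-memcg list can flag the shrinker via its id, as the THP patch above does explicitly with set_shrinker_bit()), and register last, once everything the callbacks will scan exists. A generic sketch of that ordering, with shadowish_ names invented for illustration:

#include <linux/list_lru.h>
#include <linux/shrinker.h>

static struct shrinker *shadowish_shrinker;
static struct list_lru shadowish_lru;

/* Assumed to count/walk shadowish_lru, as the real callbacks do below. */
static unsigned long shadowish_count(struct shrinker *s, struct shrink_control *sc);
static unsigned long shadowish_scan(struct shrinker *s, struct shrink_control *sc);

static int __init shadowish_init(void)
{
        int ret;

        /* Allocate first: the memcg-aware lru is tied to the shrinker. */
        shadowish_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
                                            SHRINKER_MEMCG_AWARE,
                                            "mm-shadowish");
        if (!shadowish_shrinker)
                return -ENOMEM;

        ret = list_lru_init_memcg(&shadowish_lru, shadowish_shrinker);
        if (ret) {
                shrinker_free(shadowish_shrinker);
                return ret;
        }

        shadowish_shrinker->count_objects = shadowish_count;
        shadowish_shrinker->scan_objects = shadowish_scan;
        shadowish_shrinker->seeks = 0;  /* entries are fully expendable */

        /* Register last, once everything the callbacks touch is set up. */
        shrinker_register(shadowish_shrinker);
        return 0;
}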
Link: https://lkml.kernel.org/r/20230911094444.68966-20-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/workingset.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index a2460d8889e1..4a5e6054750e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -922,13 +922,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, NULL); } -static struct shrinker workingset_shadow_shrinker = { - .count_objects = count_shadow_nodes, - .scan_objects = scan_shadow_nodes, - .seeks = 0, /* ->count reports only fully expendable nodes */ - .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, -}; - /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. 
@@ -937,8 +930,9 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + struct shrinker *workingset_shadow_shrinker; unsigned int max_order; - int ret; + int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* @@ -960,17 +954,26 @@ static int __init workingset_init(void) LRU_GEN_EVICTION_BITS, lru_gen_bucket_order); #endif - ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow"); - if (ret) + workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | + SHRINKER_MEMCG_AWARE, + "mm-shadow"); + if (!workingset_shadow_shrinker) goto err; + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, - &workingset_shadow_shrinker); + workingset_shadow_shrinker); if (ret) goto err_list_lru; - register_shrinker_prepared(&workingset_shadow_shrinker); + + workingset_shadow_shrinker->count_objects = count_shadow_nodes; + workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; + /* ->count reports only fully expendable nodes */ + workingset_shadow_shrinker->seeks = 0; + + shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: - free_prealloced_shrinker(&workingset_shadow_shrinker); + shrinker_free(workingset_shadow_shrinker); err: return ret; } -- Gitee From 5183b3d6a2fe9df122a385fa285a624274cc87da Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:19 +0800 Subject: [PATCH 059/484] drm/i915: dynamically allocate the i915_gem_mm shrinker commit 583cc9e41095292e2ebf33c977d8ba1e64308892 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the i915_gem_mm shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct drm_i915_private. Link: https://lkml.kernel.org/r/20230911094444.68966-21-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Daniel Vetter Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Tvrtko Ursulin Cc: David Airlie Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/gpu/drm/i915/gem/i915_gem_shrinker.c | 29 +++++++++++--------- drivers/gpu/drm/i915/i915_drv.h | 2 +- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c index 214763942aa2..e07ffbd9eab3 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c @@ -284,8 +284,7 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *i915) static unsigned long i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct drm_i915_private *i915 = - container_of(shrinker, struct drm_i915_private, mm.shrinker); + struct drm_i915_private *i915 = shrinker->private_data; unsigned long num_objects; unsigned long count; @@ -302,8 +301,8 @@ i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) if (num_objects) { unsigned long avg = 2 * count / num_objects; - i915->mm.shrinker.batch = - max((i915->mm.shrinker.batch + avg) >> 1, + i915->mm.shrinker->batch = + max((i915->mm.shrinker->batch + avg) >> 1, 128ul /* default SHRINK_BATCH */); } @@ -313,8 +312,7 @@ i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) static unsigned long i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct drm_i915_private *i915 = - container_of(shrinker, struct drm_i915_private, mm.shrinker); + struct drm_i915_private *i915 = shrinker->private_data; unsigned long freed; sc->nr_scanned = 0; @@ -422,12 +420,17 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr void i915_gem_driver_register__shrinker(struct drm_i915_private *i915) { - i915->mm.shrinker.scan_objects = i915_gem_shrinker_scan; - i915->mm.shrinker.count_objects = i915_gem_shrinker_count; - i915->mm.shrinker.seeks = DEFAULT_SEEKS; - i915->mm.shrinker.batch = 4096; - drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker, - "drm-i915_gem")); + i915->mm.shrinker = shrinker_alloc(0, "drm-i915_gem"); + if (!i915->mm.shrinker) { + drm_WARN_ON(&i915->drm, 1); + } else { + i915->mm.shrinker->scan_objects = i915_gem_shrinker_scan; + i915->mm.shrinker->count_objects = i915_gem_shrinker_count; + i915->mm.shrinker->batch = 4096; + i915->mm.shrinker->private_data = i915; + + shrinker_register(i915->mm.shrinker); + } i915->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom; drm_WARN_ON(&i915->drm, register_oom_notifier(&i915->mm.oom_notifier)); @@ -443,7 +446,7 @@ void i915_gem_driver_unregister__shrinker(struct drm_i915_private *i915) unregister_vmap_purge_notifier(&i915->mm.vmap_notifier)); drm_WARN_ON(&i915->drm, unregister_oom_notifier(&i915->mm.oom_notifier)); - unregister_shrinker(&i915->mm.shrinker); + shrinker_free(i915->mm.shrinker); } void i915_gem_shrinker_taints_mutex(struct drm_i915_private *i915, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e0e0493d6c1f..6bf439717de4 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -163,7 +163,7 @@ struct i915_gem_mm { struct notifier_block oom_notifier; struct notifier_block vmap_notifier; - 
struct shrinker shrinker; + struct shrinker *shrinker; #ifdef CONFIG_MMU_NOTIFIER /** -- Gitee From de09e4209839250b11d87e99d08c35f2b4881863 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:20 +0800 Subject: [PATCH 060/484] drm/msm: dynamically allocate the drm-msm_gem shrinker commit cd61a76c210afd324c345ff62d950e2ff7947f11 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the drm-msm_gem shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct msm_drm_private. Link: https://lkml.kernel.org/r/20230911094444.68966-22-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Daniel Vetter Cc: Rob Clark Cc: Abhinav Kumar Cc: Dmitry Baryshkov Cc: Sean Paul Cc: Marijn Suijten Cc: David Airlie Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: David Sterba Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/gpu/drm/msm/msm_drv.c | 4 +++- drivers/gpu/drm/msm/msm_drv.h | 4 ++-- drivers/gpu/drm/msm/msm_gem_shrinker.c | 33 ++++++++++++++++---------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 4bd028fa7500..7f20249d6071 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -462,7 +462,9 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) if (ret) goto err_msm_uninit; - msm_gem_shrinker_init(ddev); + ret = msm_gem_shrinker_init(ddev); + if (ret) + goto err_msm_uninit; if (priv->kms_init) { ret = priv->kms_init(ddev); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 223bf904235a..c3c0aef6a5cb 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -209,7 +209,7 @@ struct msm_drm_private { } vram; struct notifier_block vmap_notifier; - struct shrinker shrinker; + struct shrinker *shrinker; struct drm_atomic_state *pm_state; @@ -271,7 +271,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, unsigned long msm_gem_shrinker_shrink(struct drm_device *dev, unsigned long nr_to_scan); #endif -void msm_gem_shrinker_init(struct drm_device *dev); +int msm_gem_shrinker_init(struct drm_device *dev); void msm_gem_shrinker_cleanup(struct drm_device *dev); struct sg_table *msm_gem_prime_get_sg_table(struct drm_gem_object *obj); diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index 0641f5bb8649..07ca4ddfe4e3 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -34,8 +34,7 @@ static bool can_block(struct shrink_control *sc) static unsigned long msm_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct msm_drm_private *priv = - container_of(shrinker, struct msm_drm_private, shrinker); + struct msm_drm_private *priv = shrinker->private_data; unsigned count = priv->lru.dontneed.count; if (can_swap()) @@ -100,8 +99,7 @@ active_evict(struct drm_gem_object *obj) static unsigned long msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct msm_drm_private *priv = - container_of(shrinker, struct msm_drm_private, shrinker); + struct msm_drm_private *priv = shrinker->private_data; struct { struct drm_gem_lru *lru; bool (*shrink)(struct drm_gem_object *obj); @@ -148,10 +146,11 @@ msm_gem_shrinker_shrink(struct drm_device *dev, unsigned long nr_to_scan) struct shrink_control sc = { .nr_to_scan = nr_to_scan, }; - int ret; + unsigned long ret = SHRINK_STOP; fs_reclaim_acquire(GFP_KERNEL); - ret = msm_gem_shrinker_scan(&priv->shrinker, &sc); + if (priv->shrinker) + ret = msm_gem_shrinker_scan(priv->shrinker, &sc); fs_reclaim_release(GFP_KERNEL); return ret; @@ -210,16 +209,24 @@ msm_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr) * * This function registers and sets up the msm shrinker. 
*/ -void msm_gem_shrinker_init(struct drm_device *dev) +int msm_gem_shrinker_init(struct drm_device *dev) { struct msm_drm_private *priv = dev->dev_private; - priv->shrinker.count_objects = msm_gem_shrinker_count; - priv->shrinker.scan_objects = msm_gem_shrinker_scan; - priv->shrinker.seeks = DEFAULT_SEEKS; - WARN_ON(register_shrinker(&priv->shrinker, "drm-msm_gem")); + + priv->shrinker = shrinker_alloc(0, "drm-msm_gem"); + if (!priv->shrinker) + return -ENOMEM; + + priv->shrinker->count_objects = msm_gem_shrinker_count; + priv->shrinker->scan_objects = msm_gem_shrinker_scan; + priv->shrinker->private_data = priv; + + shrinker_register(priv->shrinker); priv->vmap_notifier.notifier_call = msm_gem_shrinker_vmap; WARN_ON(register_vmap_purge_notifier(&priv->vmap_notifier)); + + return 0; } /** @@ -232,8 +239,8 @@ void msm_gem_shrinker_cleanup(struct drm_device *dev) { struct msm_drm_private *priv = dev->dev_private; - if (priv->shrinker.nr_deferred) { + if (priv->shrinker) { WARN_ON(unregister_vmap_purge_notifier(&priv->vmap_notifier)); - unregister_shrinker(&priv->shrinker); + shrinker_free(priv->shrinker); } } -- Gitee From d94eee78456aca8a27ac51d8602afca26c839d3d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:21 +0800 Subject: [PATCH 061/484] drm/panfrost: dynamically allocate the drm-panfrost shrinker commit e11c4f3acbbb776992425867df894b8c0f131f50 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the drm-panfrost shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct panfrost_device. Link: https://lkml.kernel.org/r/20230911094444.68966-23-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Steven Price Acked-by: Daniel Vetter Cc: Rob Herring Cc: Tomeu Vizoso Cc: Alyssa Rosenzweig Cc: David Airlie Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/gpu/drm/panfrost/panfrost_device.h | 2 +- drivers/gpu/drm/panfrost/panfrost_drv.c | 6 +++- drivers/gpu/drm/panfrost/panfrost_gem.h | 2 +- .../gpu/drm/panfrost/panfrost_gem_shrinker.c | 29 +++++++++++-------- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index b0126b9fbadc..e667e5689353 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -118,7 +118,7 @@ struct panfrost_device { struct mutex shrinker_lock; struct list_head shrinker_list; - struct shrinker shrinker; + struct shrinker *shrinker; struct panfrost_devfreq pfdevfreq; }; diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index ddcc8259061b..c17cf7f578af 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -601,10 +601,14 @@ static int panfrost_probe(struct platform_device *pdev) if (err < 0) goto err_out1; - panfrost_gem_shrinker_init(ddev); + err = panfrost_gem_shrinker_init(ddev); + if (err) + goto err_out2; return 0; +err_out2: + drm_dev_unregister(ddev); err_out1: pm_runtime_disable(pfdev->dev); panfrost_device_fini(pfdev); diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.h b/drivers/gpu/drm/panfrost/panfrost_gem.h index ad2877eeeccd..863d2ec8d4f0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.h +++ b/drivers/gpu/drm/panfrost/panfrost_gem.h @@ -81,7 +81,7 @@ panfrost_gem_mapping_get(struct panfrost_gem_object *bo, void panfrost_gem_mapping_put(struct panfrost_gem_mapping *mapping); void panfrost_gem_teardown_mappings_locked(struct panfrost_gem_object *bo); -void panfrost_gem_shrinker_init(struct drm_device *dev); +int panfrost_gem_shrinker_init(struct drm_device *dev); void panfrost_gem_shrinker_cleanup(struct drm_device *dev); #endif /* __PANFROST_GEM_H__ */ diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c index 6a71a2555f85..3d9f51bd48b6 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c @@ -18,8 +18,7 @@ static unsigned long panfrost_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct panfrost_device *pfdev = - container_of(shrinker, struct panfrost_device, shrinker); + struct panfrost_device *pfdev = shrinker->private_data; struct drm_gem_shmem_object *shmem; unsigned long count = 0; @@ -65,8 +64,7 @@ static bool panfrost_gem_purge(struct drm_gem_object *obj) static unsigned long panfrost_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct panfrost_device *pfdev = - container_of(shrinker, struct panfrost_device, shrinker); + struct panfrost_device *pfdev = shrinker->private_data; struct drm_gem_shmem_object *shmem, *tmp; unsigned long freed = 0; @@ -97,13 +95,21 @@ panfrost_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) * * This function registers and sets up the panfrost shrinker. 
*/ -void panfrost_gem_shrinker_init(struct drm_device *dev) +int panfrost_gem_shrinker_init(struct drm_device *dev) { struct panfrost_device *pfdev = dev->dev_private; - pfdev->shrinker.count_objects = panfrost_gem_shrinker_count; - pfdev->shrinker.scan_objects = panfrost_gem_shrinker_scan; - pfdev->shrinker.seeks = DEFAULT_SEEKS; - WARN_ON(register_shrinker(&pfdev->shrinker, "drm-panfrost")); + + pfdev->shrinker = shrinker_alloc(0, "drm-panfrost"); + if (!pfdev->shrinker) + return -ENOMEM; + + pfdev->shrinker->count_objects = panfrost_gem_shrinker_count; + pfdev->shrinker->scan_objects = panfrost_gem_shrinker_scan; + pfdev->shrinker->private_data = pfdev; + + shrinker_register(pfdev->shrinker); + + return 0; } /** @@ -116,7 +122,6 @@ void panfrost_gem_shrinker_cleanup(struct drm_device *dev) { struct panfrost_device *pfdev = dev->dev_private; - if (pfdev->shrinker.nr_deferred) { - unregister_shrinker(&pfdev->shrinker); - } + if (pfdev->shrinker) + shrinker_free(pfdev->shrinker); } -- Gitee From cce82b106043d9618b9b4ea2e0d11f0e8c4e5297 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:22 +0800 Subject: [PATCH 062/484] dm: dynamically allocate the dm-bufio shrinker commit 1f1d459c9a2f0c8618b137fe845c09e405ae59fc upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the dm-bufio shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct dm_bufio_client. Link: https://lkml.kernel.org/r/20230911094444.68966-24-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Abhinav Kumar Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/md/dm-bufio.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 4a7a395c3808..5b80f886f289 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1003,7 +1003,7 @@ struct dm_bufio_client { sector_t start; - struct shrinker shrinker; + struct shrinker *shrinker; struct work_struct shrink_work; atomic_long_t need_shrink; @@ -2412,7 +2412,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink { struct dm_bufio_client *c; - c = container_of(shrink, struct dm_bufio_client, shrinker); + c = shrink->private_data; atomic_long_add(sc->nr_to_scan, &c->need_shrink); queue_work(dm_bufio_wq, &c->shrink_work); @@ -2421,7 +2421,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); + struct dm_bufio_client *c = shrink->private_data; unsigned long count = cache_total(&c->cache); unsigned long retain_target = get_retain_buffers(c); unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink); @@ -2538,14 +2538,20 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign INIT_WORK(&c->shrink_work, shrink_work); atomic_long_set(&c->need_shrink, 0); - c->shrinker.count_objects = dm_bufio_shrink_count; - c->shrinker.scan_objects = dm_bufio_shrink_scan; - c->shrinker.seeks = 1; - c->shrinker.batch = 0; - r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - if (r) + c->shrinker = shrinker_alloc(0, "dm-bufio:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + if (!c->shrinker) { + r = -ENOMEM; goto bad; + } + + c->shrinker->count_objects = dm_bufio_shrink_count; + c->shrinker->scan_objects = dm_bufio_shrink_scan; + c->shrinker->seeks = 1; + c->shrinker->batch = 0; + c->shrinker->private_data = c; + + shrinker_register(c->shrinker); mutex_lock(&dm_bufio_clients_lock); dm_bufio_client_count++; @@ -2585,7 +2591,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) drop_buffers(c); - unregister_shrinker(&c->shrinker); + shrinker_free(c->shrinker); flush_work(&c->shrink_work); mutex_lock(&dm_bufio_clients_lock); -- Gitee From f23ab52dfc2f6ea15ec5a0e2764cfec31302bc7d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:23 +0800 Subject: [PATCH 063/484] dm zoned: dynamically allocate the dm-zoned-meta shrinker commit ba3d6acafd26416c3001adb4cc3cedcd6a419c73 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the dm-zoned-meta shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct dmz_metadata. 
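One detail worth calling out, since the same shape repeats below: shrinker_alloc() returns NULL on failure, so callers turn that into -ENOMEM, and a shrinker that was allocated but never registered can still be handed to shrinker_free(), which is exactly what the mm-shadow patch earlier in the series does in its error unwind. A sketch of that init shape follows; the "bar" names are invented for this note and are not taken from any of these patches.

	#include <linux/shrinker.h>

	struct bar_dev {			/* hypothetical driver state */
		struct shrinker *shrinker;
		const char *name;
	};

	static unsigned long bar_count(struct shrinker *shrink,
				       struct shrink_control *sc);
	static unsigned long bar_scan(struct shrinker *shrink,
				      struct shrink_control *sc);
	static int bar_setup_caches(struct bar_dev *bar);	/* hypothetical later step */

	static int bar_init(struct bar_dev *bar)
	{
		int ret;

		bar->shrinker = shrinker_alloc(0, "bar:%s", bar->name);
		if (!bar->shrinker)
			return -ENOMEM;

		ret = bar_setup_caches(bar);
		if (ret) {
			/* Allocated but never registered: freeing is still fine. */
			shrinker_free(bar->shrinker);
			bar->shrinker = NULL;
			return ret;
		}

		bar->shrinker->count_objects = bar_count;
		bar->shrinker->scan_objects = bar_scan;
		bar->shrinker->private_data = bar;
		shrinker_register(bar->shrinker);
		return 0;
	}
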
Link: https://lkml.kernel.org/r/20230911094444.68966-25-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Abhinav Kumar Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/md/dm-zoned-metadata.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 9d3cca8e3dc9..60a4dc01ea18 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -187,7 +187,7 @@ struct dmz_metadata { struct rb_root mblk_rbtree; struct list_head mblk_lru_list; struct list_head mblk_dirty_list; - struct shrinker mblk_shrinker; + struct shrinker *mblk_shrinker; /* Zone allocation management */ struct mutex map_lock; @@ -615,7 +615,7 @@ static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd, static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); + struct dmz_metadata *zmd = shrink->private_data; return atomic_read(&zmd->nr_mblks); } @@ -626,7 +626,7 @@ static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); + struct dmz_metadata *zmd = shrink->private_data; unsigned long count; spin_lock(&zmd->mblk_lock); @@ -2936,19 +2936,23 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, */ zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16; zmd->max_nr_mblks = zmd->min_nr_mblks + 512; - zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count; - zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan; - zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; /* Metadata cache shrinker */ - ret = register_shrinker(&zmd->mblk_shrinker, "dm-zoned-meta:(%u:%u)", - MAJOR(dev->bdev->bd_dev), - MINOR(dev->bdev->bd_dev)); - if (ret) { - dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); + zmd->mblk_shrinker = shrinker_alloc(0, "dm-zoned-meta:(%u:%u)", + MAJOR(dev->bdev->bd_dev), + MINOR(dev->bdev->bd_dev)); + 
if (!zmd->mblk_shrinker) { + ret = -ENOMEM; + dmz_zmd_err(zmd, "Allocate metadata cache shrinker failed"); goto err; } + zmd->mblk_shrinker->count_objects = dmz_mblock_shrinker_count; + zmd->mblk_shrinker->scan_objects = dmz_mblock_shrinker_scan; + zmd->mblk_shrinker->private_data = zmd; + + shrinker_register(zmd->mblk_shrinker); + dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version); for (i = 0; i < zmd->nr_devs; i++) dmz_print_dev(zmd, i); @@ -2995,7 +2999,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, */ void dmz_dtr_metadata(struct dmz_metadata *zmd) { - unregister_shrinker(&zmd->mblk_shrinker); + shrinker_free(zmd->mblk_shrinker); dmz_cleanup_metadata(zmd); kfree(zmd); } -- Gitee From 0d7d4038bf424d73332123743df4bd8b289db21a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:24 +0800 Subject: [PATCH 064/484] md/raid5: dynamically allocate the md-raid5 shrinker commit 86298d8b8ceacc17d0192cd6412d2773ff51b27f upstream Conflicts: resolved Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the md-raid5 shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct r5conf. Link: https://lkml.kernel.org/r/20230911094444.68966-26-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Song Liu Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/md/raid5.c | 26 +++++++++++++++----------- drivers/md/raid5.h | 2 +- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f69e4a6a8a59..8a0edb429b13 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7422,7 +7422,7 @@ static void free_conf(struct r5conf *conf) log_exit(conf); - unregister_shrinker(&conf->shrinker); + shrinker_free(conf->shrinker); free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); @@ -7470,7 +7470,7 @@ static int raid5_alloc_percpu(struct r5conf *conf) static unsigned long raid5_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + struct r5conf *conf = shrink->private_data; unsigned long ret = SHRINK_STOP; if (mutex_trylock(&conf->cache_size_mutex)) { @@ -7491,7 +7491,7 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink, static unsigned long raid5_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + struct r5conf *conf = shrink->private_data; int max_stripes = READ_ONCE(conf->max_nr_stripes); int min_stripes = READ_ONCE(conf->min_nr_stripes); @@ -7728,18 +7728,22 @@ static struct r5conf *setup_conf(struct mddev *mddev) * it reduces the queue depth and so can hurt throughput. * So set it rather large, scaled by number of devices. 
*/ - conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; - conf->shrinker.scan_objects = raid5_cache_scan; - conf->shrinker.count_objects = raid5_cache_count; - conf->shrinker.batch = 128; - conf->shrinker.flags = 0; - ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev)); - if (ret) { - pr_warn("md/raid:%s: couldn't register shrinker.\n", + conf->shrinker = shrinker_alloc(0, "md-raid5:%s", mdname(mddev)); + if (!conf->shrinker) { + ret = -ENOMEM; + pr_warn("md/raid:%s: couldn't allocate shrinker.\n", mdname(mddev)); goto abort; } + conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * 4; + conf->shrinker->scan_objects = raid5_cache_scan; + conf->shrinker->count_objects = raid5_cache_count; + conf->shrinker->batch = 128; + conf->shrinker->private_data = conf; + + shrinker_register(conf->shrinker); + sprintf(pers_name, "raid%d", mddev->new_level); rcu_assign_pointer(conf->thread, md_register_thread(raid5d, mddev, pers_name)); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index fd6171553880..f796b23c2b1b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -666,7 +666,7 @@ struct r5conf { wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; unsigned long cache_state; - struct shrinker shrinker; + struct shrinker *shrinker; int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; -- Gitee From cbe0af94d00357100a8d6c055c84fe0305dea7e4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:25 +0800 Subject: [PATCH 065/484] bcache: dynamically allocate the md-bcache shrinker commit a6a1eb6214cf7adfd1e2da33d819df17fe8a64d7 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the md-bcache shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct cache_set. Link: https://lkml.kernel.org/r/20230911094444.68966-27-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Cc: Coly Li Cc: Kent Overstreet Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/btree.c | 27 ++++++++++++++++----------- drivers/md/bcache/sysfs.c | 3 ++- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 83eb7f27db3d..38f2cb259321 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -542,7 +542,7 @@ struct cache_set { struct bio_set bio_split; /* For the btree cache */ - struct shrinker shrink; + struct shrinker *shrink; /* For the btree cache and anything allocation related */ struct mutex bucket_lock; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 30d6973de258..5f8e10e84f3c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -667,7 +667,7 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) static unsigned long bch_mca_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct cache_set *c = shrink->private_data; struct btree *b, *t; unsigned long i, nr = sc->nr_to_scan; unsigned long freed = 0; @@ -734,7 +734,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, static unsigned long bch_mca_count(struct shrinker *shrink, struct shrink_control *sc) { - struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct cache_set *c = shrink->private_data; if (c->shrinker_disabled) return 0; @@ -752,8 +752,8 @@ void bch_btree_cache_free(struct cache_set *c) closure_init_stack(&cl); - if (c->shrink.list.next) - unregister_shrinker(&c->shrink); + if (c->shrink) + shrinker_free(c->shrink); mutex_lock(&c->bucket_lock); @@ -828,14 +828,19 @@ int bch_btree_cache_alloc(struct cache_set *c) c->verify_data = NULL; #endif - c->shrink.count_objects = bch_mca_count; - c->shrink.scan_objects = bch_mca_scan; - c->shrink.seeks = 4; - c->shrink.batch = c->btree_pages * 2; + c->shrink = shrinker_alloc(0, "md-bcache:%pU", c->set_uuid); + if (!c->shrink) { + pr_warn("bcache: %s: could not allocate shrinker\n", __func__); + return 0; + } + + c->shrink->count_objects = bch_mca_count; + c->shrink->scan_objects = bch_mca_scan; + c->shrink->seeks = 4; + c->shrink->batch = c->btree_pages * 2; + c->shrink->private_data = c; - if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) - pr_warn("bcache: %s: could not register shrinker\n", - __func__); + shrinker_register(c->shrink); return 0; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3a34f3ac081..c4633425acc4 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -866,7 +866,8 @@ STORE(__bch_cache_set) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->shrink.scan_objects(&c->shrink, &sc); + if (c->shrink) + c->shrink->scan_objects(c->shrink, &sc); } sysfs_strtoul_clamp(congested_read_threshold_us, -- Gitee From 55fbf35f667ae1fbfafe4f7aa2d98acf35e5356c Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:26 +0800 Subject: [PATCH 066/484] vmw_balloon: dynamically allocate the vmw-balloon shrinker commit 
17c4eb036a37f079ce2857be5914f839623b4eb2 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the vmw-balloon shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct vmballoon. And we can simply exit vmballoon_init() when registering the shrinker fails. So the shrinker_registered indication is redundant, just remove it. Link: https://lkml.kernel.org/r/20230911094444.68966-28-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Nadav Amit Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/misc/vmw_balloon.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 9ce9b9e0e9b6..c817d8c21641 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -380,16 +380,7 @@ struct vmballoon { /** * @shrinker: shrinker interface that is used to avoid over-inflation. */ - struct shrinker shrinker; - - /** - * @shrinker_registered: whether the shrinker was registered. - * - * The shrinker interface does not handle gracefully the removal of - * shrinker that was not registered before. This indication allows to - * simplify the unregistration process. 
- */ - bool shrinker_registered; + struct shrinker *shrinker; }; static struct vmballoon balloon; @@ -1568,29 +1559,27 @@ static unsigned long vmballoon_shrinker_count(struct shrinker *shrinker, static void vmballoon_unregister_shrinker(struct vmballoon *b) { - if (b->shrinker_registered) - unregister_shrinker(&b->shrinker); - b->shrinker_registered = false; + shrinker_free(b->shrinker); + b->shrinker = NULL; } static int vmballoon_register_shrinker(struct vmballoon *b) { - int r; - /* Do nothing if the shrinker is not enabled */ if (!vmwballoon_shrinker_enable) return 0; - b->shrinker.scan_objects = vmballoon_shrinker_scan; - b->shrinker.count_objects = vmballoon_shrinker_count; - b->shrinker.seeks = DEFAULT_SEEKS; + b->shrinker = shrinker_alloc(0, "vmw-balloon"); + if (!b->shrinker) + return -ENOMEM; - r = register_shrinker(&b->shrinker, "vmw-balloon"); + b->shrinker->scan_objects = vmballoon_shrinker_scan; + b->shrinker->count_objects = vmballoon_shrinker_count; + b->shrinker->private_data = b; - if (r == 0) - b->shrinker_registered = true; + shrinker_register(b->shrinker); - return r; + return 0; } /* @@ -1883,7 +1872,7 @@ static int __init vmballoon_init(void) error = vmballoon_register_shrinker(&balloon); if (error) - goto fail; + return error; /* * Initialization of compaction must be done after the call to @@ -1905,9 +1894,6 @@ static int __init vmballoon_init(void) vmballoon_debugfs_init(&balloon); return 0; -fail: - vmballoon_unregister_shrinker(&balloon); - return error; } /* -- Gitee From 492414107e2bc9507d5739964606b941588b314f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:27 +0800 Subject: [PATCH 067/484] virtio_balloon: dynamically allocate the virtio-balloon shrinker commit 0fbb99698b165444e3cf5421f96de01d9d56baf4 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the virtio-balloon shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct virtio_balloon. Link: https://lkml.kernel.org/r/20230911094444.68966-29-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: "Michael S. Tsirkin" Cc: David Hildenbrand Cc: Jason Wang Cc: Xuan Zhuo Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/virtio/virtio_balloon.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 82ffbc0862dc..e57ef47641e7 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -111,7 +111,7 @@ struct virtio_balloon { struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */ - struct shrinker shrinker; + struct shrinker *shrinker; /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */ struct notifier_block oom_nb; @@ -820,8 +820,7 @@ static unsigned long shrink_free_pages(struct virtio_balloon *vb, static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct virtio_balloon *vb = container_of(shrinker, - struct virtio_balloon, shrinker); + struct virtio_balloon *vb = shrinker->private_data; return shrink_free_pages(vb, sc->nr_to_scan); } @@ -829,8 +828,7 @@ static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct virtio_balloon *vb = container_of(shrinker, - struct virtio_balloon, shrinker); + struct virtio_balloon *vb = shrinker->private_data; return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES; } @@ -851,16 +849,22 @@ static int virtio_balloon_oom_notify(struct notifier_block *nb, static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) { - unregister_shrinker(&vb->shrinker); + shrinker_free(vb->shrinker); } static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) { - vb->shrinker.scan_objects = virtio_balloon_shrinker_scan; - vb->shrinker.count_objects = virtio_balloon_shrinker_count; - vb->shrinker.seeks = DEFAULT_SEEKS; + vb->shrinker = shrinker_alloc(0, "virtio-balloon"); + if (!vb->shrinker) + return -ENOMEM; - return register_shrinker(&vb->shrinker, "virtio-balloon"); + vb->shrinker->scan_objects = virtio_balloon_shrinker_scan; + vb->shrinker->count_objects = virtio_balloon_shrinker_count; + vb->shrinker->private_data = vb; + + shrinker_register(vb->shrinker); + + return 0; } static int virtballoon_probe(struct virtio_device *vdev) -- Gitee From 13aee429cb7dea9ad819f127b9ac26cbf4fe5b3a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:28 +0800 Subject: [PATCH 068/484] mbcache: dynamically allocate the mbcache shrinker commit 714e5bde00a58ba13b03b68547931e011f93de58 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the mbcache shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct mb_cache. 
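On the callback side, the only change these conversions make is how the owning object is found: shrink->private_data replaces the container_of() arithmetic on an embedded struct shrinker. The count/scan pair below is a minimal illustration with invented "baz" names; the trylock-then-SHRINK_STOP shape mirrors what the md-raid5 conversion in this series already does and is an assumption of this note, not something mbcache itself needs.

	#include <linux/mutex.h>
	#include <linux/shrinker.h>

	struct baz_cache {			/* hypothetical */
		struct mutex lock;
		unsigned long nr_cached;
		struct shrinker *shrinker;
	};

	static unsigned long baz_count(struct shrinker *shrink,
				       struct shrink_control *sc)
	{
		/* private_data replaces container_of(shrink, ..., shrinker). */
		struct baz_cache *baz = shrink->private_data;

		return READ_ONCE(baz->nr_cached);
	}

	static unsigned long baz_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
	{
		struct baz_cache *baz = shrink->private_data;
		unsigned long freed = 0;

		if (!mutex_trylock(&baz->lock))
			return SHRINK_STOP;	/* cannot make progress right now */

		while (freed < sc->nr_to_scan && baz->nr_cached) {
			baz->nr_cached--;	/* stand-in for real object eviction */
			freed++;
		}
		mutex_unlock(&baz->lock);

		return freed;
	}
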
Link: https://lkml.kernel.org/r/20230911094444.68966-30-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Alexander Viro Cc: Christian Brauner Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/mbcache.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/mbcache.c b/fs/mbcache.c index 2a4b8b549e93..82aa7a35db26 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -37,7 +37,7 @@ struct mb_cache { struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; - struct shrinker c_shrink; + struct shrinker *c_shrink; /* Work for shrinking when the cache has too many entries */ struct work_struct c_shrink_work; }; @@ -293,8 +293,7 @@ EXPORT_SYMBOL(mb_cache_entry_touch); static unsigned long mb_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct mb_cache *cache = container_of(shrink, struct mb_cache, - c_shrink); + struct mb_cache *cache = shrink->private_data; return cache->c_entry_count; } @@ -333,8 +332,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct mb_cache *cache = container_of(shrink, struct mb_cache, - c_shrink); + struct mb_cache *cache = shrink->private_data; return mb_cache_shrink(cache, sc->nr_to_scan); } @@ -377,15 +375,19 @@ struct mb_cache *mb_cache_create(int bucket_bits) for (i = 0; i < bucket_count; i++) INIT_HLIST_BL_HEAD(&cache->c_hash[i]); - cache->c_shrink.count_objects = mb_cache_count; - cache->c_shrink.scan_objects = mb_cache_scan; - cache->c_shrink.seeks = DEFAULT_SEEKS; - if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) { + cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker"); + if (!cache->c_shrink) { kfree(cache->c_hash); kfree(cache); goto err_out; } + cache->c_shrink->count_objects = mb_cache_count; + cache->c_shrink->scan_objects = mb_cache_scan; + cache->c_shrink->private_data = cache; + + shrinker_register(cache->c_shrink); + INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); return cache; @@ -406,7 +408,7 @@ void mb_cache_destroy(struct mb_cache *cache) { struct mb_cache_entry *entry, *next; - unregister_shrinker(&cache->c_shrink); + 
shrinker_free(cache->c_shrink); /* * We don't bother with any locking. Cache must not be used at this -- Gitee From df153efe6ad997dbbf80c7f0b0c12738c5284ff4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:29 +0800 Subject: [PATCH 069/484] ext4: dynamically allocate the ext4-es shrinker commit 4d09d75d8b8cb3c8087b46585924777bc8737663 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the ext4-es shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct ext4_sb_info. Link: https://lkml.kernel.org/r/20230911094444.68966-31-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/ext4/ext4.h | 2 +- fs/ext4/extents_status.c | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 32ec73abfda0..11ba3f9eb8ba 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1664,7 +1664,7 @@ struct ext4_sb_info { __u32 s_csum_seed; /* Reclaim extents from extent status tree */ - struct shrinker s_es_shrinker; + struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d097a0977b14..32b8ab38020e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1651,7 +1651,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, unsigned long nr; struct ext4_sb_info *sbi; - sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; @@ -1660,8 +1660,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct ext4_sb_info *sbi = container_of(shrink, - struct ext4_sb_info, s_es_shrinker); + struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; @@ -1745,13 +1744,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) if (err) goto err3; - sbi->s_es_shrinker.scan_objects = ext4_es_scan; - sbi->s_es_shrinker.count_objects = ext4_es_count; - sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", - sbi->s_sb->s_id); - if (err) + sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); + if (!sbi->s_es_shrinker) { + err = -ENOMEM; goto err4; + } + + sbi->s_es_shrinker->scan_objects = ext4_es_scan; + sbi->s_es_shrinker->count_objects = ext4_es_count; + sbi->s_es_shrinker->private_data = sbi; + + shrinker_register(sbi->s_es_shrinker); return 0; err4: @@ -1771,7 +1774,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); - unregister_shrinker(&sbi->s_es_shrinker); + shrinker_free(sbi->s_es_shrinker); } /* -- Gitee From 86e087ba017f40daa981a38f171ff0ead1c1656b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:30 +0800 Subject: [PATCH 070/484] jbd2,ext4: dynamically allocate the jbd2-journal shrinker commit 4b403dfa8ea8dfbc9d317b25a0433c1ac6765f7f upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the jbd2-journal shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct journal_s. 
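A small note on naming, since it is easy to miss in the diffs: shrinker_alloc() keeps the printf-style name that register_shrinker() used to take, so per-instance shrinkers such as this one can still embed the owning device in their name ("jbd2-journal:(%u:%u)" in the hunk below). This note assumes, without the patch saying so, that the name is also what identifies the instance in the shrinker debugfs interface when CONFIG_SHRINKER_DEBUG is enabled. A hypothetical device-scoped example, with an invented "mydev" structure:

	#include <linux/blkdev.h>
	#include <linux/shrinker.h>

	struct mydev {				/* invented for this note */
		struct block_device *bdev;
		struct shrinker *shrinker;
	};

	static int mydev_shrinker_init(struct mydev *d)
	{
		/* Only the per-device naming pattern is the point here. */
		d->shrinker = shrinker_alloc(0, "mydev:(%u:%u)",
					     MAJOR(d->bdev->bd_dev),
					     MINOR(d->bdev->bd_dev));
		if (!d->shrinker)
			return -ENOMEM;

		/*
		 * Callbacks, private_data and shrinker_register() follow
		 * as in the earlier sketches.
		 */
		return 0;
	}
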
Link: https://lkml.kernel.org/r/20230911094444.68966-32-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Jan Kara Cc: "Theodore Ts'o" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/jbd2/journal.c | 29 ++++++++++++++++++----------- include/linux/jbd2.h | 2 +- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ddde73299d62..e9b2985a0bbb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1290,7 +1290,7 @@ static int jbd2_min_tag_size(void) static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - journal_t *journal = container_of(shrink, journal_t, j_shrinker); + journal_t *journal = shrink->private_data; unsigned long nr_to_scan = sc->nr_to_scan; unsigned long nr_shrunk; unsigned long count; @@ -1316,7 +1316,7 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - journal_t *journal = container_of(shrink, journal_t, j_shrinker); + journal_t *journal = shrink->private_data; unsigned long count; count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); @@ -1630,14 +1630,21 @@ static journal_t *journal_init_common(struct block_device *bdev, goto err_cleanup; journal->j_shrink_transaction = NULL; - journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; - journal->j_shrinker.count_objects = jbd2_journal_shrink_count; - journal->j_shrinker.seeks = DEFAULT_SEEKS; - journal->j_shrinker.batch = journal->j_max_transaction_buffers; - err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - if (err) + + journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + if (!journal->j_shrinker) { + err = -ENOMEM; goto err_cleanup; + } + + journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker->count_objects = jbd2_journal_shrink_count; + journal->j_shrinker->batch = journal->j_max_transaction_buffers; + journal->j_shrinker->private_data = journal; + + shrinker_register(journal->j_shrinker); return journal; @@ -2216,9 
+2223,9 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } - if (journal->j_shrinker.flags & SHRINKER_REGISTERED) { + if (journal->j_shrinker) { percpu_counter_destroy(&journal->j_checkpoint_jh_count); - unregister_shrinker(&journal->j_shrinker); + shrinker_free(journal->j_shrinker); } if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f0bc9aa5aed3..1b2771c92571 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -886,7 +886,7 @@ struct journal_s * Journal head shrinker, reclaim buffer's journal head which * has been written back. */ - struct shrinker j_shrinker; + struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: -- Gitee From 3557fbe70d9a72ac80af6243ee1b3cdba3383446 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:31 +0800 Subject: [PATCH 071/484] nfsd: dynamically allocate the nfsd-client shrinker commit d17452aa33a6fd0e9cf2bd3d43237f2d2c294a7e upstream Conflicts: resolved Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the nfsd-client shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct nfsd_net. Link: https://lkml.kernel.org/r/20230911094444.68966-33-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Chuck Lever Acked-by: Jeff Layton Cc: Jeff Layton Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/nfsd/netns.h | 2 +- fs/nfsd/nfs4state.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 59570b79456f..8a2f6e3f42ab 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -207,7 +207,7 @@ struct nfsd_net { int nfs4_max_clients; atomic_t nfsd_courtesy_clients; - struct shrinker nfsd_client_shrinker; + struct shrinker *nfsd_client_shrinker; struct work_struct nfsd_shrinker_work; }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e2875706e6bf..40154bdd4db0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4412,8 +4412,7 @@ static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int count; - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_client_shrinker); + struct nfsd_net *nn = shrink->private_data; count = atomic_read(&nn->nfsd_courtesy_clients); if (!count) @@ -8185,12 +8184,16 @@ static int nfs4_state_create_net(struct net *net) INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); - nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; - nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; - nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; - - if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client")) + nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client"); + if (!nn->nfsd_client_shrinker) goto err_shrinker; + + nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count; + nn->nfsd_client_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_client_shrinker); + return 0; err_shrinker: @@ -8288,7 +8291,7 @@ nfs4_state_shutdown_net(struct net *net) struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - unregister_shrinker(&nn->nfsd_client_shrinker); + shrinker_free(nn->nfsd_client_shrinker); cancel_work_sync(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); -- Gitee From 8e4a2e11237d7f258a1dd1914c5dc495778dcd73 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:32 +0800 Subject: [PATCH 072/484] nfsd: dynamically allocate the nfsd-reply shrinker commit 8eea99a81c6f67c08fedd7db2ccd09aa93db69a7 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the nfsd-reply shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct nfsd_net. 
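[ Backport reviewer note: this is the same mechanical conversion used throughout this series. As a rough sketch (the foo_cache names below are placeholders, not code from this patch), the old embedded-shrinker pattern

    struct foo_cache {
            struct shrinker shrinker;       /* embedded in the owner */
    };

    err = register_shrinker(&cache->shrinker, "foo");
    ...
    unregister_shrinker(&cache->shrinker);

becomes a separately allocated shrinker that reaches its owner through private_data:

    struct foo_cache {
            struct shrinker *shrinker;      /* allocated by shrinker_alloc() */
    };

    cache->shrinker = shrinker_alloc(0, "foo");
    if (!cache->shrinker)
            return -ENOMEM;

    cache->shrinker->count_objects = foo_count;     /* placeholder callbacks */
    cache->shrinker->scan_objects  = foo_scan;
    cache->shrinker->private_data  = cache;         /* replaces container_of() */
    shrinker_register(cache->shrinker);
    ...
    shrinker_free(cache->shrinker);         /* replaces unregister_shrinker() */

The hunks below apply exactly this pattern to the nfsd-reply cache. ]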
Link: https://lkml.kernel.org/r/20230911094444.68966-34-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Chuck Lever Acked-by: Jeff Layton Acked-by: Muchun Song Cc: Neil Brown Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/nfsd/netns.h | 2 +- fs/nfsd/nfscache.c | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 8a2f6e3f42ab..a1e5675ba8b2 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -189,7 +189,7 @@ struct nfsd_net { /* size of cache when we saw the longest hash chain */ unsigned int longest_chain_cachesize; - struct shrinker nfsd_reply_cache_shrinker; + struct shrinker *nfsd_reply_cache_shrinker; /* tracking server-to-server copy mounts */ spinlock_t nfsd_ssc_lock; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index c52132ecb339..622830a09ec3 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -180,26 +180,29 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; unsigned int i; - int status = 0; nn->max_drc_entries = nfsd_cache_size_limit(); atomic_set(&nn->num_drc_entries, 0); hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); - nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; - nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; - nn->nfsd_reply_cache_shrinker.seeks = 1; - status = register_shrinker(&nn->nfsd_reply_cache_shrinker, - "nfsd-reply:%s", nn->nfsd_name); - if (status) - return status; - nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); if (!nn->drc_hashtbl) + return -ENOMEM; + + nn->nfsd_reply_cache_shrinker = shrinker_alloc(0, "nfsd-reply:%s", + nn->nfsd_name); + if (!nn->nfsd_reply_cache_shrinker) goto out_shrinker; + nn->nfsd_reply_cache_shrinker->scan_objects = nfsd_reply_cache_scan; + nn->nfsd_reply_cache_shrinker->count_objects = nfsd_reply_cache_count; + nn->nfsd_reply_cache_shrinker->seeks = 1; + nn->nfsd_reply_cache_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_reply_cache_shrinker); + for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head); spin_lock_init(&nn->drc_hashtbl[i].cache_lock); @@ -208,7 +211,7 @@ int 
nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: - unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + kvfree(nn->drc_hashtbl); printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; } @@ -218,7 +221,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct nfsd_cacherep *rp; unsigned int i; - unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + shrinker_free(nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { struct list_head *head = &nn->drc_hashtbl[i].lru_head; @@ -302,8 +305,7 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + struct nfsd_net *nn = shrink->private_data; return atomic_read(&nn->num_drc_entries); } @@ -322,8 +324,7 @@ nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + struct nfsd_net *nn = shrink->private_data; unsigned long freed = 0; LIST_HEAD(dispose); unsigned int i; -- Gitee From b48803af8c8056bef1e83329e8640fb3a835779b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:33 +0800 Subject: [PATCH 073/484] xfs: dynamically allocate the xfs-buf shrinker commit 17e7a00e60c8fff31082d5216b5c7d06d156f972 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-buf shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_buftarg. Link: https://lkml.kernel.org/r/20230911094444.68966-35-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/xfs/xfs_buf.c | 24 +++++++++++++----------- fs/xfs/xfs_buf.h | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 20c1d146af1d..541dc4ae4476 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1913,8 +1913,7 @@ xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); + struct xfs_buftarg *btp = shrink->private_data; LIST_HEAD(dispose); unsigned long freed; @@ -1936,8 +1935,7 @@ xfs_buftarg_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); + struct xfs_buftarg *btp = shrink->private_data; return list_lru_shrink_count(&btp->bt_lru, sc); } @@ -1947,7 +1945,7 @@ xfs_free_buftarg( { struct block_device *bdev = btp->bt_bdev; - unregister_shrinker(&btp->bt_shrinker); + shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); @@ -2031,13 +2029,17 @@ xfs_alloc_buftarg( if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) goto error_lru; - btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; - btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; - btp->bt_shrinker.seeks = DEFAULT_SEEKS; - btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", - mp->m_super->s_id)) + btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", + mp->m_super->s_id); + if (!btp->bt_shrinker) goto error_pcpu; + + btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker->private_data = btp; + + shrinker_register(btp->bt_shrinker); + return btp; error_pcpu: diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5896b58c5f4d..b21ec7234dbe 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -108,7 +108,7 @@ typedef struct xfs_buftarg { size_t bt_logical_sectormask; /* LRU control structures */ - struct shrinker bt_shrinker; + struct shrinker *bt_shrinker; struct list_lru bt_lru; struct percpu_counter bt_io_count; -- Gitee From 824765369622cb19487830f394e7ff91047946f6 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:34 +0800 Subject: [PATCH 074/484] xfs: dynamically allocate the xfs-inodegc shrinker commit 1a86a53da46778a583cd866465e18d662d866f30 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-inodegc shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_mount. Link: https://lkml.kernel.org/r/20230911094444.68966-36-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. 
Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/xfs/xfs_icache.c | 26 +++++++++++++++----------- fs/xfs/xfs_mount.c | 4 ++-- fs/xfs/xfs_mount.h | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c54a7c60e063..f79b35755a39 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2176,8 +2176,7 @@ xfs_inodegc_shrinker_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; @@ -2198,8 +2197,7 @@ xfs_inodegc_shrinker_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; bool no_items = true; @@ -2235,13 +2233,19 @@ int xfs_inodegc_register_shrinker( struct xfs_mount *mp) { - struct shrinker *shrink = &mp->m_inodegc_shrinker; + mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB, + "xfs-inodegc:%s", + mp->m_super->s_id); + if (!mp->m_inodegc_shrinker) + return -ENOMEM; + + mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count; + mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan; + mp->m_inodegc_shrinker->seeks = 0; + mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH; + mp->m_inodegc_shrinker->private_data = mp; - shrink->count_objects = xfs_inodegc_shrinker_count; - shrink->scan_objects = xfs_inodegc_shrinker_scan; - shrink->seeks = 0; - shrink->flags = SHRINKER_NONSLAB; - shrink->batch = XFS_INODEGC_SHRINKER_BATCH; + shrinker_register(mp->m_inodegc_shrinker); - return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); + return 0; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 747db90731e8..6aff33827b2a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1021,7 +1021,7 @@ xfs_mountfs( out_log_dealloc: xfs_log_mount_cancel(mp); out_inodegc_shrinker: - unregister_shrinker(&mp->m_inodegc_shrinker); + shrinker_free(mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) 
xfs_buftarg_drain(mp->m_logdev_targp); @@ -1104,7 +1104,7 @@ xfs_unmountfs( #if defined(DEBUG) xfs_errortag_clearall(mp); #endif - unregister_shrinker(&mp->m_inodegc_shrinker); + shrinker_free(mp->m_inodegc_shrinker); xfs_free_perag(mp); xfs_errortag_del(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index d19cca099bc3..219681d29fbc 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -219,7 +219,7 @@ typedef struct xfs_mount { atomic_t m_agirotor; /* last ag dir inode alloced */ /* Memory shrinker to throttle and reprioritize inodegc */ - struct shrinker m_inodegc_shrinker; + struct shrinker *m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. -- Gitee From 1eda7dd859d46d11683e103049552840bf93ffbd Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:35 +0800 Subject: [PATCH 075/484] xfs: dynamically allocate the xfs-qm shrinker commit fe88527852be8a6f1391c961db38d342cc46461d upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-qm shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_quotainfo. Link: https://lkml.kernel.org/r/20230911094444.68966-37-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/xfs/xfs_qm.c | 27 ++++++++++++++------------- fs/xfs/xfs_qm.h | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 086e78a6143a..94a7932ac570 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -504,8 +504,7 @@ xfs_qm_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_quotainfo *qi = container_of(shrink, - struct xfs_quotainfo, qi_shrinker); + struct xfs_quotainfo *qi = shrink->private_data; struct xfs_qm_isolate isol; unsigned long freed; int error; @@ -539,8 +538,7 @@ xfs_qm_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_quotainfo *qi = container_of(shrink, - struct xfs_quotainfo, qi_shrinker); + struct xfs_quotainfo *qi = shrink->private_data; return list_lru_shrink_count(&qi->qi_lru, sc); } @@ -680,15 +678,18 @@ xfs_qm_init_quotainfo( if (XFS_IS_PQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); - qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; - qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; - qinf->qi_shrinker.seeks = DEFAULT_SEEKS; - qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - - error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s", - mp->m_super->s_id); - if (error) + qinf->qi_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-qm:%s", + mp->m_super->s_id); + if (!qinf->qi_shrinker) { + error = -ENOMEM; goto out_free_inos; + } + + qinf->qi_shrinker->count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker->scan_objects = xfs_qm_shrink_scan; + qinf->qi_shrinker->private_data = qinf; + + shrinker_register(qinf->qi_shrinker); return 0; @@ -718,7 +719,7 @@ xfs_qm_destroy_quotainfo( qi = mp->m_quotainfo; ASSERT(qi != NULL); - unregister_shrinker(&qi->qi_shrinker); + shrinker_free(qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); xfs_qm_destroy_quotainos(qi); mutex_destroy(&qi->qi_tree_lock); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9683f0457d19..d5c9fc4ba591 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -63,7 +63,7 @@ struct xfs_quotainfo { struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; + struct shrinker *qi_shrinker; /* Minimum and maximum quota expiration timestamp values. */ time64_t qi_expiry_min; -- Gitee From c1b502ddcba9756757737cc6dea9cce181a8b825 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:36 +0800 Subject: [PATCH 076/484] zsmalloc: dynamically allocate the mm-zspool shrinker commit c19b548b493455cfb80895074687152869e77742 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the mm-zspool shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct zs_pool. 
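[ Backport reviewer note: as elsewhere in this series, the callback side of the conversion is just swapping container_of() on the embedded shrinker for the private_data pointer, condensed here from the hunks below:

    static unsigned long zs_shrinker_count(struct shrinker *shrinker,
                                           struct shrink_control *sc)
    {
            /* was: container_of(shrinker, struct zs_pool, shrinker) */
            struct zs_pool *pool = shrinker->private_data;

            /* ... walk pool->size_class[] as before ... */
    }

On the setup side, zs_register_shrinker() now returns -ENOMEM if shrinker_alloc() fails, and zs_unregister_shrinker() reduces to shrinker_free(). ]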
Link: https://lkml.kernel.org/r/20230911094444.68966-38-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zsmalloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b58f957429f0..c743ce7a5f49 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -229,7 +229,7 @@ struct zs_pool { struct zs_pool_stats stats; /* Compact classes */ - struct shrinker shrinker; + struct shrinker *shrinker; #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; @@ -2086,8 +2086,7 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long pages_freed; - struct zs_pool *pool = container_of(shrinker, struct zs_pool, - shrinker); + struct zs_pool *pool = shrinker->private_data; /* * Compact classes and calculate compaction delta. 
@@ -2105,8 +2104,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, int i; struct size_class *class; unsigned long pages_to_free = 0; - struct zs_pool *pool = container_of(shrinker, struct zs_pool, - shrinker); + struct zs_pool *pool = shrinker->private_data; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; @@ -2121,18 +2119,23 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, static void zs_unregister_shrinker(struct zs_pool *pool) { - unregister_shrinker(&pool->shrinker); + shrinker_free(pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) { - pool->shrinker.scan_objects = zs_shrinker_scan; - pool->shrinker.count_objects = zs_shrinker_count; - pool->shrinker.batch = 0; - pool->shrinker.seeks = DEFAULT_SEEKS; + pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name); + if (!pool->shrinker) + return -ENOMEM; + + pool->shrinker->scan_objects = zs_shrinker_scan; + pool->shrinker->count_objects = zs_shrinker_count; + pool->shrinker->batch = 0; + pool->shrinker->private_data = pool; - return register_shrinker(&pool->shrinker, "mm-zspool:%s", - pool->name); + shrinker_register(pool->shrinker); + + return 0; } static int calculate_zspage_chain_size(int class_size) -- Gitee From f94dedc3f392be9af7dc6418172507509d1af65d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:37 +0800 Subject: [PATCH 077/484] fs: super: dynamically allocate the s_shrink commit 1720f5dd8d3af34d6023fb9e8c35e5e60e8b6643 upstream Conflicts: none Backport-reason: Shrinker lockless In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the s_shrink, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct super_block. Link: https://lkml.kernel.org/r/20230911094444.68966-39-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: Alexander Viro Cc: Christian Brauner Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/btrfs/super.c | 2 +- fs/kernfs/mount.c | 2 +- fs/proc/root.c | 2 +- fs/super.c | 33 ++++++++++++++++++--------------- include/linux/fs.h | 2 +- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2045ac3d94c4..55dfa57d51a7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1521,7 +1521,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name, s->s_id); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index c4bf26142eec..79b96e74a8a0 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -265,7 +265,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_time_gran = 1; /* sysfs dentries and inodes don't require IO to create */ - sb->s_shrink.seeks = 0; + sb->s_shrink->seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); diff --git a/fs/proc/root.c b/fs/proc/root.c index 9191248f2dac..b55dbc70287b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -188,7 +188,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; + s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); diff --git a/fs/super.c b/fs/super.c index 977b6861b284..077ad7f5d6f2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -191,7 +191,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, long dentries; long inodes; - sb = container_of(shrink, struct super_block, s_shrink); + sb = shrink->private_data; /* * Deadlock avoidance. 
We may hold various FS locks, and we don't want @@ -244,7 +244,7 @@ static unsigned long super_cache_count(struct shrinker *shrink, struct super_block *sb; long total_objects = 0; - sb = container_of(shrink, struct super_block, s_shrink); + sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability @@ -306,7 +306,7 @@ static void destroy_unused_super(struct super_block *s) security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); - free_prealloced_shrinker(&s->s_shrink); + shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } @@ -383,16 +383,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; - s->s_shrink.seeks = DEFAULT_SEEKS; - s->s_shrink.scan_objects = super_cache_scan; - s->s_shrink.count_objects = super_cache_count; - s->s_shrink.batch = 1024; - s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; - if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) + s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, + "sb-%s", type->name); + if (!s->s_shrink) goto fail; - if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) + + s->s_shrink->scan_objects = super_cache_scan; + s->s_shrink->count_objects = super_cache_count; + s->s_shrink->batch = 1024; + s->s_shrink->private_data = s; + + if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) + if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; @@ -477,7 +480,7 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - unregister_shrinker(&s->s_shrink); + shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); @@ -829,7 +832,7 @@ struct super_block *sget_fc(struct fs_context *fc, hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); - register_shrinker_prepared(&s->s_shrink); + shrinker_register(s->s_shrink); return s; share_extant_sb: @@ -912,7 +915,7 @@ struct super_block *sget(struct file_system_type *type, hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); - register_shrinker_prepared(&s->s_shrink); + shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); @@ -1536,7 +1539,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); - shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, + shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index d543ece194ea..40f95341dbb9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1279,7 +1279,7 @@ struct super_block { const struct dentry_operations *s_d_op; /* default d_op for dentries */ - struct shrinker s_shrink; /* per-sb shrinker handle */ + struct shrinker *s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; -- Gitee From cb4592a3c4bc17a41ced19c7dac9a7ad973a5476 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:38 +0800 Subject: [PATCH 078/484] mm: shrinker: remove old APIs commit f2383e01507eeee8a1c1283d61a117a97d6c4ebe upstream Conflicts: none 
Backport-reason: Shrinker lockless Now no users are using the old APIs, just remove them. Link: https://lkml.kernel.org/r/20230911094444.68966-40-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shrinker.h | 8 --- mm/shrinker.c | 143 --------------------------------------- 2 files changed, 151 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 484724cabaae..babc3ccdda24 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -109,14 +109,6 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); -extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, - const char *fmt, ...); -extern void register_shrinker_prepared(struct shrinker *shrinker); -extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, - const char *fmt, ...); -extern void unregister_shrinker(struct shrinker *shrinker); -extern void free_prealloced_shrinker(struct shrinker *shrinker); - #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); diff --git a/mm/shrinker.c b/mm/shrinker.c index d1032a4d5684..736fa67e8454 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -655,146 +655,3 @@ void shrinker_free(struct shrinker *shrinker) kfree(shrinker); } EXPORT_SYMBOL_GPL(shrinker_free); - -/* - * Add a shrinker callback to be called from the vm. - */ -static int __prealloc_shrinker(struct shrinker *shrinker) -{ - unsigned int size; - int err; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); - if (err != -ENOSYS) - return err; - - shrinker->flags &= ~SHRINKER_MEMCG_AWARE; - } - - size = sizeof(*shrinker->nr_deferred); - if (shrinker->flags & SHRINKER_NUMA_AWARE) - size *= nr_node_ids; - - shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); - if (!shrinker->nr_deferred) - return -ENOMEM; - - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
-{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __prealloc_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - - return err; -} -#else -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __prealloc_shrinker(shrinker); -} -#endif - -void free_prealloced_shrinker(struct shrinker *shrinker) -{ -#ifdef CONFIG_SHRINKER_DEBUG - kfree_const(shrinker->name); - shrinker->name = NULL; -#endif - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - down_write(&shrinker_rwsem); - unregister_memcg_shrinker(shrinker); - up_write(&shrinker_rwsem); - return; - } - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} - -void register_shrinker_prepared(struct shrinker *shrinker) -{ - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - shrinker->flags |= SHRINKER_REGISTERED; - shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); -} - -static int __register_shrinker(struct shrinker *shrinker) -{ - int err = __prealloc_shrinker(shrinker); - - if (err) - return err; - register_shrinker_prepared(shrinker); - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __register_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - return err; -} -#else -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __register_shrinker(shrinker); -} -#endif -EXPORT_SYMBOL(register_shrinker); - -/* - * Remove one - */ -void unregister_shrinker(struct shrinker *shrinker) -{ - struct dentry *debugfs_entry; - int debugfs_id; - - if (!(shrinker->flags & SHRINKER_REGISTERED)) - return; - - down_write(&shrinker_rwsem); - list_del(&shrinker->list); - shrinker->flags &= ~SHRINKER_REGISTERED; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - up_write(&shrinker_rwsem); - - shrinker_debugfs_remove(debugfs_entry, debugfs_id); - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} -EXPORT_SYMBOL(unregister_shrinker); -- Gitee From d066314cab7b68297437d7426df1c9419f9e63b5 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:39 +0800 Subject: [PATCH 079/484] mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred} commit 307bececcd1205bcb67a3c0d53a69db237ccc9d4 upstream Conflicts: trivial, resolved Backport-reason: Shrinker lockless Currently, we maintain two linear arrays per node per memcg, which are shrinker_info::map and shrinker_info::nr_deferred. And we need to resize them when the shrinker_nr_max is exceeded, that is, allocate a new array, and then copy the old array to the new array, and finally free the old array by RCU. For shrinker_info::map, we do set_bit() under the RCU lock, so we may set the value into the old map which is about to be freed. This may cause the value set to be lost. The current solution is not to copy the old map when resizing, but to set all the corresponding bits in the new map to 1. This solves the data loss problem, but bring the overhead of more pointless loops while doing memcg slab shrink. 
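[ Backport reviewer note: for reference, this is the resize behaviour described above, as removed from expand_one_shrinker_info() later in this patch:

    /* map: set all old bits, clear all new bits */
    memset(new->map, (int)0xff, old_map_size);
    memset((void *)new->map + old_map_size, 0, map_size - old_map_size);

The same workaround is not available for nr_deferred, as the next paragraph explains. ]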
For shrinker_info::nr_deferred, we will only modify it under the read lock of shrinker_rwsem, so it will not run concurrently with the resizing. But after we make memcg slab shrink lockless, there will be the same data loss problem as shrinker_info::map, and we can't work around it like the map. For such resizable arrays, the most straightforward idea is to change it to xarray, like we did for list_lru [1]. We need to do xa_store() in the list_lru_add()-->set_shrinker_bit(), but this will cause memory allocation, and the list_lru_add() doesn't accept failure. A possible solution is to pre-allocate, but the location of pre-allocation is not well determined (such as deferred_split_shrinker case). Therefore, this commit chooses to introduce the following secondary array for shrinker_info::{map, nr_deferred}: +---------------+--------+--------+-----+ | shrinker_info | unit 0 | unit 1 | ... | (secondary array) +---------------+--------+--------+-----+ | v +---------------+-----+ | nr_deferred[] | map | (leaf array) +---------------+-----+ (shrinker_info_unit) The leaf array is never freed unless the memcg is destroyed. The secondary array will be resized every time the shrinker id exceeds shrinker_nr_max. So the shrinker_info_unit can be indexed from both the old and the new shrinker_info->unit[x]. Then even if we get the old secondary array under the RCU lock, the found map and nr_deferred are also true, so the updated nr_deferred and map will not be lost. [1]. https://lore.kernel.org/all/20220228122126.37293-13-songmuchun@bytedance.com/ [zhengqi.arch@bytedance.com: unlock the &shrinker_rwsem before the call to free_shrinker_info()] Link: https://lkml.kernel.org/r/20230928141517.12164-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-41-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 12 +- include/linux/shrinker.h | 17 +++ mm/shrinker.c | 250 +++++++++++++++++++++++-------------- 3 files changed, 172 insertions(+), 107 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 138b484cd8f9..425b6793ac7b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #define MEM_LATENCY_MAX_SLOTS 64 @@ -123,17 +124,6 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; -/* - * Bitmap and deferred work of shrinker::id corresponding to memcg-aware - * shrinkers, which have elements charged to this memcg. - */ -struct shrinker_info { - struct rcu_head rcu; - atomic_long_t *nr_deferred; - unsigned long *map; - int map_nr_max; -}; - struct lruvec_stats_percpu { /* Local (CPU and cgroup) state */ long state[NR_VM_NODE_STAT_ITEMS]; diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index babc3ccdda24..d5aae1734c4a 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -6,6 +6,23 @@ #include #include +#define SHRINKER_UNIT_BITS BITS_PER_LONG + +/* + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to the memcg. + */ +struct shrinker_info_unit { + atomic_long_t nr_deferred[SHRINKER_UNIT_BITS]; + DECLARE_BITMAP(map, SHRINKER_UNIT_BITS); +}; + +struct shrinker_info { + struct rcu_head rcu; + int map_nr_max; + struct shrinker_info_unit *unit[]; +}; + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. diff --git a/mm/shrinker.c b/mm/shrinker.c index 736fa67e8454..e9644cda80b5 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -12,15 +12,50 @@ DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ -static inline int shrinker_map_size(int nr_items) +static inline int shrinker_unit_size(int nr_items) { - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); + return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *)); } -static inline int shrinker_defer_size(int nr_items) +static inline void shrinker_unit_free(struct shrinker_info *info, int start) { - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); + struct shrinker_info_unit **unit; + int nr, i; + + if (!info) + return; + + unit = info->unit; + nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS); + + for (i = start; i < nr; i++) { + if (!unit[i]) + break; + + kfree(unit[i]); + unit[i] = NULL; + } +} + +static inline int shrinker_unit_alloc(struct shrinker_info *new, + struct shrinker_info *old, int nid) +{ + struct shrinker_info_unit *unit; + int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS); + int start = old ? 
DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0; + int i; + + for (i = start; i < nr; i++) { + unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid); + if (!unit) { + shrinker_unit_free(new, start); + return -ENOMEM; + } + + new->unit[i] = unit; + } + + return 0; } void free_shrinker_info(struct mem_cgroup *memcg) @@ -32,6 +67,7 @@ void free_shrinker_info(struct mem_cgroup *memcg) for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); + shrinker_unit_free(info, 0); kvfree(info); rcu_assign_pointer(pn->shrinker_info, NULL); } @@ -40,28 +76,28 @@ void free_shrinker_info(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg) { struct shrinker_info *info; - int nid, size, ret = 0; - int map_size, defer_size = 0; + int nid, ret = 0; + int array_size = 0; down_write(&shrinker_rwsem); - map_size = shrinker_map_size(shrinker_nr_max); - defer_size = shrinker_defer_size(shrinker_nr_max); - size = map_size + defer_size; + array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); - if (!info) { - free_shrinker_info(memcg); - ret = -ENOMEM; - break; - } - info->nr_deferred = (atomic_long_t *)(info + 1); - info->map = (void *)info->nr_deferred + defer_size; + info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); + if (!info) + goto err; info->map_nr_max = shrinker_nr_max; + if (shrinker_unit_alloc(info, NULL, nid)) + goto err; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); return ret; + +err: + up_write(&shrinker_rwsem); + free_shrinker_info(memcg); + return -ENOMEM; } static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, @@ -71,15 +107,12 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, lockdep_is_held(&shrinker_rwsem)); } -static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int map_size, int defer_size, - int old_map_size, int old_defer_size, - int new_nr_max) +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, + int old_size, int new_nr_max) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; - int size = map_size + defer_size; for_each_node(nid) { pn = memcg->nodeinfo[nid]; @@ -92,21 +125,17 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, if (new_nr_max <= old->map_nr_max) continue; - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + new = kvmalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid); if (!new) return -ENOMEM; - new->nr_deferred = (atomic_long_t *)(new + 1); - new->map = (void *)new->nr_deferred + defer_size; new->map_nr_max = new_nr_max; - /* map: set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_map_size); - memset((void *)new->map + old_map_size, 0, map_size - old_map_size); - /* nr_deferred: copy old values, clear all new values */ - memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); - memset((void *)new->nr_deferred + old_defer_size, 0, - defer_size - old_defer_size); + memcpy(new->unit, old->unit, old_size); + if (shrinker_unit_alloc(new, old, nid)) { + kvfree(new); + return -ENOMEM; + } rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); @@ -118,9 +147,8 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, static int expand_shrinker_info(int new_id) { int ret = 0; - int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); - int map_size, defer_size = 0; - int old_map_size, 
old_defer_size = 0; + int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS); + int new_size, old_size = 0; struct mem_cgroup *memcg; if (!root_mem_cgroup) @@ -128,15 +156,12 @@ static int expand_shrinker_info(int new_id) lockdep_assert_held(&shrinker_rwsem); - map_size = shrinker_map_size(new_nr_max); - defer_size = shrinker_defer_size(new_nr_max); - old_map_size = shrinker_map_size(shrinker_nr_max); - old_defer_size = shrinker_defer_size(shrinker_nr_max); + new_size = shrinker_unit_size(new_nr_max); + old_size = shrinker_unit_size(shrinker_nr_max); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - ret = expand_one_shrinker_info(memcg, map_size, defer_size, - old_map_size, old_defer_size, + ret = expand_one_shrinker_info(memcg, new_size, old_size, new_nr_max); if (ret) { mem_cgroup_iter_break(NULL, memcg); @@ -150,17 +175,34 @@ static int expand_shrinker_info(int new_id) return ret; } +static inline int shrinker_id_to_index(int shrinker_id) +{ + return shrinker_id / SHRINKER_UNIT_BITS; +} + +static inline int shrinker_id_to_offset(int shrinker_id) +{ + return shrinker_id % SHRINKER_UNIT_BITS; +} + +static inline int calc_shrinker_id(int index, int offset) +{ + return index * SHRINKER_UNIT_BITS + offset; +} + void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; + struct shrinker_info_unit *unit; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + unit = info->unit[shrinker_id_to_index(shrinker_id)]; if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); - set_bit(shrinker_id, info->map); + set_bit(shrinker_id_to_offset(shrinker_id), unit->map); } rcu_read_unlock(); } @@ -209,26 +251,31 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; + struct shrinker_info_unit *unit; info = shrinker_info_protected(memcg, nid); - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); + unit = info->unit[shrinker_id_to_index(shrinker->id)]; + return atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; + struct shrinker_info_unit *unit; info = shrinker_info_protected(memcg, nid); - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); + unit = info->unit[shrinker_id_to_index(shrinker->id)]; + return atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); } void reparent_shrinker_deferred(struct mem_cgroup *memcg) { - int i, nid; + int nid, index, offset; long nr; struct mem_cgroup *parent; struct shrinker_info *child_info, *parent_info; + struct shrinker_info_unit *child_unit, *parent_unit; parent = parent_mem_cgroup(memcg); if (!parent) @@ -239,9 +286,13 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); - for (i = 0; i < child_info->map_nr_max; i++) { - nr = atomic_long_read(&child_info->nr_deferred[i]); - atomic_long_add(nr, &parent_info->nr_deferred[i]); + for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) { + child_unit = child_info->unit[index]; + parent_unit = parent_info->unit[index]; + for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) { + nr 
= atomic_long_read(&child_unit->nr_deferred[offset]); + atomic_long_add(nr, &parent_unit->nr_deferred[offset]); + } } } up_read(&shrinker_rwsem); @@ -407,7 +458,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int i; + int offset, index = 0; if (!mem_cgroup_online(memcg)) return 0; @@ -419,56 +470,63 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (unlikely(!info)) goto unlock; - for_each_set_bit(i, info->map, info->map_nr_max) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - struct shrinker *shrinker; + for (; index < shrinker_id_to_index(info->map_nr_max); index++) { + struct shrinker_info_unit *unit; - shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(i, info->map); - continue; - } + unit = info->unit[index]; - /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) - continue; + for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + int shrinker_id = calc_shrinker_id(index, offset); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) { - clear_bit(i, info->map); - /* - * After the shrinker reported that it had no objects to - * free, but before we cleared the corresponding bit in - * the memcg shrinker map, a new object might have been - * added. To make sure, we have the bit set in this - * case, we invoke the shrinker one more time and reset - * the bit if it reports that it is not empty anymore. - * The memory barrier here pairs with the barrier in - * set_shrinker_bit(): - * - * list_lru_add() shrink_slab_memcg() - * list_add_tail() clear_bit() - * - * set_bit() do_shrink_slab() - */ - smp_mb__after_atomic(); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - else - set_shrinker_bit(memcg, nid, i); - } - freed += ret; + shrinker = idr_find(&shrinker_idr, shrinker_id); + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { + if (!shrinker) + clear_bit(offset, unit->map); + continue; + } - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_online() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(offset, unit->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. + * The memory barrier here pairs with the barrier in + * set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + set_shrinker_bit(memcg, nid, shrinker_id); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? 
: 1; + goto unlock; + } } } unlock: -- Gitee From 81e8714000f897a5a3e55ffd2e4d534f4dd8a94d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:40 +0800 Subject: [PATCH 080/484] mm: shrinker: rename {prealloc|unregister}_memcg_shrinker() to shrinker_memcg_{alloc|remove}() commit 48a7a0996a0089f4161ad437a810a10596e8d278 upstream Conflicts: none Backport-reason: Shrinker lockless With the new shrinker APIs, there is no action such as prealloc, so rename {prealloc|unregister}_memcg_shrinker() to shrinker_memcg_{alloc|remove}(), which corresponds to the idr_{alloc|remove}() inside the function. Link: https://lkml.kernel.org/r/20230911094444.68966-42-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shrinker.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index e9644cda80b5..e2e832b15803 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -210,7 +210,7 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) static DEFINE_IDR(shrinker_idr); -static int prealloc_memcg_shrinker(struct shrinker *shrinker) +static int shrinker_memcg_alloc(struct shrinker *shrinker) { int id, ret = -ENOMEM; @@ -236,7 +236,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) return ret; } -static void unregister_memcg_shrinker(struct shrinker *shrinker) +static void shrinker_memcg_remove(struct shrinker *shrinker) { int id = shrinker->id; @@ -298,12 +298,12 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) up_read(&shrinker_rwsem); } #else -static int prealloc_memcg_shrinker(struct shrinker *shrinker) +static int shrinker_memcg_alloc(struct shrinker *shrinker) { return -ENOSYS; } -static void unregister_memcg_shrinker(struct shrinker *shrinker) +static void shrinker_memcg_remove(struct shrinker *shrinker) { } @@ -629,7 +629,7 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) shrinker->seeks = DEFAULT_SEEKS; if (flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); + err = shrinker_memcg_alloc(shrinker); if (err == -ENOSYS) { /* Memcg is not supported, fallback to non-memcg-aware shrinker. 
*/ shrinker->flags &= ~SHRINKER_MEMCG_AWARE; @@ -701,7 +701,7 @@ void shrinker_free(struct shrinker *shrinker) shrinker_debugfs_name_free(shrinker); if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + shrinker_memcg_remove(shrinker); up_write(&shrinker_rwsem); if (debugfs_entry) -- Gitee From fe3eed828e858ffaa7da6dc2602866a78481c640 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:41 +0800 Subject: [PATCH 081/484] mm: shrinker: make global slab shrink lockless commit ca1d36b823944f24b5755311e95883fb5fdb807b upstream Conflicts: trivial, resolved Backport-reason: Shrinker lockless The shrinker_rwsem is a global read-write lock in shrinkers subsystem, which protects most operations such as slab shrink, registration and unregistration of shrinkers, etc. This can easily cause problems in the following cases. 1) When the memory pressure is high and there are many filesystems mounted or unmounted at the same time, slab shrink will be affected (down_read_trylock() failed). Such as the real workload mentioned by Kirill Tkhai: ``` One of the real workloads from my experience is start of an overcommitted node containing many starting containers after node crash (or many resuming containers after reboot for kernel update). In these cases memory pressure is huge, and the node goes round in long reclaim. ``` 2) If a shrinker is blocked (such as the case mentioned in [1]) and a writer comes in (such as mount a fs), then this writer will be blocked and cause all subsequent shrinker-related operations to be blocked. Even if there is no competitor when shrinking slab, there may still be a problem. The down_read_trylock() may become a perf hotspot with frequent calls to shrink_slab(). Because of the poor multicore scalability of atomic operations, this can lead to a significant drop in IPC (instructions per cycle). We used to implement the lockless slab shrink with SRCU [2], but then kernel test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test case [3], so we reverted it [4]. This commit uses the refcount+RCU method [5] proposed by Dave Chinner to re-implement the lockless global slab shrink. The memcg slab shrink is handled in the subsequent patch. For now, all shrinker instances are converted to dynamically allocated and will be freed by call_rcu(). So we can use rcu_read_{lock,unlock}() to ensure that the shrinker instance is valid. And the shrinker instance will not be run again after unregistration. So the structure that records the pointer of shrinker instance can be safely freed without waiting for the RCU read-side critical section. In this way, while we implement the lockless slab shrink, we don't need to be blocked in unregister_shrinker(). 
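A condensed sketch of the traversal pattern described above; the shrink_control setup and the SHRINK_EMPTY handling are left out here, and the complete version is in the shrink_slab() hunk below:

```
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		if (!shrinker_try_get(shrinker))	/* refcount already 0: being freed */
			continue;
		rcu_read_unlock();			/* do_shrink_slab() may sleep */

		freed += do_shrink_slab(&sc, shrinker, priority);

		rcu_read_lock();			/* reacquire before the put */
		shrinker_put(shrinker);			/* may wake the waiter in shrinker_free() */
	}
	rcu_read_unlock();
```

Reacquiring the RCU read lock before shrinker_put() is the key invariant: it keeps both this shrinker and its list neighbour from being freed before the next list_for_each_entry_rcu() step.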
The following are the test results: stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 & 1) Before applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 473062 60.00 8.00 279.13 7884.12 1647.59 for a 60.01s run time: 1440.34s available CPU time 7.99s user time ( 0.55%) 279.13s system time ( 19.38%) 287.12s total time ( 19.93%) load average: 7.12 2.99 1.15 successful run completed in 60.01s (1 min, 0.01 secs) 2) After applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 477165 60.00 8.13 281.34 7952.55 1648.40 for a 60.01s run time: 1440.33s available CPU time 8.12s user time ( 0.56%) 281.34s system time ( 19.53%) 289.46s total time ( 20.10%) load average: 6.98 3.03 1.19 successful run completed in 60.01s (1 min, 0.01 secs) We can see that the ops/s has hardly changed. [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/ [2]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/ [3]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [4]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/ [5]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230911094444.68966-43-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shrinker.h | 24 +++++++++++ mm/shrinker.c | 89 ++++++++++++++++++++++++++++++---------- 2 files changed, 92 insertions(+), 21 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index d5aae1734c4a..746e86918d24 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -4,6 +4,8 @@ #include #include +#include +#include #include #define SHRINKER_UNIT_BITS BITS_PER_LONG @@ -88,6 +90,17 @@ struct shrinker { int seeks; /* seeks to recreate an obj */ unsigned flags; + /* + * The reference count of this shrinker. Registered shrinker have an + * initial refcount of 1, then the lookup operations are now allowed + * to use it via shrinker_try_get(). Later in the unregistration step, + * the initial refcount will be discarded, and will free the shrinker + * asynchronously via RCU after its refcount reaches 0. + */ + refcount_t refcount; + struct completion done; /* use to wait for refcount to reach 0 */ + struct rcu_head rcu; + void *private_data; /* These are for internal use */ @@ -126,6 +139,17 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); +static inline bool shrinker_try_get(struct shrinker *shrinker) +{ + return refcount_inc_not_zero(&shrinker->refcount); +} + +static inline void shrinker_put(struct shrinker *shrinker) +{ + if (refcount_dec_and_test(&shrinker->refcount)) + complete(&shrinker->done); +} + #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); diff --git a/mm/shrinker.c b/mm/shrinker.c index e2e832b15803..9bb2e61092b3 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "internal.h" @@ -577,33 +578,50 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - if (!down_read_trylock(&shrinker_rwsem)) - goto out; - - list_for_each_entry(shrinker, &shrinker_list, list) { + /* + * lockless algorithm of global shrink. + * + * In the unregistration setp, the shrinker will be freed asynchronously + * via RCU after its refcount reaches 0. So both rcu_read_lock() and + * shrinker_try_get() can be used to ensure the existence of the shrinker. + * + * So in the global shrink: + * step 1: use rcu_read_lock() to guarantee existence of the shrinker + * and the validity of the shrinker_list walk. + * step 2: use shrinker_try_get() to try get the refcount, if successful, + * then the existence of the shrinker can also be guaranteed, + * so we can release the RCU lock to do do_shrink_slab() that + * may sleep. + * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(), + * which ensures that neither this shrinker nor the next shrinker + * will be freed in the next traversal operation. 
+ * step 4: do shrinker_put() paired with step 2 to put the refcount, + * if the refcount reaches 0, then wake up the waiter in + * shrinker_free() by calling complete(). + */ + rcu_read_lock(); + list_for_each_entry_rcu(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; + if (!shrinker_try_get(shrinker)) + continue; + + rcu_read_unlock(); + ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - /* - * Bail out if someone want to register a new shrinker to - * prevent the registration from being stalled for long periods - * by parallel ongoing shrinking. - */ - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } + + rcu_read_lock(); + shrinker_put(shrinker); } - up_read(&shrinker_rwsem); -out: + rcu_read_unlock(); cond_resched(); return freed; } @@ -676,13 +694,29 @@ void shrinker_register(struct shrinker *shrinker) } down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); + list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); + + init_completion(&shrinker->done); + /* + * Now the shrinker is fully set up, take the first reference to it to + * indicate that lookup operations are now allowed to use it via + * shrinker_try_get(). + */ + refcount_set(&shrinker->refcount, 1); } EXPORT_SYMBOL_GPL(shrinker_register); +static void shrinker_free_rcu_cb(struct rcu_head *head) +{ + struct shrinker *shrinker = container_of(head, struct shrinker, rcu); + + kfree(shrinker->nr_deferred); + kfree(shrinker); +} + void shrinker_free(struct shrinker *shrinker) { struct dentry *debugfs_entry = NULL; @@ -691,9 +725,25 @@ void shrinker_free(struct shrinker *shrinker) if (!shrinker) return; + if (shrinker->flags & SHRINKER_REGISTERED) { + /* drop the initial refcount */ + shrinker_put(shrinker); + /* + * Wait for all lookups of the shrinker to complete, after that, + * no shrinker is running or will run again, then we can safely + * free it asynchronously via RCU and safely free the structure + * where the shrinker is located, such as super_block etc. + */ + wait_for_completion(&shrinker->done); + } + down_write(&shrinker_rwsem); if (shrinker->flags & SHRINKER_REGISTERED) { - list_del(&shrinker->list); + /* + * Now we can safely remove it from the shrinker_list and then + * free it. + */ + list_del_rcu(&shrinker->list); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); shrinker->flags &= ~SHRINKER_REGISTERED; } @@ -707,9 +757,6 @@ void shrinker_free(struct shrinker *shrinker) if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; - - kfree(shrinker); + call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); } EXPORT_SYMBOL_GPL(shrinker_free); -- Gitee From 1bb4f9d3f3591e9700eed350e6eea589559cbd4f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:42 +0800 Subject: [PATCH 082/484] mm: shrinker: make memcg slab shrink lockless commit 50d09da8e1199092a128eebd8a986d1fb0335fe4 upstream Conflicts: none Backport-reason: Shrinker lockless Like global slab shrink, this commit also uses refcount+RCU method to make memcg slab shrink lockless. 
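The memcg-aware walk in shrink_slab_memcg() below iterates per-unit bitmaps rather than one flat map, so each set bit has to be converted back into a global shrinker id before the idr lookup. A small standalone userspace model of that arithmetic, assuming BITS_PER_LONG is 64 and using a made-up id; the helpers mirror the ones added earlier in the series, only the walk over them changes in this patch:

```
#include <assert.h>
#include <stdio.h>

#define SHRINKER_UNIT_BITS 64	/* BITS_PER_LONG on a 64-bit build */

static int shrinker_id_to_index(int id)  { return id / SHRINKER_UNIT_BITS; }
static int shrinker_id_to_offset(int id) { return id % SHRINKER_UNIT_BITS; }
static int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}

int main(void)
{
	int id = 130;				/* hypothetical shrinker id */
	int index = shrinker_id_to_index(id);	/* unit 2 */
	int offset = shrinker_id_to_offset(id);	/* bit 2 of that unit's map */

	assert(calc_shrinker_id(index, offset) == id);
	printf("id %d -> unit %d, bit %d\n", id, index, offset);
	return 0;
}
```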
Use the following script to do slab shrink stress test: ``` DIR="/root/shrinker/memcg/mnt" do_create() { mkdir -p /sys/fs/cgroup/memory/test echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes for i in `seq 0 $1`; do mkdir -p /sys/fs/cgroup/memory/test/$i; echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; mkdir -p $DIR/$i; done } do_mount() { for i in `seq $1 $2`; do mount -t tmpfs $i $DIR/$i; done } do_touch() { for i in `seq $1 $2`; do echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & done } case "$1" in touch) do_touch $2 $3 ;; test) do_create 4000 do_mount 0 4000 do_touch 0 3000 ;; *) exit 1 ;; esac ``` Save the above script, then run test and touch commands. Then we can use the following perf command to view hotspots: perf top -U -F 999 1) Before applying this patchset: 33.15% [kernel] [k] down_read_trylock 25.38% [kernel] [k] shrink_slab 21.75% [kernel] [k] up_read 4.45% [kernel] [k] _find_next_bit 2.27% [kernel] [k] do_shrink_slab 1.80% [kernel] [k] intel_idle_irq 1.79% [kernel] [k] shrink_lruvec 0.67% [kernel] [k] xas_descend 0.41% [kernel] [k] mem_cgroup_iter 0.40% [kernel] [k] shrink_node 0.38% [kernel] [k] list_lru_count_one 2) After applying this patchset: 64.56% [kernel] [k] shrink_slab 12.18% [kernel] [k] do_shrink_slab 3.30% [kernel] [k] __rcu_read_unlock 2.61% [kernel] [k] shrink_lruvec 2.49% [kernel] [k] __rcu_read_lock 1.93% [kernel] [k] intel_idle_irq 0.89% [kernel] [k] shrink_node 0.81% [kernel] [k] mem_cgroup_iter 0.77% [kernel] [k] mem_cgroup_calculate_protection 0.66% [kernel] [k] list_lru_count_one We can see that the first perf hotspot becomes shrink_slab, which is what we expect. Link: https://lkml.kernel.org/r/20230911094444.68966-44-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shrinker.c | 85 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index 9bb2e61092b3..d1b34a79c7a7 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -219,7 +219,6 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) return -ENOSYS; down_write(&shrinker_rwsem); - /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -253,10 +252,15 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; struct shrinker_info_unit *unit; + long nr_deferred; - info = shrinker_info_protected(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; - return atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); + nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); + rcu_read_unlock(); + + return nr_deferred; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, @@ -264,10 +268,16 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; struct shrinker_info_unit *unit; + long nr_deferred; - info = shrinker_info_protected(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; - return atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); + nr_deferred = + atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); + rcu_read_unlock(); + + return nr_deferred; } void reparent_shrinker_deferred(struct mem_cgroup *memcg) @@ -464,18 +474,54 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!mem_cgroup_online(memcg)) return 0; - if (!down_read_trylock(&shrinker_rwsem)) - return 0; - - info = shrinker_info_protected(memcg, nid); + /* + * lockless algorithm of memcg shrink. + * + * The shrinker_info may be freed asynchronously via RCU in the + * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used + * to ensure the existence of the shrinker_info. + * + * The shrinker_info_unit is never freed unless its corresponding memcg + * is destroyed. Here we already hold the refcount of memcg, so the + * memcg will not be destroyed, and of course shrinker_info_unit will + * not be freed. + * + * So in the memcg shrink: + * step 1: use rcu_read_lock() to guarantee existence of the + * shrinker_info. + * step 2: after getting shrinker_info_unit we can safely release the + * RCU lock. + * step 3: traverse the bitmap and calculate shrinker_id + * step 4: use rcu_read_lock() to guarantee existence of the shrinker. + * step 5: use shrinker_id to find the shrinker, then use + * shrinker_try_get() to guarantee existence of the shrinker, + * then we can release the RCU lock to do do_shrink_slab() that + * may sleep. 
+ * step 6: do shrinker_put() paired with step 5 to put the refcount, + * if the refcount reaches 0, then wake up the waiter in + * shrinker_free() by calling complete(). + * Note: here is different from the global shrink, we don't + * need to acquire the RCU lock to guarantee existence of + * the shrinker, because we don't need to use this + * shrinker to traverse the next shrinker in the bitmap. + * step 7: we have already exited the read-side of rcu critical section + * before calling do_shrink_slab(), the shrinker_info may be + * released in expand_one_shrinker_info(), so go back to step 1 + * to reacquire the shrinker_info. + */ +again: + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (unlikely(!info)) goto unlock; - for (; index < shrinker_id_to_index(info->map_nr_max); index++) { + if (index < shrinker_id_to_index(info->map_nr_max)) { struct shrinker_info_unit *unit; unit = info->unit[index]; + rcu_read_unlock(); + for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { struct shrink_control sc = { .gfp_mask = gfp_mask, @@ -485,12 +531,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker; int shrinker_id = calc_shrinker_id(index, offset); + rcu_read_lock(); shrinker = idr_find(&shrinker_idr, shrinker_id); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(offset, unit->map); + if (unlikely(!shrinker || !shrinker_try_get(shrinker))) { + clear_bit(offset, unit->map); + rcu_read_unlock(); continue; } + rcu_read_unlock(); /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_online() && @@ -523,15 +571,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, set_shrinker_bit(memcg, nid, shrinker_id); } freed += ret; - - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - goto unlock; - } + shrinker_put(shrinker); } + + index++; + goto again; } unlock: - up_read(&shrinker_rwsem); + rcu_read_unlock(); return freed; } #else /* !CONFIG_MEMCG */ -- Gitee From 1d1019129e982a302bc26a3e378fdb3a790c748f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:43 +0800 Subject: [PATCH 083/484] mm: shrinker: hold write lock to reparent shrinker nr_deferred commit 604b8b655020816b66ae474681d1100c5c0ba8c4 upstream Conflicts: none Backport-reason: Shrinker lockless For now, reparent_shrinker_deferred() is the only holder of read lock of shrinker_rwsem. And it already holds the global cgroup_mutex, so it will not be called in parallel. Therefore, in order to convert shrinker_rwsem to shrinker_mutex later, here we change to hold the write lock of shrinker_rwsem to reparent. Link: https://lkml.kernel.org/r/20230911094444.68966-45-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shrinker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index d1b34a79c7a7..ef22f20a97cb 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -293,7 +293,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_read(&shrinker_rwsem); + down_write(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -306,7 +306,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) } } } - up_read(&shrinker_rwsem); + up_write(&shrinker_rwsem); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) -- Gitee From 031a52785cc5e2a9d3b106cb1d3dc9dde3901226 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:44 +0800 Subject: [PATCH 084/484] mm: shrinker: convert shrinker_rwsem to mutex commit 8a0e8bb112af28362514624fc46e20426b4bdf1f upstream Conflicts: none Backport-reason: Shrinker lockless Now there are no readers of shrinker_rwsem, so we can simply replace it with mutex lock. [akpm@linux-foundation.org: update the fix to alloc_shrinker_info()] Link: https://lkml.kernel.org/r/20230911094444.68966-46-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/md/dm-cache-metadata.c | 2 +- fs/super.c | 2 +- mm/shrinker.c | 30 +++++++++++++++--------------- mm/shrinker_debug.c | 14 +++++++------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index acffed750e3e..9e0c69958587 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_rwsem without holding cmd->root_lock + * - must take shrinker_mutex without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/fs/super.c b/fs/super.c index 077ad7f5d6f2..0dba89356c64 100644 --- a/fs/super.c +++ b/fs/super.c @@ -178,7 +178,7 @@ static void super_wake(struct super_block *sb, unsigned int flag) * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. 
*/ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/mm/shrinker.c b/mm/shrinker.c index ef22f20a97cb..dd91eab43ed3 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -8,7 +8,7 @@ #include "internal.h" LIST_HEAD(shrinker_list); -DECLARE_RWSEM(shrinker_rwsem); +DEFINE_MUTEX(shrinker_mutex); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -80,7 +80,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, ret = 0; int array_size = 0; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); @@ -91,12 +91,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) goto err; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; err: - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); free_shrinker_info(memcg); return -ENOMEM; } @@ -105,7 +105,7 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); + lockdep_is_held(&shrinker_mutex)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, @@ -155,7 +155,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); new_size = shrinker_unit_size(new_nr_max); old_size = shrinker_unit_size(shrinker_nr_max); @@ -218,7 +218,7 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -232,7 +232,7 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; } @@ -242,7 +242,7 @@ static void shrinker_memcg_remove(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); idr_remove(&shrinker_idr, id); } @@ -293,7 +293,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -306,7 +306,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) } } } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) @@ -740,11 +740,11 @@ void shrinker_register(struct shrinker *shrinker) return; } - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); init_completion(&shrinker->done); /* @@ -784,7 +784,7 @@ void shrinker_free(struct shrinker *shrinker) wait_for_completion(&shrinker->done); } - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); if (shrinker->flags & SHRINKER_REGISTERED) { /* * Now we can safely remove it from the shrinker_list and then @@ -799,7 +799,7 @@ void shrinker_free(struct shrinker *shrinker) if (shrinker->flags & 
SHRINKER_MEMCG_AWARE) shrinker_memcg_remove(shrinker); - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 24aebe7c24cc..12ea5486a3e9 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -9,7 +9,7 @@ #include "internal.h" /* defined in vmscan.c */ -extern struct rw_semaphore shrinker_rwsem; +extern struct mutex shrinker_mutex; extern struct list_head shrinker_list; static DEFINE_IDA(shrinker_debugfs_ida); @@ -165,7 +165,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -208,7 +208,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); old = shrinker->name; shrinker->name = new; @@ -226,7 +226,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) shrinker->debugfs_entry = entry; } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); kfree_const(old); @@ -239,7 +239,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); *debugfs_id = entry ? shrinker->debugfs_id : -1; shrinker->debugfs_entry = NULL; @@ -265,14 +265,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; } -- Gitee From 0bed495a5962943e5397207bd551b839a72060ff Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:18 -0800 Subject: [PATCH 085/484] list_lru: allow explicit memcg and NUMA node selection commit 0a97c01cd20bb96359d8c9dedad92a061ed34e0b upstream Conflicts: none Backport-reason: ZSWAP updates to be consistent with SWAP Patch series "workload-specific and memory pressure-driven zswap writeback", v8. There are currently several issues with zswap writeback: 1. There is only a single global LRU for zswap, making it impossible to perform worload-specific shrinking - an memcg under memory pressure cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u But this solution leaves a lot to be desired, as we still do not have an avenue for an memcg to free up its own memory locked up in the zswap pool. 2. We only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). 
This patch series solves these issues by separating the global zswap LRU into per-memcg and per-NUMA LRUs, and performs workload-specific (i.e memcg- and NUMA-aware) zswap writeback under memory pressure. The new shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improved the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds. This patch (of 6): The interface of list_lru is based on the assumption that the list node and the data it represents belong to the same allocated on the correct node/memcg. While this assumption is valid for existing slab objects LRU such as dentries and inodes, it is undocumented, and rather inflexible for certain potential list_lru users (such as the upcoming zswap shrinker and the THP shrinker). It has caused us a lot of issues during our development. This patch changes list_lru interface so that the caller must explicitly specify numa node and memcg when adding and removing objects. The old list_lru_add() and list_lru_del() are renamed to list_lru_add_obj() and list_lru_del_obj(), respectively. It also extends the list_lru API with a new function, list_lru_putback, which undoes a previous list_lru_isolate call. Unlike list_lru_add, it does not increment the LRU node count (as list_lru_isolate does not decrement the node count). list_lru_putback also allows for explicit memcg and NUMA node selection. Link: https://lkml.kernel.org/r/20231130194023.4102148-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- drivers/android/binder_alloc.c | 7 ++--- fs/dcache.c | 8 +++-- fs/gfs2/quota.c | 6 ++-- fs/inode.c | 4 +-- fs/nfs/nfs42xattr.c | 8 ++--- fs/nfsd/filecache.c | 4 +-- fs/xfs/xfs_buf.c | 6 ++-- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_qm.c | 2 +- include/linux/list_lru.h | 54 ++++++++++++++++++++++++++++++++-- mm/list_lru.c | 48 +++++++++++++++++++++++++----- mm/workingset.c | 4 +-- 12 files changed, 117 insertions(+), 36 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index a56cbfd9ba44..5a84112f1a77 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -234,7 +234,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, if (page->page_ptr) { trace_binder_alloc_lru_start(alloc, index); - on_lru = list_lru_del(&binder_alloc_lru, &page->lru); + on_lru = list_lru_del_obj(&binder_alloc_lru, &page->lru); WARN_ON(!on_lru); trace_binder_alloc_lru_end(alloc, index); @@ -285,7 +285,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, trace_binder_free_lru_start(alloc, index); - ret = list_lru_add(&binder_alloc_lru, &page->lru); + ret = list_lru_add_obj(&binder_alloc_lru, &page->lru); WARN_ON(!ret); trace_binder_free_lru_end(alloc, index); @@ -846,7 +846,7 @@ void binder_alloc_deferred_release(struct 
binder_alloc *alloc) if (!alloc->pages[i].page_ptr) continue; - on_lru = list_lru_del(&binder_alloc_lru, + on_lru = list_lru_del_obj(&binder_alloc_lru, &alloc->pages[i].lru); page_addr = alloc->buffer + i * PAGE_SIZE; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, @@ -1289,4 +1289,3 @@ int binder_alloc_copy_from_buffer(struct binder_alloc *alloc, return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset, dest, bytes); } - diff --git a/fs/dcache.c b/fs/dcache.c index 4030c010a768..5088c09ccb84 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -432,7 +432,8 @@ static void d_lru_add(struct dentry *dentry) this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); - WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); + WARN_ON_ONCE(!list_lru_add_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) @@ -442,7 +443,8 @@ static void d_lru_del(struct dentry *dentry) this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); - WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); + WARN_ON_ONCE(!list_lru_del_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) @@ -1245,7 +1247,7 @@ static enum lru_status dentry_lru_isolate(struct list_head *item, * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like - * list_lru_add and list_lru_del. List movement in this file + * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ae317d1951e7..178d59505d4a 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -268,7 +268,7 @@ static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, if (qd->qd_sbd != sdp) continue; if (lockref_get_not_dead(&qd->qd_lockref)) { - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); return qd; } } @@ -341,7 +341,7 @@ static void qd_put(struct gfs2_quota_data *qd) } qd->qd_lockref.count = 0; - list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru); spin_unlock(&qd->qd_lockref.lock); } @@ -1505,7 +1505,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) lockref_mark_dead(&qd->qd_lockref); spin_unlock(&qd->qd_lockref.lock); - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); list_add(&qd->qd_lru, &dispose); } spin_unlock(&qd_lock); diff --git a/fs/inode.c b/fs/inode.c index d577211e5fb3..60c95f26d814 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -464,7 +464,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (!mapping_shrinkable(&inode->i_data)) return; - if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode->i_state |= I_REFERENCED; @@ -482,7 +482,7 @@ void inode_add_lru(struct inode *inode) static void inode_lru_list_del(struct inode *inode) { - if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 2ad66a8922f4..49aaf28a6950 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -132,7 +132,7 @@ 
nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_add(lru, &entry->lru); + return list_lru_add_obj(lru, &entry->lru); } static bool @@ -143,7 +143,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_del(lru, &entry->lru); + return list_lru_del_obj(lru, &entry->lru); } /* @@ -349,7 +349,7 @@ nfs4_xattr_cache_unlink(struct inode *inode) oldcache = nfsi->xattr_cache; if (oldcache != NULL) { - list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru); + list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru); oldcache->inode = NULL; } nfsi->xattr_cache = NULL; @@ -474,7 +474,7 @@ nfs4_xattr_get_cache(struct inode *inode, int add) kref_get(&cache->ref); nfsi->xattr_cache = cache; cache->inode = inode; - list_lru_add(&nfs4_xattr_cache_lru, &cache->lru); + list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru); } spin_unlock(&inode->i_lock); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2419cf330210..5ff082ef6ce7 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -323,7 +323,7 @@ nfsd_file_check_writeback(struct nfsd_file *nf) static bool nfsd_file_lru_add(struct nfsd_file *nf) { set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) { + if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) { trace_nfsd_file_lru_add(nf); return true; } @@ -332,7 +332,7 @@ static bool nfsd_file_lru_add(struct nfsd_file *nf) static bool nfsd_file_lru_remove(struct nfsd_file *nf) { - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) { + if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) { trace_nfsd_file_lru_del(nf); return true; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 541dc4ae4476..287d772479b0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -169,7 +169,7 @@ xfs_buf_stale( atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && - (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) atomic_dec(&bp->b_hold); ASSERT(atomic_read(&bp->b_hold) >= 1); @@ -1047,7 +1047,7 @@ xfs_buf_rele( * buffer for the LRU and clear the (now stale) dispose list * state flag */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { bp->b_state &= ~XFS_BSTATE_DISPOSE; atomic_inc(&bp->b_hold); } @@ -1060,7 +1060,7 @@ xfs_buf_rele( * was on was the disposal list */ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); } else { ASSERT(list_empty(&bp->b_lru)); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9b67f05d92a1..deb5f2721f72 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1064,7 +1064,7 @@ xfs_qm_dqput( struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; trace_xfs_dqput_free(dqp); - if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 94a7932ac570..67d0a8564ff3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -171,7 +171,7 @@ xfs_qm_dqpurge( * hits zero, so it really should be on the freelist here. 
*/ ASSERT(!list_empty(&dqp->q_lru)); - list_lru_del(&qi->qi_lru, &dqp->q_lru); + list_lru_del_obj(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index db86ad78d428..7675a48a0701 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -75,6 +75,8 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * list_lru_add: add an element to the lru list's tail * @lru: the lru pointer * @item: the item to be added. + * @nid: the node id of the sublist to add the item to. + * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing * nothing. Therefore the caller does not need to keep state about whether or @@ -87,12 +89,28 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * * Return: true if the list was updated, false otherwise */ -bool list_lru_add(struct list_lru *lru, struct list_head *item); +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); /** - * list_lru_del: delete an element to the lru list + * list_lru_add_obj: add an element to the lru list's tail + * @lru: the lru pointer + * @item: the item to be added. + * + * This function is similar to list_lru_add(), but the NUMA node and the + * memcg of the sublist is determined by @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise + */ +bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); + +/** + * list_lru_del: delete an element from the lru list * @lru: the lru pointer * @item: the item to be deleted. + * @nid: the node id of the sublist to delete the item from. + * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to @@ -100,7 +118,21 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); * * Return: true if the list was updated, false otherwise */ -bool list_lru_del(struct list_lru *lru, struct list_head *item); +bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); + +/** + * list_lru_del_obj: delete an element from the lru list + * @lru: the lru pointer + * @item: the item to be deleted. + * + * This function is similar to list_lru_del(), but the NUMA node and the + * memcg of the sublist is determined by @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise. + */ +bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru @@ -138,6 +170,22 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); +/** + * list_lru_putback: undo list_lru_isolate + * @lru: the lru pointer. + * @item: the item to put back. + * @nid: the node id of the sublist to put the item back to. + * @memcg: the cgroup of the sublist to put the item back to. + * + * Put back an isolated item into its original LRU. 
Note that unlike + * list_lru_add, this does not increment the node LRU count (as + * list_lru_isolate does not originally decrement this count). + * + * Since we might have dropped the LRU lock in between, recompute list_lru_one + * from the node's id and memcg. + */ +void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); diff --git a/mm/list_lru.c b/mm/list_lru.c index a05e5bef3b40..fcca67ac26ec 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -116,21 +116,19 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, } #endif /* CONFIG_MEMCG_KMEM */ -bool list_lru_add(struct list_lru *lru, struct list_head *item) +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) { - int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; - struct mem_cgroup *memcg; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(lru, nid, item, &memcg); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) - set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); + set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -140,15 +138,25 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_add); -bool list_lru_del(struct list_lru *lru, struct list_head *item) +bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); + struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? + mem_cgroup_from_slab_obj(item) : NULL; + + return list_lru_add(lru, item, nid, memcg); +} +EXPORT_SYMBOL_GPL(list_lru_add_obj); + +bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_kmem(lru, nid, item, NULL); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_del_init(item); l->nr_items--; nlru->nr_items--; @@ -160,6 +168,16 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); +bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? 
+ mem_cgroup_from_slab_obj(item) : NULL; + + return list_lru_del(lru, item, nid, memcg); +} +EXPORT_SYMBOL_GPL(list_lru_del_obj); + void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); @@ -175,6 +193,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); +void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ + struct list_lru_one *list = + list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + + if (list_empty(item)) { + list_add_tail(item, &list->list); + if (!list->nr_items++) + set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); + } +} +EXPORT_SYMBOL_GPL(list_lru_putback); + unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { diff --git a/mm/workingset.c b/mm/workingset.c index 4a5e6054750e..3e9044e448e7 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -774,12 +774,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { - list_lru_add(&shadow_nodes, &node->private_list); + list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { - list_lru_del(&shadow_nodes, &node->private_list); + list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } -- Gitee From ad074d31d478e6dc02952957aae224436cdc9711 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:19 -0800 Subject: [PATCH 086/484] memcontrol: implement mem_cgroup_tryget_online() commit fdc4161ff6a5e96222e159c1f1b28d31a985130d upstream Conflicts: none Backport-reason: ZSWAP updates to be consistent with SWAP This patch implements a helper function that try to get a reference to an memcg's css, as well as checking if it is online. This new function is almost exactly the same as the existing mem_cgroup_tryget(), except for the onlineness check. In the !CONFIG_MEMCG case, it always returns true, analogous to mem_cgroup_tryget(). This is useful for e.g to the new zswap writeback scheme, where we need to select the next online memcg as a candidate for the global limit reclaim. 
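A hedged usage sketch, not taken from the zswap code itself: the helper only takes a css reference when the cgroup is still online, so a candidate that already went offline can simply be skipped. reclaim_from_memcg() is a hypothetical placeholder for the per-memcg work.

```
	if (mem_cgroup_tryget_online(memcg)) {
		/* the reference keeps memcg from being freed while we use it */
		reclaim_from_memcg(memcg);	/* hypothetical per-memcg reclaim step */
		mem_cgroup_put(memcg);		/* drop the reference taken above */
	} else {
		/* cgroup already went offline: move on to the next candidate */
	}
```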
Link: https://lkml.kernel.org/r/20231130194023.4102148-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Reviewed-by: Yosry Ahmed Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 425b6793ac7b..cea626da9bc2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -979,6 +979,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return !memcg || css_tryget(&memcg->css); } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget_online(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1506,6 +1511,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return true; } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } -- Gitee From 814c4e588c41e9150b5acd39eb349f7b1c9f3f56 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:20 -0800 Subject: [PATCH 087/484] zswap: make shrinking memcg-aware commit a65b0e7607ccb5e5184591f73e48512f25c76061 upstream Conflicts: minor, out writeback might fail, and alraedy backported 78524b05f1a3 Backport-reason: ZSWAP updates to be consistent with SWAP Currently, we only have a single global LRU for zswap. This makes it impossible to perform worload-specific shrinking - an memcg cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u This patch fully resolves the issue by replacing the global zswap LRU with memcg- and NUMA-specific LRUs, and modify the reclaim logic: a) When a store attempt hits an memcg limit, it now triggers a synchronous reclaim attempt that, if successful, allows the new hotter page to be accepted by zswap. b) If the store attempt instead hits the global zswap limit, it will trigger an asynchronous reclaim attempt, in which an memcg is selected for reclaim in a round-robin-like fashion. 
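The per-memcg, per-NUMA LRUs build on the list_lru interface extended earlier in the series, where the caller passes the node id and memcg explicitly. A hedged sketch of how an entry might be queued that way; the field and helper names here are illustrative rather than the exact code in the hunks below:

```
static void lru_add_entry(struct list_lru *lru, struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);		/* illustrative: node the entry lives on */
	struct mem_cgroup *memcg = NULL;

	if (entry->objcg)			/* the entry may be uncharged */
		memcg = get_mem_cgroup_from_objcg(entry->objcg);

	/* a NULL memcg selects the plain per-node sublist */
	list_lru_add(lru, &entry->lru, nid, memcg);
	mem_cgroup_put(memcg);			/* no-op when memcg is NULL */
}
```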
[nphamcs@gmail.com: use correct function for the onlineness check, use mem_cgroup_iter_break()] Link: https://lkml.kernel.org/r/20231205195419.2563217-1-nphamcs@gmail.com [nphamcs@gmail.com: drop the pool's reference at the end of the writeback step] Link: https://lkml.kernel.org/r/20231206030627.4155634-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-4-nphamcs@gmail.com Signed-off-by: Domenico Cerasuolo Co-developed-by: Nhat Pham Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 5 + include/linux/zswap.h | 2 + mm/memcontrol.c | 2 + mm/swap.h | 3 +- mm/swap_state.c | 24 +++- mm/zswap.c | 269 +++++++++++++++++++++++++++++-------- 6 files changed, 245 insertions(+), 60 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cea626da9bc2..b9cd40d162ac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1369,6 +1369,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + return NULL; +} + static inline bool folio_memcg_kmem(struct folio *folio) { return false; diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a60ce39cfde..e571e393669b 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); #else @@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio) static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} +static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f0183e317377..68e02b3bce3c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7467,6 +7467,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_async_low(&memcg->memory, PAGE_COUNTER_MAX); + zswap_memcg_offline_cleanup(memcg); + memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); diff --git a/mm/swap.h b/mm/swap.h index 2d2662815abb..8d9d8f472ad3 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated); + bool *new_page_allocated, + bool skip_if_exists); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, diff --git a/mm/swap_state.c b/mm/swap_state.c index 0e74567d60aa..1ab30de75310 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -363,7 +363,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool 
*new_page_allocated) + bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si = swp_swap_info(entry); struct folio *folio; @@ -413,6 +414,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (err != -EEXIST) goto fail_put_swap; + /* + * Protect against a recursive call to __read_swap_cache_async() + * on the same entry waiting forever here because SWAP_HAS_CACHE + * is set but the folio is not the swap cache yet. This can + * happen today if mem_cgroup_swapin_charge_folio() below + * triggers reclaim through zswap, which may call + * __read_swap_cache_async() in the writeback path. + */ + if (skip_if_exists) + goto fail_put_swap; + /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE @@ -483,7 +495,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol = get_vma_policy(vma, addr, 0, &ilx); page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) @@ -603,7 +615,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( swp_entry(swp_type(entry), offset), - gfp_mask, mpol, ilx, &page_allocated); + gfp_mask, mpol, ilx, &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -621,7 +633,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; @@ -781,7 +793,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pte_unmap(pte); pte = NULL; page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -801,7 +813,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; diff --git a/mm/zswap.c b/mm/zswap.c index 5cecf3689c70..ec5476597187 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "swap.h" #include "internal.h" @@ -172,8 +173,8 @@ struct zswap_pool { struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; - struct list_head lru; - spinlock_t lru_lock; + struct list_lru list_lru; + struct mem_cgroup *next_shrink; }; /* @@ -289,15 +290,46 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } +/* should be called under RCU */ +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return entry->objcg ? 
obj_cgroup_memcg(entry->objcg) : NULL; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return NULL; +} +#endif + +static inline int entry_to_nid(struct zswap_entry *entry) +{ + return page_to_nid(virt_to_page(entry)); +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + + /* lock out zswap pools list modification */ + spin_lock(&zswap_pools_lock); + list_for_each_entry(pool, &zswap_pools, list) { + if (pool->next_shrink == memcg) + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + } + spin_unlock(&zswap_pools_lock); +} + /********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) { struct zswap_entry *entry; - entry = kmem_cache_alloc(zswap_entry_cache, gfp); + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; entry->refcount = 1; @@ -310,6 +342,61 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* lru functions +**********************************/ +static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + + /* + * Note that it is safe to use rcu_read_lock() here, even in the face of + * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection + * used in list_lru lookup, only two scenarios are possible: + * + * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * new entry will be reparented to memcg's parent's list_lru. + * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * new entry will be added directly to memcg's parent's list_lru. + * + * Similar reasoning holds for list_lru_del() and list_lru_putback(). 
+ */ + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_add(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); +} + +static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_del(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); +} + +static void zswap_lru_putback(struct list_lru *list_lru, + struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + spinlock_t *lock = &list_lru->node[nid].lock; + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + spin_lock(lock); + /* we cannot use list_lru_add here, because it increments node's lru count */ + list_lru_putback(list_lru, &entry->lru, nid, memcg); + spin_unlock(lock); + rcu_read_unlock(); +} + /********************************* * rbtree functions **********************************/ @@ -394,9 +481,7 @@ static void zswap_free_entry(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - spin_lock(&entry->pool->lru_lock); - list_del(&entry->lru); - spin_unlock(&entry->pool->lru_lock); + zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); zswap_pool_put(entry->pool); } @@ -630,21 +715,15 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, zswap_entry_put(tree, entry); } -static int zswap_reclaim_entry(struct zswap_pool *pool) +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg) { - struct zswap_entry *entry; + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); struct zswap_tree *tree; pgoff_t swpoffset; - int ret; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; - /* Get an entry off the LRU */ - spin_lock(&pool->lru_lock); - if (list_empty(&pool->lru)) { - spin_unlock(&pool->lru_lock); - return -EINVAL; - } - entry = list_last_entry(&pool->lru, struct zswap_entry, lru); - list_del_init(&entry->lru); /* * Once the lru lock is dropped, the entry might get freed. The * swpoffset is copied to the stack, and entry isn't deref'd again @@ -652,28 +731,32 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) */ swpoffset = swp_offset(entry->swpentry); tree = zswap_trees[swp_type(entry->swpentry)]; - spin_unlock(&pool->lru_lock); + list_lru_isolate(l, item); + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. 
+ */ + spin_unlock(lock); /* Check for invalidate() race */ spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { - ret = -EAGAIN; + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) goto unlock; - } + /* Hold a reference to prevent a free during writeback */ zswap_entry_get(entry); spin_unlock(&tree->lock); - ret = zswap_writeback_entry(entry, tree); + writeback_result = zswap_writeback_entry(entry, tree); spin_lock(&tree->lock); - if (ret) { - /* Writeback failed, put entry back on LRU */ - spin_lock(&pool->lru_lock); - list_move(&entry->lru, &pool->lru); - spin_unlock(&pool->lru_lock); + if (writeback_result) { + zswap_reject_reclaim_fail++; + zswap_lru_putback(&entry->pool->list_lru, entry); + ret = LRU_RETRY; goto put_unlock; } + zswap_written_back_pages++; /* * Writeback started successfully, the page now belongs to the @@ -687,24 +770,91 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) zswap_entry_put(tree, entry); unlock: spin_unlock(&tree->lock); - return ret ? -EAGAIN : 0; + spin_lock(lock); + return ret; +} + +static int shrink_memcg(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + int nid, shrunk = 0; + + /* + * Skip zombies because their LRUs are reparented and we would be + * reclaiming from the parent instead of the dead memcg. + */ + if (memcg && !mem_cgroup_online(memcg)) + return -ENOENT; + + pool = zswap_pool_current_get(); + if (!pool) + return -EINVAL; + + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long nr_to_walk = 1; + + shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, + &shrink_memcg_cb, NULL, &nr_to_walk); + } + zswap_pool_put(pool); + return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work); + struct mem_cgroup *memcg; int ret, failures = 0; + /* global reclaim will select cgroup in a round-robin fashion. */ do { - ret = zswap_reclaim_entry(pool); - if (ret) { - zswap_reject_reclaim_fail++; - if (ret != -EAGAIN) + spin_lock(&zswap_pools_lock); + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + memcg = pool->next_shrink; + + /* + * We need to retry if we have gone through a full round trip, or if we + * got an offline memcg (or else we risk undoing the effect of the + * zswap memcg offlining cleanup callback). This is not catastrophic + * per se, but it will keep the now offlined memcg hostage for a while. + * + * Note that if we got an online memcg, we will keep the extra + * reference in case the original reference obtained by mem_cgroup_iter + * is dropped by the zswap memcg offlining callback, ensuring that the + * memcg is not killed when we are reclaiming. 
+ */ + if (!memcg) { + spin_unlock(&zswap_pools_lock); + if (++failures == MAX_RECLAIM_RETRIES) break; + + goto resched; + } + + if (!mem_cgroup_tryget_online(memcg)) { + /* drop the reference from mem_cgroup_iter() */ + mem_cgroup_iter_break(NULL, memcg); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + if (++failures == MAX_RECLAIM_RETRIES) break; + + goto resched; } + spin_unlock(&zswap_pools_lock); + + ret = shrink_memcg(memcg); + /* drop the extra reference */ + mem_cgroup_put(memcg); + + if (ret == -EINVAL) + break; + if (ret && ++failures == MAX_RECLAIM_RETRIES) + break; + +resched: cond_resched(); } while (!zswap_can_accept()); zswap_pool_put(pool); @@ -765,8 +915,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - INIT_LIST_HEAD(&pool->lru); - spin_lock_init(&pool->lru_lock); + list_lru_init_memcg(&pool->list_lru, NULL); INIT_WORK(&pool->shrink_work, shrink_worker); zswap_pool_debug("created", pool); @@ -832,6 +981,13 @@ static void zswap_pool_destroy(struct zswap_pool *pool) cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); + list_lru_destroy(&pool->list_lru); + + spin_lock(&zswap_pools_lock); + mem_cgroup_iter_break(NULL, pool->next_shrink); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); @@ -1084,7 +1240,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, mpol = get_task_policy(current); page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &page_was_allocated); + NO_INTERLEAVE_INDEX, &page_was_allocated, true); put_swap_device(si); if (!page) { ret = -ENOMEM; @@ -1154,9 +1310,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, ret = __swap_writepage(page, &wbc); put_page(page); - if (!ret) - zswap_written_back_pages++; - return ret; fail: @@ -1212,6 +1365,7 @@ bool zswap_store(struct folio *folio) struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; + struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; struct zpool *zpool; unsigned int dlen = PAGE_SIZE; @@ -1247,14 +1401,15 @@ bool zswap_store(struct folio *folio) if (!zswap_enabled) return false; - /* - * XXX: zswap reclaim does not work with cgroups yet. Without a - * cgroup-aware entry LRU, we will push out entries system-wide based on - * local cgroup limits. 
- */ objcg = get_obj_cgroup_from_folio(folio); - if (objcg && !obj_cgroup_may_zswap(objcg)) - goto reject; + if (objcg && !obj_cgroup_may_zswap(objcg)) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (shrink_memcg(memcg)) { + mem_cgroup_put(memcg); + goto reject; + } + mem_cgroup_put(memcg); + } /* reclaim space if needed */ if (zswap_is_full()) { @@ -1271,7 +1426,7 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL); + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; @@ -1298,6 +1453,15 @@ bool zswap_store(struct folio *folio) if (!entry->pool) goto freepage; + if (objcg) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + mem_cgroup_put(memcg); + goto put_pool; + } + mem_cgroup_put(memcg); + } + /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1374,9 +1538,8 @@ bool zswap_store(struct folio *folio) zswap_invalidate_entry(tree, dupentry); } if (entry->length) { - spin_lock(&entry->pool->lru_lock); - list_add(&entry->lru, &entry->pool->lru); - spin_unlock(&entry->pool->lru_lock); + INIT_LIST_HEAD(&entry->lru); + zswap_lru_add(&entry->pool->list_lru, entry); } spin_unlock(&tree->lock); @@ -1389,6 +1552,7 @@ bool zswap_store(struct folio *folio) put_dstmem: mutex_unlock(acomp_ctx->mutex); +put_pool: zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); @@ -1483,9 +1647,8 @@ bool zswap_load(struct folio *folio) zswap_invalidate_entry(tree, entry); folio_mark_dirty(folio); } else if (entry->length) { - spin_lock(&entry->pool->lru_lock); - list_move(&entry->lru, &entry->pool->lru); - spin_unlock(&entry->pool->lru_lock); + zswap_lru_del(&entry->pool->list_lru, entry); + zswap_lru_add(&entry->pool->list_lru, entry); } zswap_entry_put(tree, entry); spin_unlock(&tree->lock); -- Gitee From 010c802380baa1560ae579787641dd14be1ae332 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:21 -0800 Subject: [PATCH 088/484] mm: memcg: add per-memcg zswap writeback stat commit 7108cc3f765cafd48a6a35f8add140beaecfa75b upstream Conflicts: minor, due to moved memcg_vm_event_stat Backport-reason: ZSWAP updates to be consistent with SWAP Since zswap now writes back pages from memcg-specific LRUs, we now need a new stat to show writebacks count for each memcg. 
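The new count surfaces per cgroup as the "zswpwb" key in memory.stat (the selftest later in this series reads exactly that key via cg_read_key_long()). A minimal userspace reader, assuming cgroup v2 is mounted and the kernel is built with the ZSWAP/MEMCG options shown in the hunks below:

#include <stdio.h>
#include <string.h>

static long read_zswpwb(const char *cgroup_dir)
{
	char path[4096], key[64];
	long val;
	FILE *f;

	snprintf(path, sizeof(path), "%s/memory.stat", cgroup_dir);
	f = fopen(path, "r");
	if (!f)
		return -1;
	while (fscanf(f, "%63s %ld", key, &val) == 2) {
		if (!strcmp(key, "zswpwb")) {	/* per-memcg writeback count */
			fclose(f);
			return val;
		}
	}
	fclose(f);
	return -1;
}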
[nphamcs@gmail.com: rename ZSWP_WB to ZSWPWB] Link: https://lkml.kernel.org/r/20231205193307.2432803-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-5-nphamcs@gmail.com Suggested-by: Nhat Pham Signed-off-by: Domenico Cerasuolo Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Reviewed-by: Yosry Ahmed Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/vm_event_item.h | 1 + mm/memcontrol.c | 1 + mm/vmstat.c | 1 + mm/zswap.c | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 8abfa1240040..b61796a35d2b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -145,6 +145,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_ZSWAP ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 68e02b3bce3c..3f3031a92be7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -627,6 +627,7 @@ static const unsigned int memcg_vm_event_stat[] = { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, diff --git a/mm/vmstat.c b/mm/vmstat.c index c87301750612..2c736afc0325 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1401,6 +1401,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_ZSWAP "zswpin", "zswpout", + "zswpwb", #endif #ifdef CONFIG_X86 "direct_map_level2_splits", diff --git a/mm/zswap.c b/mm/zswap.c index ec5476597187..420607bd3987 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -758,6 +758,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o } zswap_written_back_pages++; + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + count_vm_event(ZSWPWB); /* * Writeback started successfully, the page now belongs to the * swapcache. Drop the entry from zswap - unless invalidate already -- Gitee From 4d34ac3e3d11d905b8cea9379290ad5730b74a29 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:22 -0800 Subject: [PATCH 089/484] selftests: cgroup: update per-memcg zswap writeback selftest commit a697dc2be925d4814f26d7588347ccdd2c5525ed upstream Conflicts: none Backport-reason: ZSWAP updates to be consistent with SWAP The memcg-zswap self test is updated to adjust to the behavior change implemented by commit 87730b165089 ("zswap: make shrinking memcg-aware"), where zswap performs writeback for specific memcg. 
Link: https://lkml.kernel.org/r/20231130194023.4102148-6-nphamcs@gmail.com Signed-off-by: Domenico Cerasuolo Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Acked-by: Chris Li (Google) Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- tools/testing/selftests/cgroup/test_zswap.c | 74 ++++++++++++++------- 1 file changed, 50 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 6927b4a06dee..41858d67cb75 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,9 +50,9 @@ static int get_zswap_stored_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/stored_pages", value); } -static int get_zswap_written_back_pages(size_t *value) +static int get_cg_wb_count(const char *cg) { - return read_int("/sys/kernel/debug/zswap/written_back_pages", value); + return cg_read_key_long(cg, "memory.stat", "zswp_wb"); } static int allocate_bytes(const char *cgroup, void *arg) @@ -68,45 +68,71 @@ static int allocate_bytes(const char *cgroup, void *arg) return 0; } +static char *setup_test_group_1M(const char *root, const char *name) +{ + char *group_name = cg_name(root, name); + + if (!group_name) + return NULL; + if (cg_create(group_name)) + goto fail; + if (cg_write(group_name, "memory.max", "1M")) { + cg_destroy(group_name); + goto fail; + } + return group_name; +fail: + free(group_name); + return NULL; +} + /* * When trying to store a memcg page in zswap, if the memcg hits its memory - * limit in zswap, writeback should not be triggered. - * - * This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may - * not zswap"). Needs to be revised when a per memcg writeback mechanism is - * implemented. + * limit in zswap, writeback should affect only the zswapped pages of that + * memcg. 
*/ static int test_no_invasive_cgroup_shrink(const char *root) { - size_t written_back_before, written_back_after; int ret = KSFT_FAIL; - char *test_group; + size_t control_allocation_size = MB(10); + char *control_allocation, *wb_group = NULL, *control_group = NULL; /* Set up */ - test_group = cg_name(root, "no_shrink_test"); - if (!test_group) - goto out; - if (cg_create(test_group)) + wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); + if (!wb_group) + return KSFT_FAIL; + if (cg_write(wb_group, "memory.zswap.max", "10K")) goto out; - if (cg_write(test_group, "memory.max", "1M")) + control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); + if (!control_group) goto out; - if (cg_write(test_group, "memory.zswap.max", "10K")) + + /* Push some test_group2 memory into zswap */ + if (cg_enter_current(control_group)) goto out; - if (get_zswap_written_back_pages(&written_back_before)) + control_allocation = malloc(control_allocation_size); + for (int i = 0; i < control_allocation_size; i += 4095) + control_allocation[i] = 'a'; + if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) goto out; - /* Allocate 10x memory.max to push memory into zswap */ - if (cg_run(test_group, allocate_bytes, (void *)MB(10))) + /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ + if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) goto out; - /* Verify that no writeback happened because of the memcg allocation */ - if (get_zswap_written_back_pages(&written_back_after)) - goto out; - if (written_back_after == written_back_before) + /* Verify that only zswapped memory from gwb_group has been written back */ + if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0) ret = KSFT_PASS; out: - cg_destroy(test_group); - free(test_group); + cg_enter_current(root); + if (control_group) { + cg_destroy(control_group); + free(control_group); + } + cg_destroy(wb_group); + free(wb_group); + if (control_allocation) + free(control_allocation); return ret; } -- Gitee From 09ad5782afa1c4831bf32d63012b5f79c996b37c Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 14:56:07 -0800 Subject: [PATCH 090/484] selftests: fix the zswap invasive shrink test commit 012688f6006c01655e250c1612b544561b7c50be upstream Conflicts: none Backport-reason: ZSWAP fixes The zswap no invasive shrink selftest breaks because we rename the zswap writeback counter (see [1]). Fix the test. 
[1]: https://patchwork.kernel.org/project/linux-kselftest/patch/20231205193307.2432803-1-nphamcs@gmail.com/ Link: https://lkml.kernel.org/r/20240205225608.3083251-3-nphamcs@gmail.com Fixes: a697dc2be925 ("selftests: cgroup: update per-memcg zswap writeback selftest") Signed-off-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Rik van Riel Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- tools/testing/selftests/cgroup/test_zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 41858d67cb75..6d57e9d10906 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -52,7 +52,7 @@ static int get_zswap_stored_pages(size_t *value) static int get_cg_wb_count(const char *cg) { - return cg_read_key_long(cg, "memory.stat", "zswp_wb"); + return cg_read_key_long(cg, "memory.stat", "zswpwb"); } static int allocate_bytes(const char *cgroup, void *arg) -- Gitee From 4fa1916abe3bda5dfa5fab44d5e43896a0b06d8b Mon Sep 17 00:00:00 2001 From: Lucy Mielke Date: Fri, 6 Oct 2023 22:30:51 +0200 Subject: [PATCH 091/484] mm: add printf attribute to shrinker_debugfs_name_alloc commit f04eba134e598d4a5af2eeecff0562df5042cbfc upstream Conflicts: none Backport-reason: Shrinker fixes This fixes a compiler warning when compiling an allyesconfig with W=1: mm/internal.h:1235:9: error: function might be a candidate for `gnu_printf' format attribute [-Werror=suggest-attribute=format] [akpm@linux-foundation.org: fix shrinker_alloc() as welll per Qi Zheng] Link: https://lkml.kernel.org/r/822387b7-4895-4e64-5806-0f56b5d6c447@bytedance.com Link: https://lkml.kernel.org/r/ZSBue-3kM6gI6jCr@mainframe Fixes: c42d50aefd17 ("mm: shrinker: add infrastructure for dynamically allocating shrinker") Signed-off-by: Lucy Mielke Cc: Qi Zheng Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shrinker.h | 1 + mm/internal.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 746e86918d24..b9ddbfd13b36 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -135,6 +135,7 @@ struct shrinker { */ #define SHRINKER_NONSLAB BIT(4) +__printf(2, 3) struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); diff --git a/mm/internal.h b/mm/internal.h index 54d02bc1c44b..d6d174353eaa 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1449,8 +1449,8 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG -static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, - const char *fmt, va_list ap) +static inline __printf(2, 0) int shrinker_debugfs_name_alloc( + struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); -- Gitee From 4774e62d47d5768631a4b2c5ef60991c6b674061 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:23 -0800 Subject: [PATCH 092/484] zswap: shrink zswap pool based on memory pressure commit b5ba474f3f518701249598b35c581b92a3c95b48 upstream Conflicts: mem_cgroup_flush_stat requires extra args due to backported 7d7ef0a4686a. 
Backport-reason: ZSWAP updates to be consistent with SWAP Currently, we only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch implements a memcg- and NUMA-aware shrinker for zswap, that is initiated when there is memory pressure. The shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. Furthermore, to make it more robust for many workloads and prevent overshrinking (i.e evicting warm pages that might be refaulted into memory), we build in the following heuristics: * Estimate the number of warm pages residing in zswap, and attempt to protect this region of the zswap LRU. * Scale the number of freeable objects by an estimate of the memory saving factor. The better zswap compresses the data, the fewer pages we will evict to swap (as we will otherwise incur IO for relatively small memory saving). * During reclaim, if the shrinker encounters a page that is also being brought into memory, the shrinker will cautiously terminate its shrinking action, as this is a sign that it is touching the warmer region of the zswap LRU. As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improved the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds. [nphamcs@gmail.com: check shrinker enablement early, use less costly stat flushing] Link: https://lkml.kernel.org/r/20231206194456.3234203-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-7-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Cc: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/zswap.rst | 10 ++ include/linux/mmzone.h | 3 +- include/linux/zswap.h | 25 +++- mm/Kconfig | 14 ++ mm/mmzone.c | 1 + mm/swap_state.c | 2 + mm/zswap.c | 192 ++++++++++++++++++++++++- 7 files changed, 240 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 45b98390e938..62fc244ec702 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,16 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +When there is a sizable amount of cold memory residing in the zswap pool, it +can be advantageous to proactively write these cold pages to swap and reclaim +the memory for other use cases. By default, the zswap shrinker is disabled. +User can enable it as follows: + + echo Y > /sys/module/zswap/parameters/shrinker_enabled + +This can be enabled at the boot time if ``CONFIG_ZSWAP_SHRINKER_DEFAULT_ON`` is +selected. 
+ A debugfs interface is provided for various statistic about pool size, number of pages stored, same-value filled pages and various counters for the reasons pages are rejected. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4ce4eb965f44..4ff96faa356a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -652,7 +653,7 @@ struct lruvec { #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif - + struct zswap_lruvec_state zswap_lruvec_state; #ifdef CONFIG_EMM_WORKINGSET_TRACKING /* Non-resident file age, driven by LRU movement */ atomic_long_t evicted_file; diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e571e393669b..08c240e16a01 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -5,20 +5,40 @@ #include #include +struct lruvec; + extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP +struct zswap_lruvec_state { + /* + * Number of pages in zswap that should be protected from the shrinker. + * This number is an estimate of the following counts: + * + * a) Recent page faults. + * b) Recent insertion to the zswap LRU. This includes new zswap stores, + * as well as recent zswap LRU rotations. + * + * These pages are likely to be warm, and might incur IO if the are written + * to swap. + */ + atomic_long_t nr_zswap_protected; +}; + bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); - +void zswap_lruvec_state_init(struct lruvec *lruvec); +void zswap_page_swapin(struct page *page); #else +struct zswap_lruvec_state {}; + static inline bool zswap_store(struct folio *folio) { return false; @@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} - +static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} +static inline void zswap_page_swapin(struct page *page) {} #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 9f118c681424..416cc1b98168 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -61,6 +61,20 @@ config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON The cost is that if the page was never dirtied and needs to be swapped out again, it will be re-compressed. +config ZSWAP_SHRINKER_DEFAULT_ON + bool "Shrink the zswap pool on memory pressure" + depends on ZSWAP + default n + help + If selected, the zswap shrinker will be enabled, and the pages + stored in the zswap pool will become available for reclaim (i.e + written back to the backing swap device) on memory pressure. + + This means that zswap writeback could happen even if the pool is + not yet full, or the cgroup zswap limit has not been reached, + reducing the chance that cold pages will reside in the zswap pool + and consume memory indefinitely. 
+ choice prompt "Default compressor" depends on ZSWAP diff --git a/mm/mmzone.c b/mm/mmzone.c index b594d3f268fe..c01896eca736 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -78,6 +78,7 @@ void lruvec_init(struct lruvec *lruvec) memset(lruvec, 0, sizeof(struct lruvec)); spin_lock_init(&lruvec->lru_lock); + zswap_lruvec_state_init(lruvec); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/swap_state.c b/mm/swap_state.c index 1ab30de75310..0decdb9cbb69 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -636,6 +636,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_page_swapin(page); return page; } @@ -816,6 +817,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_page_swapin(page); return page; } diff --git a/mm/zswap.c b/mm/zswap.c index 420607bd3987..9a648aff11eb 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -146,6 +146,11 @@ module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 +/* Enable/disable memory pressure-based shrinker. */ +static bool zswap_shrinker_enabled = IS_ENABLED( + CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); +module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); + /********************************* * data structures **********************************/ @@ -175,6 +180,8 @@ struct zswap_pool { char tfm_name[CRYPTO_MAX_ALG_NAME]; struct list_lru list_lru; struct mem_cgroup *next_shrink; + struct shrinker *shrinker; + atomic_t nr_stored; }; /* @@ -273,17 +280,26 @@ static bool zswap_can_accept(void) DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } +static u64 get_zswap_pool_size(struct zswap_pool *pool) +{ + u64 pool_size = 0; + int i; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + pool_size += zpool_get_total_size(pool->zpools[i]); + + return pool_size; +} + static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; - int i; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - total += zpool_get_total_size(pool->zpools[i]); + total += get_zswap_pool_size(pool); rcu_read_unlock(); @@ -342,13 +358,34 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* zswap lruvec functions +**********************************/ +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_page_swapin(struct page *page) +{ + struct lruvec *lruvec; + + if (page) { + lruvec = folio_lruvec(page_folio(page)); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + /********************************* * lru functions **********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { + atomic_long_t *nr_zswap_protected; + unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; + struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of @@ -366,6 +403,19 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ 
list_lru_add(list_lru, &entry->lru, nid, memcg); + + /* Update the protection area */ + lru_size = list_lru_count_one(list_lru, nid, memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; + old = atomic_long_inc_return(nr_zswap_protected); + /* + * Decay to avoid overflow and adapt to changing workloads. + * This is based on LRU reclaim cost decaying heuristics. + */ + do { + new = old > lru_size / 4 ? old / 2 : old; + } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -387,6 +437,7 @@ static void zswap_lru_putback(struct list_lru *list_lru, int nid = entry_to_nid(entry); spinlock_t *lock = &list_lru->node[nid].lock; struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); @@ -394,6 +445,10 @@ static void zswap_lru_putback(struct list_lru *list_lru, /* we cannot use list_lru_add here, because it increments node's lru count */ list_lru_putback(list_lru, &entry->lru, nid, memcg); spin_unlock(lock); + + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); + /* increment the protection area to account for the LRU rotation. */ + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); rcu_read_unlock(); } @@ -483,6 +538,7 @@ static void zswap_free_entry(struct zswap_entry *entry) else { zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); + atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); @@ -524,6 +580,109 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, return entry; } +/********************************* +* shrinker functions +**********************************/ +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg); + +static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + unsigned long shrink_ret, nr_protected, lru_size; + struct zswap_pool *pool = shrinker->private_data; + bool encountered_page_in_swapcache = false; + + if (!zswap_shrinker_enabled) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + lru_size = list_lru_shrink_count(&pool->list_lru, sc); + + /* + * Abort if we are shrinking into the protected region. + * + * This short-circuiting is necessary because if we have too many multiple + * concurrent reclaimers getting the freeable zswap object counts at the + * same time (before any of them made reasonable progress), the total + * number of reclaimed objects might be more than the number of unprotected + * objects (i.e the reclaimers will reclaim into the protected area of the + * zswap LRU). + */ + if (nr_protected >= lru_size - sc->nr_to_scan) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, + &encountered_page_in_swapcache); + + if (encountered_page_in_swapcache) + return SHRINK_STOP; + + return shrink_ret ? 
shrink_ret : SHRINK_STOP; +} + +static unsigned long zswap_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct zswap_pool *pool = shrinker->private_data; + struct mem_cgroup *memcg = sc->memcg; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); + unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + + if (!zswap_shrinker_enabled) + return 0; + +#ifdef CONFIG_MEMCG_KMEM + mem_cgroup_flush_stats(memcg); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); +#else + /* use pool stats instead of memcg stats */ + nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; + nr_stored = atomic_read(&pool->nr_stored); +#endif + + if (!nr_stored) + return 0; + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); + /* + * Subtract the lru size by an estimate of the number of pages + * that should be protected. + */ + nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + + /* + * Scale the number of freeable pages by the memory saving factor. + * This ensures that the better zswap compresses memory, the fewer + * pages we will evict to swap (as it will otherwise incur IO for + * relatively small memory saving). + */ + return mult_frac(nr_freeable, nr_backing, nr_stored); +} + +static void zswap_alloc_shrinker(struct zswap_pool *pool) +{ + pool->shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!pool->shrinker) + return; + + pool->shrinker->private_data = pool; + pool->shrinker->scan_objects = zswap_shrinker_scan; + pool->shrinker->count_objects = zswap_shrinker_count; + pool->shrinker->batch = 0; + pool->shrinker->seeks = DEFAULT_SEEKS; +} + /********************************* * per-cpu code **********************************/ @@ -719,6 +878,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; struct zswap_tree *tree; pgoff_t swpoffset; enum lru_status ret = LRU_REMOVED_RETRY; @@ -754,6 +914,17 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o zswap_reject_reclaim_fail++; zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). 
+ */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_SKIP; + *encountered_page_in_swapcache = true; + } + goto put_unlock; } zswap_written_back_pages++; @@ -912,6 +1083,11 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) &pool->node); if (ret) goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + pr_debug("using %s compressor\n", pool->tfm_name); /* being the current pool takes 1 ref; this func expects the @@ -919,13 +1095,19 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - list_lru_init_memcg(&pool->list_lru, NULL); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -983,6 +1165,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); + shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); list_lru_destroy(&pool->list_lru); @@ -1544,6 +1727,7 @@ bool zswap_store(struct folio *folio) if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&entry->pool->list_lru, entry); + atomic_inc(&entry->pool->nr_stored); } spin_unlock(&tree->lock); -- Gitee From e85d0a0e374ffcc5e665672a80b626ab9c9ef55d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 27 Nov 2023 16:55:21 +0100 Subject: [PATCH 093/484] mm/zswap: replace kmap_atomic() with kmap_local_page() commit 003ae2fb0b36803112f9b66cce8041afa5d46a83 upstream Conflicts: none Backport-reason: ZSWAP local map kmap_atomic() has been deprecated in favor of kmap_local_page(). Therefore, replace kmap_atomic() with kmap_local_page() in zswap.c. kmap_atomic() is implemented like a kmap_local_page() which also disables page-faults and preemption (the latter only in !PREEMPT_RT kernels). The kernel virtual addresses returned by these two API are only valid in the context of the callers (i.e., they cannot be handed to other threads). With kmap_local_page() the mappings are per thread and CPU local like in kmap_atomic(); however, they can handle page-faults and can be called from any context (including interrupts). The tasks that call kmap_local_page() can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid. In mm/zswap.c, the blocks of code between the mappings and un-mappings do not depend on the above-mentioned side effects of kmap_atomic(), so that the mere replacements of the old API with the new one is all that is required (i.e., there is no need to explicitly call pagefault_disable() and/or preempt_disable()). Link: https://lkml.kernel.org/r/20231127160058.586446-1-fabio.maria.de.francesco@linux.intel.com Signed-off-by: Fabio M. 
De Francesco Reviewed-by: Nhat Pham Acked-by: Chris Li (Google) Cc: Ira Weiny Cc: Seth Jennings Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 9a648aff11eb..29cf817ab87c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1620,16 +1620,16 @@ bool zswap_store(struct folio *folio) } if (zswap_same_filled_pages_enabled) { - src = kmap_atomic(page); + src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { - kunmap_atomic(src); + kunmap_local(src); entry->swpentry = swp_entry(type, offset); entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); goto insert_entry; } - kunmap_atomic(src); + kunmap_local(src); } if (!zswap_non_same_filled_pages_enabled) @@ -1783,9 +1783,9 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); if (!entry->length) { - dst = kmap_atomic(page); + dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); - kunmap_atomic(dst); + kunmap_local(dst); ret = true; goto stats; } -- Gitee From 70791da2bc693ebbc3dcbc1b7dc97070b4fa598e Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 28 Dec 2023 09:45:42 +0000 Subject: [PATCH 094/484] mm/zswap: reuse dstmem when decompress commit c75f5c1e0f1d231278f42123ee46ba6c0e2b6a96 upstream Conflicts: none Backport-reason: ZSWAP Clean up Patch series "mm/zswap: dstmem reuse optimizations and cleanups", v5. The problem this series tries to optimize is that zswap_load() and zswap_writeback_entry() have to malloc a temporary memory to support !zpool_can_sleep_mapped(). We can avoid it by reusing the percpu crypto_acomp_ctx->dstmem, which is also used by zswap_store() and protected by the same percpu crypto_acomp_ctx->mutex. This patch (of 5): In the !zpool_can_sleep_mapped() case such as zsmalloc, we need to first copy the entry->handle memory to a temporary memory, which is allocated using kmalloc. Obviously we can reuse the per-compressor dstmem to avoid allocating every time, since it's percpu-compressor and protected in percpu mutex. 
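In short, for the !zpool_can_sleep_mapped() case the pattern changes roughly as follows (simplified; the real hunks are in the diff below):

	/* before: a throwaway buffer allocated for every decompression */
	tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
	memcpy(tmp, src, entry->length);

	/* after: reuse the per-CPU dstmem, under the same per-CPU mutex that
	 * already serializes compression in zswap_store() */
	mutex_lock(acomp_ctx->mutex);
	memcpy(acomp_ctx->dstmem, src, entry->length);
	src = acomp_ctx->dstmem;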
Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-0-9382162bbf05@bytedance.com Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-1-9382162bbf05@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Reviewed-by: Yosry Ahmed Acked-by: Chris Li (Google) Cc: Barry Song <21cnbao@gmail.com> Cc: Dan Streetman Cc: Johannes Weiner Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 44 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 29cf817ab87c..3df46e25d6fc 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1406,7 +1406,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct crypto_acomp_ctx *acomp_ctx; struct zpool *pool = zswap_find_zpool(entry); bool page_was_allocated; - u8 *src, *tmp = NULL; + u8 *src; unsigned int dlen; int ret; struct swap_info_struct *si; @@ -1414,12 +1414,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, .sync_mode = WB_SYNC_NONE, }; - if (!zpool_can_sleep_mapped(pool)) { - tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - } - /* try to allocate swap cache page */ si = get_swap_device(swpentry); if (!si) @@ -1462,15 +1456,15 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* decompress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); dlen = PAGE_SIZE; + mutex_lock(acomp_ctx->mutex); src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(pool)) { - memcpy(tmp, src, entry->length); - src = tmp; + memcpy(acomp_ctx->dstmem, src, entry->length); + src = acomp_ctx->dstmem; zpool_unmap_handle(pool, entry->handle); } - mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); @@ -1479,9 +1473,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, dlen = acomp_ctx->req->dlen; mutex_unlock(acomp_ctx->mutex); - if (!zpool_can_sleep_mapped(pool)) - kfree(tmp); - else + if (zpool_can_sleep_mapped(pool)) zpool_unmap_handle(pool, entry->handle); BUG_ON(ret); @@ -1500,9 +1492,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return ret; fail: - if (!zpool_can_sleep_mapped(pool)) - kfree(tmp); - /* * If we get here because the page is already in swapcache, a * load may be happening concurrently. 
It is safe and okay to @@ -1766,7 +1755,7 @@ bool zswap_load(struct folio *folio) struct zswap_entry *entry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src, *dst, *tmp; + u8 *src, *dst; struct zpool *zpool; unsigned int dlen; bool ret; @@ -1791,26 +1780,19 @@ bool zswap_load(struct folio *folio) } zpool = zswap_find_zpool(entry); - if (!zpool_can_sleep_mapped(zpool)) { - tmp = kmalloc(entry->length, GFP_KERNEL); - if (!tmp) { - ret = false; - goto freeentry; - } - } /* decompress */ dlen = PAGE_SIZE; - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(acomp_ctx->mutex); + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(zpool)) { - memcpy(tmp, src, entry->length); - src = tmp; + memcpy(acomp_ctx->dstmem, src, entry->length); + src = acomp_ctx->dstmem; zpool_unmap_handle(zpool, entry->handle); } - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); @@ -1821,15 +1803,13 @@ bool zswap_load(struct folio *folio) if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); - else - kfree(tmp); ret = true; stats: count_vm_event(ZSWPIN); if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); -freeentry: + spin_lock(&tree->lock); if (ret && zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); -- Gitee From b217dbbecead9d93e8d836326d61541787aeffc7 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 28 Dec 2023 09:45:43 +0000 Subject: [PATCH 095/484] mm/zswap: refactor out __zswap_load() commit 32acba4c04830487ca3002d716325e02069e053a upstream Conflicts: none Backport-reason: ZSWAP Clean up zswap_load() and zswap_writeback_entry() have the same part that decompress the data from zswap_entry to page, so refactor out the common part as __zswap_load(entry, page). 
Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-2-9382162bbf05@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Reviewed-by: Yosry Ahmed Acked-by: Chris Li (Google) Cc: Barry Song <21cnbao@gmail.com> Cc: Dan Streetman Cc: Johannes Weiner Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 92 +++++++++++++++++++----------------------------------- 1 file changed, 32 insertions(+), 60 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 3df46e25d6fc..f7289ad7e6ee 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1381,6 +1381,35 @@ static int zswap_enabled_param_set(const char *val, return ret; } +static void __zswap_load(struct zswap_entry *entry, struct page *page) +{ + struct zpool *zpool = zswap_find_zpool(entry); + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + u8 *src; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(acomp_ctx->mutex); + + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); + if (!zpool_can_sleep_mapped(zpool)) { + memcpy(acomp_ctx->dstmem, src, entry->length); + src = acomp_ctx->dstmem; + zpool_unmap_handle(zpool, entry->handle); + } + + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); + BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); + BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); + mutex_unlock(acomp_ctx->mutex); + + if (zpool_can_sleep_mapped(zpool)) + zpool_unmap_handle(zpool, entry->handle); +} + /********************************* * writeback code **********************************/ @@ -1402,12 +1431,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry = entry->swpentry; struct page *page; struct mempolicy *mpol; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; - struct zpool *pool = zswap_find_zpool(entry); bool page_was_allocated; - u8 *src; - unsigned int dlen; int ret; struct swap_info_struct *si; struct writeback_control wbc = { @@ -1453,31 +1477,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } spin_unlock(&tree->lock); - /* decompress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - dlen = PAGE_SIZE; - mutex_lock(acomp_ctx->mutex); - - src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(pool)) { - memcpy(acomp_ctx->dstmem, src, entry->length); - src = acomp_ctx->dstmem; - zpool_unmap_handle(pool, entry->handle); - } - - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); - ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - mutex_unlock(acomp_ctx->mutex); - - if (zpool_can_sleep_mapped(pool)) - zpool_unmap_handle(pool, entry->handle); - - BUG_ON(ret); - BUG_ON(dlen != PAGE_SIZE); + __zswap_load(entry, page); /* page is up to date */ SetPageUptodate(page); @@ -1753,11 +1753,7 @@ bool zswap_load(struct folio *folio) struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; - u8 *src, *dst; - struct zpool *zpool; - unsigned int dlen; + u8 *dst; bool ret; VM_WARN_ON_ONCE(!folio_test_locked(folio)); @@ -1779,31 
+1775,7 @@ bool zswap_load(struct folio *folio) goto stats; } - zpool = zswap_find_zpool(entry); - - /* decompress */ - dlen = PAGE_SIZE; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(acomp_ctx->mutex); - - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { - memcpy(acomp_ctx->dstmem, src, entry->length); - src = acomp_ctx->dstmem; - zpool_unmap_handle(zpool, entry->handle); - } - - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); - if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)) - WARN_ON(1); - mutex_unlock(acomp_ctx->mutex); - - if (zpool_can_sleep_mapped(zpool)) - zpool_unmap_handle(zpool, entry->handle); - + __zswap_load(entry, page); ret = true; stats: count_vm_event(ZSWPIN); -- Gitee From a6636c30f6a9ac747160526c6984882f9f3c0c1d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 28 Dec 2023 09:45:44 +0000 Subject: [PATCH 096/484] mm/zswap: cleanup zswap_load() commit 66447fd036a5a540bef67a96b770d4ed84ad0467 upstream Conflicts: none Backport-reason: ZSWAP Clean up After the common decompress part goes to __zswap_load(), we can cleanup the zswap_load() a little. Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-3-9382162bbf05@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Yosry Ahmed Acked-by: Chis Li (Google) Cc: Barry Song <21cnbao@gmail.com> Cc: Dan Streetman Cc: Johannes Weiner Cc: Nhat Pham Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f7289ad7e6ee..37320016022e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1754,7 +1754,6 @@ bool zswap_load(struct folio *folio) struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; u8 *dst; - bool ret; VM_WARN_ON_ONCE(!folio_test_locked(folio)); @@ -1767,23 +1766,20 @@ bool zswap_load(struct folio *folio) } spin_unlock(&tree->lock); - if (!entry->length) { + if (entry->length) + __zswap_load(entry, page); + else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); kunmap_local(dst); - ret = true; - goto stats; } - __zswap_load(entry, page); - ret = true; -stats: count_vm_event(ZSWPIN); if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); spin_lock(&tree->lock); - if (ret && zswap_exclusive_loads_enabled) { + if (zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); folio_mark_dirty(folio); } else if (entry->length) { @@ -1793,7 +1789,7 @@ bool zswap_load(struct folio *folio) zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - return ret; + return true; } void zswap_invalidate(int type, pgoff_t offset) -- Gitee From 70a56c77f49ef96b16caed9f267dc9cc5d2bdfeb Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 28 Dec 2023 09:45:45 +0000 Subject: [PATCH 097/484] mm/zswap: cleanup zswap_writeback_entry() commit e947ba0bbf470fb3d813383426bdcd3c88fe9a7b upstream Conflicts: minor, due to already backported e3b63e966cac and 78524b05f1a3 Backport-reason: ZSWAP Clean up Also after the common decompress part goes to __zswap_load(), we can cleanup the zswap_writeback_entry() a little. 
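The cleanup is mostly about dropping the shared fail: label in favour of direct returns; a sketch of the resulting early exits, using only the names that appear in the hunk below:

  if (!page)
          return -ENOMEM;          /* swap cache page could not be allocated */

  /* raced with load/swapin: the page just became hot, skip it */
  if (!page_was_allocated) {
          put_page(page);
          return -EEXIST;
  }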
Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-4-9382162bbf05@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Yosry Ahmed Reviewed-by: Nhat Pham Acked-by: Chris Li (Google) Cc: Barry Song <21cnbao@gmail.com> Cc: Dan Streetman Cc: Johannes Weiner Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 37320016022e..f4ccaedce1ec 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1432,7 +1432,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct page *page; struct mempolicy *mpol; bool page_was_allocated; - int ret; struct swap_info_struct *si; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, @@ -1447,16 +1446,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry, page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, NO_INTERLEAVE_INDEX, &page_was_allocated, true); put_swap_device(si); - if (!page) { - ret = -ENOMEM; - goto fail; - } + if (!page) + return -ENOMEM; - /* Found an existing page, we raced with load/swapin */ + /* + * Found an existing page, we raced with load/swapin. We generally + * writeback cold pages from zswap, and swapin means the page just + * became hot. Skip this page and let the caller find another one. + */ if (!page_was_allocated) { put_page(page); - ret = -EEXIST; - goto fail; + return -EEXIST; } /* @@ -1472,8 +1472,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, delete_from_swap_cache(page_folio(page)); unlock_page(page); put_page(page); - ret = -ENOMEM; - goto fail; + return -ENOMEM; } spin_unlock(&tree->lock); @@ -1489,15 +1488,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, ret = __swap_writepage(page, &wbc); put_page(page); - return ret; - -fail: - /* - * If we get here because the page is already in swapcache, a - * load may be happening concurrently. It is safe and okay to - * not free the entry. It is also okay to return !0. - */ - return ret; + return 0; } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) -- Gitee From 080dc3c651116f72887f61c4a545f5d1918335d8 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 28 Dec 2023 09:45:46 +0000 Subject: [PATCH 098/484] mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx commit 8ba2f844f050a82624ba3ad5146aa3c116f506f7 upstream Conflicts: none Backport-reason: ZSWAP Clean up First of all, we need to rename acomp_ctx->dstmem field to buffer, since we are now using for purposes other than compression. Then we change per-cpu mutex and buffer to per-acomp_ctx, since them belong to the acomp_ctx and are necessary parts when used in the compress/decompress contexts. So we can remove the old per-cpu mutex and dstmem. 
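After this change every per-CPU context owns its own scratch buffer and the mutex that serializes it; a recap of the resulting structure, taken from the hunk below:

  struct crypto_acomp_ctx {
          struct crypto_acomp *acomp;
          struct acomp_req *req;
          struct crypto_wait wait;
          u8 *buffer;             /* was the shared per-cpu zswap_dstmem */
          struct mutex mutex;     /* was the shared per-cpu zswap_mutex */
  };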
Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-5-9382162bbf05@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Chris Li (Google) Reviewed-by: Nhat Pham Cc: Barry Song <21cnbao@gmail.com> Cc: Dan Streetman Cc: Johannes Weiner Cc: Seth Jennings Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/cpuhotplug.h | 1 - mm/zswap.c | 104 ++++++++++++------------------------- 2 files changed, 33 insertions(+), 72 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2a0de78c0dec..e0a12d8140d4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -124,7 +124,6 @@ enum cpuhp_state { CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, CPUHP_MM_ZS_PREPARE, - CPUHP_MM_ZSWP_MEM_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, diff --git a/mm/zswap.c b/mm/zswap.c index f4ccaedce1ec..1c1ef68664b2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -159,8 +159,8 @@ struct crypto_acomp_ctx { struct crypto_acomp *acomp; struct acomp_req *req; struct crypto_wait wait; - u8 *dstmem; - struct mutex *mutex; + u8 *buffer; + struct mutex mutex; }; /* @@ -686,63 +686,26 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) /********************************* * per-cpu code **********************************/ -static DEFINE_PER_CPU(u8 *, zswap_dstmem); -/* - * If users dynamically change the zpool type and compressor at runtime, i.e. - * zswap is running, zswap can have more than one zpool on one cpu, but they - * are sharing dtsmem. So we need this mutex to be per-cpu. - */ -static DEFINE_PER_CPU(struct mutex *, zswap_mutex); - -static int zswap_dstmem_prepare(unsigned int cpu) -{ - struct mutex *mutex; - u8 *dst; - - dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) - return -ENOMEM; - - mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu)); - if (!mutex) { - kfree(dst); - return -ENOMEM; - } - - mutex_init(mutex); - per_cpu(zswap_dstmem, cpu) = dst; - per_cpu(zswap_mutex, cpu) = mutex; - return 0; -} - -static int zswap_dstmem_dead(unsigned int cpu) -{ - struct mutex *mutex; - u8 *dst; - - mutex = per_cpu(zswap_mutex, cpu); - kfree(mutex); - per_cpu(zswap_mutex, cpu) = NULL; - - dst = per_cpu(zswap_dstmem, cpu); - kfree(dst); - per_cpu(zswap_dstmem, cpu) = NULL; - - return 0; -} - static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); struct crypto_acomp *acomp; struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); - return PTR_ERR(acomp); + ret = PTR_ERR(acomp); + goto acomp_fail; } acomp_ctx->acomp = acomp; @@ -750,8 +713,8 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); - crypto_free_acomp(acomp_ctx->acomp); - return -ENOMEM; + ret = -ENOMEM; + goto req_fail; } acomp_ctx->req = req; @@ -764,10 +727,13 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, 
CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); - acomp_ctx->mutex = per_cpu(zswap_mutex, cpu); - acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu); - return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; } static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) @@ -780,6 +746,7 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) acomp_request_free(acomp_ctx->req); if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); } return 0; @@ -1389,12 +1356,12 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) u8 *src; acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(acomp_ctx->mutex); + mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(zpool)) { - memcpy(acomp_ctx->dstmem, src, entry->length); - src = acomp_ctx->dstmem; + memcpy(acomp_ctx->buffer, src, entry->length); + src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); } @@ -1404,7 +1371,7 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); @@ -1632,13 +1599,17 @@ bool zswap_store(struct folio *folio) /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(acomp_ctx->mutex); + mutex_lock(&acomp_ctx->mutex); - dst = acomp_ctx->dstmem; + dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); - /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */ + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. 
+ */ sg_init_one(&output, dst, PAGE_SIZE * 2); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* @@ -1676,7 +1647,7 @@ bool zswap_store(struct folio *folio) buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); zpool_unmap_handle(zpool, handle); - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); /* populate entry */ entry->swpentry = swp_entry(type, offset); @@ -1719,7 +1690,7 @@ bool zswap_store(struct folio *folio) return true; put_dstmem: - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: @@ -1892,13 +1863,6 @@ static int zswap_setup(void) goto cache_fail; } - ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", - zswap_dstmem_prepare, zswap_dstmem_dead); - if (ret) { - pr_err("dstmem alloc failed\n"); - goto dstmem_fail; - } - ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, @@ -1930,8 +1894,6 @@ static int zswap_setup(void) if (pool) zswap_pool_destroy(pool); hp_fail: - cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); -dstmem_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: /* if built-in, we aren't unloaded on failure; don't allow use */ -- Gitee From 3e6764f364043d68bac07b677cee1623030d4156 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:30 +0000 Subject: [PATCH 099/484] mm: return the folio from __read_swap_cache_async() commit 96c7b0b42239e7b8987b2664b458dc74e825f760 upstream Conflicts: minor, __swap_writepage here could fail with SYNC_IO, and we have extra and correct folio_put due to backported fixes Backport-reason: SWAP update: more folio Patch series "More swap folio conversions". These all seem like fairly straightforward conversions to me. A lot of compound_head() calls get removed. And page_swap_info(), which is nice. This patch (of 13): Move the folio->page conversion into the callers that actually want that. Most of the callers are happier with the folio anyway. If the page_allocated boolean is set, the folio allocated is of order-0, so it is safe to pass the page directly to swap_readpage(). 
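Callers that still need a struct page convert at the very end; a sketch of the typical pattern, mirroring the read_swap_cache_async() hunk below, which is safe because a newly allocated folio here is always order-0:

  folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                  &page_allocated, false);
  if (page_allocated)
          swap_readpage(&folio->page, false, plug);   /* order-0: the folio is the page */

  return folio_file_page(folio, swp_offset(entry));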
Link: https://lkml.kernel.org/r/20231213215842.671461-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231213215842.671461-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zswap.h | 4 +-- mm/swap.h | 7 ++-- mm/swap_state.c | 75 ++++++++++++++++++++----------------------- mm/zswap.c | 65 +++++++++++++++++++------------------ 4 files changed, 73 insertions(+), 78 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 08c240e16a01..e88572d4c720 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -34,7 +34,7 @@ void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); -void zswap_page_swapin(struct page *page); +void zswap_folio_swapin(struct folio *folio); #else struct zswap_lruvec_state {}; @@ -54,7 +54,7 @@ static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} -static inline void zswap_page_swapin(struct page *page) {} +static inline void zswap_folio_swapin(struct folio *folio) {} #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/swap.h b/mm/swap.h index 8d9d8f472ad3..1b4a7b688e04 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -49,10 +49,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); -struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated, - bool skip_if_exists); +struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, + bool skip_if_exists); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, diff --git a/mm/swap_state.c b/mm/swap_state.c index 0decdb9cbb69..4f74dae0b125 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -361,14 +361,12 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, return folio; } -struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated, - bool skip_if_exists) +struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si = swp_swap_info(entry); struct folio *folio; - struct page *page; void *shadow = NULL; *new_page_allocated = false; @@ -382,10 +380,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); - if (!IS_ERR(folio)) { - page = folio_file_page(folio, swp_offset(entry)); - goto got_page; - } + if (!IS_ERR(folio)) + goto got_folio; /* * Just skip read ahead for unused swap slot. @@ -394,7 +390,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_put_swap; /* - * Get a new page to read into from swap. Allocate it now, + * Get a new folio to read into from swap. Allocate it now, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. 
*/ @@ -430,13 +426,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE - * in swap_map, but not yet added its page to swap cache. + * in swap_map, but not yet added its folio to swap cache. */ schedule_timeout_uninterruptible(1); } /* - * The swap entry is ours to swap in. Prepare the new page. + * The swap entry is ours to swap in. Prepare the new folio. */ __folio_set_locked(folio); @@ -457,9 +453,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* Caller will initiate read into locked folio */ folio_add_lru(folio); *new_page_allocated = true; - page = &folio->page; -got_page: - return page; +got_folio: + return folio; fail_unlock: put_swap_folio(folio, entry); @@ -487,22 +482,22 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; - struct page *page; + struct folio *folio; si = get_swap_device(entry); if (!si) return NULL; mpol = get_vma_policy(vma, addr, 0, &ilx); - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) - swap_readpage(page, false, plug); + swap_readpage(&folio->page, false, plug); put_swap_device(si); - return page; + return folio_file_page(folio, swp_offset(entry)); } EXPORT_SYMBOL(read_swap_cache_async); @@ -588,7 +583,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { - struct page *page; + struct folio *folio; unsigned long entry_offset = swp_offset(entry); unsigned long offset = entry_offset; unsigned long start_offset, end_offset; @@ -613,31 +608,31 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - page = __read_swap_cache_async( + folio = __read_swap_cache_async( swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, &page_allocated, false); - if (!page) + if (!folio) continue; if (page_allocated) { - swap_readpage(page, false, &splug); + swap_readpage(&folio->page, false, &splug); if (offset != entry_offset) { - SetPageReadahead(page); + folio_set_readahead(folio); count_vm_event(SWAP_RA); } } - put_page(page); + folio_put(folio); } blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(page, false, NULL); - zswap_page_swapin(page); - return page; + swap_readpage(&folio->page, false, NULL); + zswap_folio_swapin(folio); + return folio_file_page(folio, swp_offset(entry)); } int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages) @@ -760,7 +755,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct page *page; + struct folio *folio; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; @@ -793,18 +788,18 @@ static struct page 
*swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (!page) + if (!folio) continue; if (page_allocated) { - swap_readpage(page, false, &splug); + swap_readpage(&folio->page, false, &splug); if (i != ra_info.offset) { - SetPageReadahead(page); + folio_set_readahead(folio); count_vm_event(SWAP_RA); } } - put_page(page); + folio_put(folio); } if (pte) pte_unmap(pte); @@ -812,13 +807,13 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, swap_read_unplug(splug); lru_add_drain(); skip: - /* The page was likely read above, so no need for plugging here */ - page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + /* The folio was likely read above, so no need for plugging here */ + folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(page, false, NULL); - zswap_page_swapin(page); - return page; + swap_readpage(&folio->page, false, NULL); + zswap_folio_swapin(folio); + return folio_file_page(folio, swp_offset(entry)); } /** diff --git a/mm/zswap.c b/mm/zswap.c index 1c1ef68664b2..e64f4d9b1391 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -366,12 +366,12 @@ void zswap_lruvec_state_init(struct lruvec *lruvec) atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); } -void zswap_page_swapin(struct page *page) +void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; - if (page) { - lruvec = folio_lruvec(page_folio(page)); + if (folio) { + lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } } @@ -1381,81 +1381,82 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * writeback code **********************************/ /* - * Attempts to free an entry by adding a page to the swap cache, - * decompressing the entry data into the page, and issuing a - * bio write to write the page back to the swap device. + * Attempts to free an entry by adding a folio to the swap cache, + * decompressing the entry data into the folio, and issuing a + * bio write to write the folio back to the swap device. * - * This can be thought of as a "resumed writeback" of the page + * This can be thought of as a "resumed writeback" of the folio * to the swap device. We are basically resuming the same swap * writeback path that was intercepted with the zswap_store() - * in the first place. After the page has been decompressed into + * in the first place. After the folio has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. 
*/ static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree) { + int ret; swp_entry_t swpentry = entry->swpentry; - struct page *page; + struct folio *folio; struct mempolicy *mpol; - bool page_was_allocated; + bool folio_was_allocated; struct swap_info_struct *si; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; - /* try to allocate swap cache page */ + /* try to allocate swap cache folio */ si = get_swap_device(swpentry); if (!si) return -EEXIST; mpol = get_task_policy(current); - page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &page_was_allocated, true); + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); put_swap_device(si); - if (!page) + if (!folio) return -ENOMEM; /* - * Found an existing page, we raced with load/swapin. We generally - * writeback cold pages from zswap, and swapin means the page just - * became hot. Skip this page and let the caller find another one. + * Found an existing folio, we raced with load/swapin. We generally + * writeback cold folios from zswap, and swapin means the folio just + * became hot. Skip this folio and let the caller find another one. */ - if (!page_was_allocated) { - put_page(page); + if (!folio_was_allocated) { + folio_put(folio); return -EEXIST; } /* - * Page is locked, and the swapcache is now secured against + * folio is locked, and the swapcache is now secured against * concurrent swapping to and from the slot. Verify that the * swap entry hasn't been invalidated and recycled behind our * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap page with old compressed data. + * avoid overwriting a new swap folio with old compressed data. */ spin_lock(&tree->lock); if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); - delete_from_swap_cache(page_folio(page)); - unlock_page(page); - put_page(page); + delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); return -ENOMEM; } spin_unlock(&tree->lock); - __zswap_load(entry, page); + __zswap_load(entry, &folio->page); - /* page is up to date */ - SetPageUptodate(page); + /* folio is up to date */ + folio_mark_uptodate(folio); /* move it to the tail of the inactive list after end_writeback */ - SetPageReclaim(page); + folio_set_reclaim(folio); /* start writeback */ - ret = __swap_writepage(page, &wbc); - put_page(page); + ret = __swap_writepage(&folio->page, &wbc); + folio_put(folio); - return 0; + return ret; } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) @@ -1603,7 +1604,7 @@ bool zswap_store(struct folio *folio) dst = acomp_ctx->buffer; sg_init_table(&input, 1); - sg_set_page(&input, page, PAGE_SIZE, 0); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, -- Gitee From c437614693343d9a1cc4a72e328f07632e8bdca8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:31 +0000 Subject: [PATCH 100/484] mm: pass a folio to __swap_writepage() commit b99b4e0d9d7f29b428bacd7a61188b2abf340c1e upstream Conflicts: major, our __swap_writepage has return value. Backport-reason: SWAP update: more folio Both callers now have a folio, so pass that in instead of the page. Removes a few hidden calls to compound_head(). 
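The "hidden" calls are the compound_head() lookups performed inside the page-based helpers; as a rough illustration (the exact lines are in the hunk below), the folio variants avoid re-deriving the head page:

  /* page API: PageSwapCache() resolves compound_head(page) internally */
  VM_BUG_ON_PAGE(!PageSwapCache(page), page);

  /* folio API: the caller already holds the head page */
  VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);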
Link: https://lkml.kernel.org/r/20231213215842.671461-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 18 +++++++++--------- mm/swap.h | 2 +- mm/zswap.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 95312fdd9f9b..ee9bb49bb559 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -200,7 +200,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_end_writeback(folio); return 0; } - ret = __swap_writepage(&folio->page, wbc); + ret = __swap_writepage(folio, wbc); return ret; } EXPORT_SYMBOL(swap_writepage); @@ -368,17 +368,17 @@ static void swap_writepage_bdev_async(struct page *page, submit_bio(bio); } -int __swap_writepage(struct page *page, struct writeback_control *wbc) +int __swap_writepage(struct folio *folio, struct writeback_control *wbc) { - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio->swap); - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) { - int ret = bdev_swapout_folio(sis->bdev, swap_page_sector(page), page_folio(page), wbc); + int ret = bdev_swapout_folio(sis->bdev, swap_page_sector(&folio->page), folio, wbc); if (ret != -EOPNOTSUPP) { if (!ret) - count_swpout_vm_event(page_folio(page)); + count_swpout_vm_event(folio); return ret; } } @@ -389,11 +389,11 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc) * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) - swap_writepage_fs(page, wbc); + swap_writepage_fs(&folio->page, wbc); else if (sis->flags & SWP_SYNCHRONOUS_IO) - swap_writepage_bdev_sync(page, wbc, sis); + swap_writepage_bdev_sync(&folio->page, wbc, sis); else - swap_writepage_bdev_async(page, wbc, sis); + swap_writepage_bdev_async(&folio->page, wbc, sis); return 0; } diff --git a/mm/swap.h b/mm/swap.h index 1b4a7b688e04..fa4bb307b0e6 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -19,7 +19,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) } void swap_write_unplug(struct swap_iocb *sio); int swap_writepage(struct page *page, struct writeback_control *wbc); -int __swap_writepage(struct page *page, struct writeback_control *wbc); +int __swap_writepage(struct folio *folio, struct writeback_control *wbc); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ diff --git a/mm/zswap.c b/mm/zswap.c index e64f4d9b1391..8c6c776826f7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1453,7 +1453,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, folio_set_reclaim(folio); /* start writeback */ - ret = __swap_writepage(&folio->page, &wbc); + ret = __swap_writepage(folio, &wbc); folio_put(folio); return ret; -- Gitee From 3f658725eb9083f6c68db61c6d7692ad7616c968 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:32 +0000 Subject: [PATCH 101/484] mm: pass a folio to swap_writepage_fs() commit bfcd44d5f816b442feb27f59e9312ce38ac4b3cf upstream Conflicts: none Backport-reason: SWAP update: more folio Saves several calls to compound_head(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index ee9bb49bb559..4995df9e10ad 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -288,16 +288,16 @@ static void sio_write_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) +static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc) { struct swap_iocb *sio = NULL; - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio->swap); struct file *swap_file = sis->swap_file; - loff_t pos = page_file_offset(page); + loff_t pos = folio_file_pos(folio); - count_swpout_vm_event(page_folio(page)); - set_page_writeback(page); - unlock_page(page); + count_swpout_vm_event(folio); + folio_start_writeback(folio); + folio_unlock(folio); if (wbc->swap_plug) sio = *wbc->swap_plug; if (sio) { @@ -315,8 +315,8 @@ static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) sio->pages = 0; sio->len = 0; } - bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0); - sio->len += thp_size(page); + bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); + sio->len += folio_size(folio); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) { swap_write_unplug(sio); @@ -389,7 +389,7 @@ int __swap_writepage(struct folio *folio, struct writeback_control *wbc) * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) - swap_writepage_fs(&folio->page, wbc); + swap_writepage_fs(folio, wbc); else if (sis->flags & SWP_SYNCHRONOUS_IO) swap_writepage_bdev_sync(&folio->page, wbc, sis); else -- Gitee From 172d58c07961ee8e836476878f37e2e8599e5fd5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:33 +0000 Subject: [PATCH 102/484] mm: pass a folio to swap_writepage_bdev_sync() commit 6de62c7bc4bc3444ce63490640efae965b637fe6 upstream Conflicts: none Backport-reason: SWAP update: more folio Saves a call to compound_head(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 4995df9e10ad..b609df90e2a3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -326,17 +326,16 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc *wbc->swap_plug = sio; } -static void swap_writepage_bdev_sync(struct page *page, +static void swap_writepage_bdev_sync(struct folio *folio, struct writeback_control *wbc, struct swap_info_struct *sis) { struct bio_vec bv; struct bio bio; - struct folio *folio = page_folio(page); bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); - bio.bi_iter.bi_sector = swap_page_sector(page); - __bio_add_page(&bio, page, thp_size(page), 0); + bio.bi_iter.bi_sector = swap_page_sector(&folio->page); + bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); bio_associate_blkg_from_page(&bio, folio); count_swpout_vm_event(folio); @@ -391,7 +390,7 @@ int __swap_writepage(struct folio *folio, struct writeback_control *wbc) if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(folio, wbc); else if (sis->flags & SWP_SYNCHRONOUS_IO) - swap_writepage_bdev_sync(&folio->page, wbc, sis); + swap_writepage_bdev_sync(folio, wbc, sis); else swap_writepage_bdev_async(&folio->page, wbc, sis); -- Gitee From 3082727bf2e39636f9bf8c47cd5236df9fa179ac Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:34 +0000 Subject: [PATCH 103/484] mm: pass a folio to swap_writepage_bdev_async() commit ee1b1d9b46f206ffdef5ebe4086d925a5c43805b upstream Conflicts: trivial, __swap_writepage has return value. Backport-reason: SWAP update: more folio Saves a call to compound_head(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index b609df90e2a3..58c60bd10bfb 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -347,18 +347,17 @@ static void swap_writepage_bdev_sync(struct folio *folio, __end_swap_bio_write(&bio); } -static void swap_writepage_bdev_async(struct page *page, +static void swap_writepage_bdev_async(struct folio *folio, struct writeback_control *wbc, struct swap_info_struct *sis) { struct bio *bio; - struct folio *folio = page_folio(page); bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), GFP_NOIO); - bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_iter.bi_sector = swap_page_sector(&folio->page); bio->bi_end_io = end_swap_bio_write; - __bio_add_page(bio, page, thp_size(page), 0); + bio_add_folio_nofail(bio, folio, folio_size(folio), 0); bio_associate_blkg_from_page(bio, folio); count_swpout_vm_event(folio); @@ -392,7 +391,7 @@ int __swap_writepage(struct folio *folio, struct writeback_control *wbc) else if (sis->flags & SWP_SYNCHRONOUS_IO) swap_writepage_bdev_sync(folio, wbc, sis); else - swap_writepage_bdev_async(&folio->page, wbc, sis); + swap_writepage_bdev_async(folio, wbc, sis); return 0; } -- Gitee From 014ddb1ca566296a3733b9a30c7c673644efcd16 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:35 +0000 Subject: [PATCH 104/484] mm: pass a folio to swap_readpage_fs() commit 64a24e55e3f462836ee618be480bd1b0b018e557 upstream Conflicts: none Backport-reason: SWAP update: more folio Saves a call to compound_head(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 58c60bd10bfb..dbcc1b1870c1 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -432,12 +432,11 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static void swap_readpage_fs(struct page *page, - struct swap_iocb **plug) +static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_iocb *sio = NULL; - loff_t pos = page_file_offset(page); + loff_t pos = folio_file_pos(folio); if (plug) sio = *plug; @@ -456,8 +455,8 @@ static void swap_readpage_fs(struct page *page, sio->pages = 0; sio->len = 0; } - bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0); - sio->len += thp_size(page); + bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0); + sio->len += folio_size(folio); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { swap_read_unplug(sio); @@ -527,7 +526,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) folio_mark_uptodate(folio); folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { - swap_readpage_fs(page, plug); + swap_readpage_fs(folio, plug); } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { int ret = bdev_swapin_folio(sis->bdev, swap_page_sector(page), folio); if (ret != -EOPNOTSUPP) { -- Gitee From cb2889cf626d1673cba84585ef3746ab37d21c4a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:36 +0000 Subject: [PATCH 105/484] mm: pass a folio to swap_readpage_bdev_sync() commit 2c184d821eec55f9ea3c98c67dc2b0c5ec827c87 upstream Conflicts: minor, __swap_writepage has return value. Backport-reason: SWAP update: more folio Make it plain that this takes the head page (which before this point was just an assumption, but is now enforced by the compiler). Link: https://lkml.kernel.org/r/20231213215842.671461-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index dbcc1b1870c1..39347e731c99 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -466,15 +466,15 @@ static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug) *plug = sio; } -static void swap_readpage_bdev_sync(struct page *page, +static void swap_readpage_bdev_sync(struct folio *folio, struct swap_info_struct *sis) { struct bio_vec bv; struct bio bio; bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = swap_page_sector(page); - __bio_add_page(&bio, page, thp_size(page), 0); + bio.bi_iter.bi_sector = swap_page_sector(&folio->page); + bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. 
@@ -534,7 +534,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) count_vm_event(PSWPIN); goto out; } - swap_readpage_bdev_sync(page, sis); + swap_readpage_bdev_sync(folio, sis); } else { swap_readpage_bdev_async(page, sis); } -- Gitee From 3b1a78043abd147def09f8e894da5ceff72388a5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:37 +0000 Subject: [PATCH 106/484] mm: pass a folio to swap_readpage_bdev_async() commit 3c3ebd82e0d1e77df7a3906e79b42d8f0793bdd7 upstream Conflicts: none Backport-reason: SWAP update: more folio Make it plain that this takes the head page (which before this point was just an assumption, but is now enforced by the compiler). Link: https://lkml.kernel.org/r/20231213215842.671461-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 39347e731c99..3117e9bd1014 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -486,15 +486,15 @@ static void swap_readpage_bdev_sync(struct folio *folio, put_task_struct(current); } -static void swap_readpage_bdev_async(struct page *page, +static void swap_readpage_bdev_async(struct folio *folio, struct swap_info_struct *sis) { struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); - bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_iter.bi_sector = swap_page_sector(&folio->page); bio->bi_end_io = end_swap_bio_read; - __bio_add_page(bio, page, thp_size(page), 0); + bio_add_folio_nofail(bio, folio, folio_size(folio), 0); count_vm_event(PSWPIN); submit_bio(bio); } @@ -536,7 +536,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) } swap_readpage_bdev_sync(folio, sis); } else { - swap_readpage_bdev_async(page, sis); + swap_readpage_bdev_async(folio, sis); } out: -- Gitee From fa33336b87560deafd5a6facaa0860bf3d7050b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:38 +0000 Subject: [PATCH 107/484] mm: convert swap_page_sector() to swap_folio_sector() commit 3a61e6f668120ee2c7840b91891c858d575d07e2 upstream Conflicts: minor, cover two other swap_page_sector usage. Backport-reason: SWAP update: more folio All callers have a folio, so pass it in. Saves a couple of calls to compound_head(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 2 +- mm/page_io.c | 12 ++++++------ mm/swapfile.c | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 123752caac13..51019a3f7827 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -532,7 +532,7 @@ struct backing_dev_info; extern int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); -sector_t swap_page_sector(struct page *page); +sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { diff --git a/mm/page_io.c b/mm/page_io.c index 3117e9bd1014..701e422c0854 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -334,7 +334,7 @@ static void swap_writepage_bdev_sync(struct folio *folio, bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); - bio.bi_iter.bi_sector = swap_page_sector(&folio->page); + bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); bio_associate_blkg_from_page(&bio, folio); @@ -355,7 +355,7 @@ static void swap_writepage_bdev_async(struct folio *folio, bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), GFP_NOIO); - bio->bi_iter.bi_sector = swap_page_sector(&folio->page); + bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_write; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); @@ -373,7 +373,7 @@ int __swap_writepage(struct folio *folio, struct writeback_control *wbc) VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); if (data_race(sis->flags & SWP_SYNCHRONOUS_IO)) { - int ret = bdev_swapout_folio(sis->bdev, swap_page_sector(&folio->page), folio, wbc); + int ret = bdev_swapout_folio(sis->bdev, swap_folio_sector(folio), folio, wbc); if (ret != -EOPNOTSUPP) { if (!ret) count_swpout_vm_event(folio); @@ -473,7 +473,7 @@ static void swap_readpage_bdev_sync(struct folio *folio, struct bio bio; bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = swap_page_sector(&folio->page); + bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); /* * Keep this task valid during swap readpage because the oom killer may @@ -492,7 +492,7 @@ static void swap_readpage_bdev_async(struct folio *folio, struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); - bio->bi_iter.bi_sector = swap_page_sector(&folio->page); + bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); count_vm_event(PSWPIN); @@ -528,7 +528,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_readpage_fs(folio, plug); } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { - int ret = bdev_swapin_folio(sis->bdev, swap_page_sector(page), folio); + int ret = bdev_swapin_folio(sis->bdev, swap_folio_sector(folio), folio); if (ret != -EOPNOTSUPP) { if (!ret) count_vm_event(PSWPIN); diff --git a/mm/swapfile.c b/mm/swapfile.c index 3ceeb54dbed7..ee613e3d25bf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -327,14 +327,14 @@ 
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) BUG(); } -sector_t swap_page_sector(struct page *page) +sector_t swap_folio_sector(struct folio *folio) { - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; - offset = __page_file_index(page); + offset = swp_offset(folio->swap); se = offset_to_swap_extent(sis, offset); sector = se->start_block + (offset - se->start_page); return sector << (PAGE_SHIFT - 9); -- Gitee From 4fa4d48cfe66daf6da0a9d8e0d5073e82031acc4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:39 +0000 Subject: [PATCH 108/484] mm: convert swap_readpage() to swap_read_folio() commit c9bdf768dd9319d2d80a334646e2c8116af9e430 upstream Conflicts: minor, __swap_writepage has return value, and other backported commit Backport-reason: SWAP update: more folio All callers have a folio, so pass it in, saving two calls to compound_head(). Link: https://lkml.kernel.org/r/20231213215842.671461-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 4 ++-- mm/page_io.c | 18 +++++++++--------- mm/swap.h | 5 +++-- mm/swap_state.c | 12 ++++++------ mm/swapfile.c | 2 +- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index fb8ef80dd7f7..5b22656da571 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4027,9 +4027,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_add_lru(folio); - /* To provide entry to swap_readpage() */ + /* To provide entry to swap_read_folio() */ folio->swap = entry; - swap_readpage(page, true, NULL); + swap_read_folio(folio, true, NULL); folio->private = NULL; } } else { diff --git a/mm/page_io.c b/mm/page_io.c index 701e422c0854..92e329007eba 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -432,7 +432,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug) +static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_iocb *sio = NULL; @@ -466,7 +466,7 @@ static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug) *plug = sio; } -static void swap_readpage_bdev_sync(struct folio *folio, +static void swap_read_folio_bdev_sync(struct folio *folio, struct swap_info_struct *sis) { struct bio_vec bv; @@ -486,7 +486,7 @@ static void swap_readpage_bdev_sync(struct folio *folio, put_task_struct(current); } -static void swap_readpage_bdev_async(struct folio *folio, +static void swap_read_folio_bdev_async(struct folio *folio, struct swap_info_struct *sis) { struct bio *bio; @@ -499,10 +499,10 @@ static void swap_readpage_bdev_async(struct folio *folio, submit_bio(bio); } -void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) +void swap_read_folio(struct folio *folio, bool synchronous, + struct swap_iocb **plug) { - struct folio *folio = page_folio(page); - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio->swap); bool workingset = folio_test_workingset(folio); unsigned long pflags; bool in_thrashing; @@ -526,7 +526,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) folio_mark_uptodate(folio); folio_unlock(folio); } else if (data_race(sis->flags & 
SWP_FS_OPS)) { - swap_readpage_fs(folio, plug); + swap_read_folio_fs(folio, plug); } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { int ret = bdev_swapin_folio(sis->bdev, swap_folio_sector(folio), folio); if (ret != -EOPNOTSUPP) { @@ -534,9 +534,9 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) count_vm_event(PSWPIN); goto out; } - swap_readpage_bdev_sync(folio, sis); + swap_read_folio_bdev_sync(folio, sis); } else { - swap_readpage_bdev_async(folio, sis); + swap_read_folio_bdev_async(folio, sis); } out: diff --git a/mm/swap.h b/mm/swap.h index fa4bb307b0e6..870a8f437c85 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -10,7 +10,8 @@ struct mempolicy; /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; -void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug); +void swap_read_folio(struct folio *folio, bool do_poll, + struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { @@ -63,7 +64,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) } #else /* CONFIG_SWAP */ struct swap_iocb; -static inline void swap_readpage(struct page *page, bool do_poll, +static inline void swap_read_folio(struct folio *folio, bool do_poll, struct swap_iocb **plug) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 4f74dae0b125..10c6808b1dc4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -471,7 +471,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * the swap entry is no longer in use. * * get/put_swap_device() aren't needed to call this function, because - * __read_swap_cache_async() call them and swap_readpage() holds the + * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. 
*/ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -494,7 +494,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol_cond_put(mpol); if (page_allocated) - swap_readpage(&folio->page, false, plug); + swap_read_folio(folio, false, plug); put_swap_device(si); return folio_file_page(folio, swp_offset(entry)); @@ -614,7 +614,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_readpage(&folio->page, false, &splug); + swap_read_folio(folio, false, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -630,7 +630,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(&folio->page, false, NULL); + swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); return folio_file_page(folio, swp_offset(entry)); } @@ -793,7 +793,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_readpage(&folio->page, false, &splug); + swap_read_folio(folio, false, &splug); if (i != ra_info.offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -811,7 +811,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(&folio->page, false, NULL); + swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); return folio_file_page(folio, swp_offset(entry)); } diff --git a/mm/swapfile.c b/mm/swapfile.c index ee613e3d25bf..41a4efe05352 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2393,7 +2393,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is - * built at swapon time and is then used at swap_writepage/swap_readpage + * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. -- Gitee From c840b8621f08fb2240c1a3d0bbe41fe9e43101b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:40 +0000 Subject: [PATCH 109/484] mm: remove page_swap_info() commit 69fe7d67cb0c6eeab3d4c9a3bf950f9d12af4719 upstream Conflicts: none Backport-reason: SWAP update: more folio It's more efficient to get the swap_info_struct by calling swp_swap_info() directly. 
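Concretely, callers that already hold a folio can go straight from the recorded swap entry to the swap_info_struct instead of bouncing through the page; a sketch of the folio_swap_flags() change from the hunk below:

  /* before: re-derives the swap entry from the page */
  return page_swap_info(&folio->page)->flags;

  /* after: the folio already carries folio->swap */
  return swp_swap_info(folio->swap)->flags;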
Link: https://lkml.kernel.org/r/20231213215842.671461-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 3 +-- mm/swap.h | 2 +- mm/swapfile.c | 8 +------- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 51019a3f7827..60cfa10fc2ef 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -526,8 +526,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -extern struct swap_info_struct *page_swap_info(struct page *); -extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); +struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); diff --git a/mm/swap.h b/mm/swap.h index 870a8f437c85..9abe4b7921ca 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -60,7 +60,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, static inline unsigned int folio_swap_flags(struct folio *folio) { - return page_swap_info(&folio->page)->flags; + return swp_swap_info(folio->swap)->flags; } #else /* CONFIG_SWAP */ struct swap_iocb; diff --git a/mm/swapfile.c b/mm/swapfile.c index 41a4efe05352..0bad74825a83 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3545,18 +3545,12 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) } EXPORT_SYMBOL(swp_swap_info); -struct swap_info_struct *page_swap_info(struct page *page) -{ - swp_entry_t entry = page_swap_entry(page); - return swp_swap_info(entry); -} - /* * out-of-line methods to avoid include hell. */ struct address_space *swapcache_mapping(struct folio *folio) { - return page_swap_info(&folio->page)->swap_file->f_mapping; + return swp_swap_info(folio->swap)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(swapcache_mapping); -- Gitee From 1b0b42bb39f2b7a1548f014c00e2d7a8918be0a8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:41 +0000 Subject: [PATCH 110/484] mm: return a folio from read_swap_cache_async() commit 6e03492e9d288d9ce886064289e2768da5d7d967 upstream Conflicts: minor, due to backported si ref shrink Backport-reason: SWAP update: more folio The only two callers simply call put_page() on the page returned, so they're happier calling folio_put(). Saves two calls to compound_head(). 
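The caller-side change is mechanical; a sketch matching the madvise hunks below:

  folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                vma, addr, &splug);
  if (folio)
          folio_put(folio);       /* no hidden compound_head(), unlike put_page() */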
Link: https://lkml.kernel.org/r/20231213215842.671461-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/madvise.c | 22 +++++++++++----------- mm/swap.h | 7 +++---- mm/swap_state.c | 8 ++++---- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 47fb2768f2c7..6572b618a9f8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -180,7 +180,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; - struct page *page; + struct folio *folio; if (!ptep++) { ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -198,10 +198,10 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, pte_unmap_unlock(ptep, ptl); ptep = NULL; - page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, + folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, addr, &splug); - if (page) - put_page(page); + if (folio) + folio_put(folio); } if (ptep) @@ -223,17 +223,17 @@ static void shmem_swapin_range(struct vm_area_struct *vma, { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end) - 1; - struct page *page; + struct folio *folio; struct swap_iocb *splug = NULL; rcu_read_lock(); - xas_for_each(&xas, page, end_index) { + xas_for_each(&xas, folio, end_index) { unsigned long addr; swp_entry_t entry; - if (!xa_is_value(page)) + if (!xa_is_value(folio)) continue; - entry = radix_to_swp_entry(page); + entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue; @@ -243,10 +243,10 @@ static void shmem_swapin_range(struct vm_area_struct *vma, xas_pause(&xas); rcu_read_unlock(); - page = read_swap_cache_async(entry, mapping_gfp_mask(mapping), + folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), vma, addr, &splug); - if (page) - put_page(page); + if (folio) + folio_put(folio); rcu_read_lock(); } diff --git a/mm/swap.h b/mm/swap.h index 9abe4b7921ca..47bd5259c634 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -46,10 +46,9 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, struct folio *filemap_get_incore_folio(struct address_space *mapping, pgoff_t index); -struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, - struct swap_iocb **plug); +struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr, + struct swap_iocb **plug); struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists); diff --git a/mm/swap_state.c b/mm/swap_state.c index 10c6808b1dc4..6d541ad22462 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -474,9 +474,9 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * __read_swap_cache_async() call them and swap_read_folio() holds the * swap cache folio lock. 
*/ -struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, struct swap_iocb **plug) +struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr, + struct swap_iocb **plug) { struct swap_info_struct *si; bool page_allocated; @@ -497,7 +497,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, swap_read_folio(folio, false, plug); put_swap_device(si); - return folio_file_page(folio, swp_offset(entry)); + return folio; } EXPORT_SYMBOL(read_swap_cache_async); -- Gitee From 1487bd0d0f529dc272f317ad615292ab82946724 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:42 +0000 Subject: [PATCH 111/484] mm: convert swap_cluster_readahead and swap_vma_readahead to return a folio commit a4575c4138db887bd27dc7f87cf7cfb0224c6f5e upstream Conflicts: none Backport-reason: SWAP update: more folio shmem_swapin_cluster() immediately converts the page back to a folio, and swapin_readahead() may as well call folio_file_page() once instead of having each function call it. [willy@infradead.org: avoid NULL pointer deref] Link: https://lkml.kernel.org/r/ZYI7OcVlM1voKfBl@casper.infradead.org Link: https://lkml.kernel.org/r/20231213215842.671461-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 8 +++----- mm/swap.h | 6 +++--- mm/swap_state.c | 24 +++++++++++++----------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9d80ad42a5a6..1c30cc3ae1af 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1585,15 +1585,13 @@ static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, { struct mempolicy *mpol; pgoff_t ilx; - struct page *page; + struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); - page = swap_cluster_readahead(swap, gfp, mpol, ilx); + folio = swap_cluster_readahead(swap, gfp, mpol, ilx); mpol_cond_put(mpol); - if (!page) - return NULL; - return page_folio(page); + return folio; } /* diff --git a/mm/swap.h b/mm/swap.h index 47bd5259c634..81b1d887c98d 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -52,8 +52,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists); -struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct mempolicy *mpol, pgoff_t ilx); +struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, + struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); @@ -80,7 +80,7 @@ static inline void show_swap_cache_info(void) { } -static inline struct page *swap_cluster_readahead(swp_entry_t entry, +static inline struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { return NULL; diff --git a/mm/swap_state.c b/mm/swap_state.c index 6d541ad22462..f50001224108 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -569,7 +569,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * @mpol: NUMA memory allocation policy to be applied * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. 
* * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen @@ -580,7 +580,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * are used for every page of the readahead: neighbouring pages on swap * are fairly likely to have been swapped out from the same node. */ -struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, +struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { struct folio *folio; @@ -632,7 +632,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (unlikely(page_allocated)) swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); - return folio_file_page(folio, swp_offset(entry)); + return folio; } int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages) @@ -741,7 +741,7 @@ static void swap_ra_info(struct vm_fault *vmf, * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. @@ -749,9 +749,8 @@ static void swap_ra_info(struct vm_fault *vmf, * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ -static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t targ_ilx, - struct vm_fault *vmf) +static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; @@ -813,7 +812,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (unlikely(page_allocated)) swap_read_folio(folio, false, NULL); zswap_folio_swapin(folio); - return folio_file_page(folio, swp_offset(entry)); + return folio; } /** @@ -833,14 +832,17 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, { struct mempolicy *mpol; pgoff_t ilx; - struct page *page; + struct folio *folio; mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); - page = swap_use_vma_readahead() ? + folio = swap_use_vma_readahead() ? swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); - return page; + + if (!folio) + return NULL; + return folio_file_page(folio, swp_offset(entry)); } #ifdef CONFIG_SYSFS -- Gitee From 209b4231fdf2c9db0298833fba4bb0ebd50b892f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 2 Apr 2024 21:16:57 +0100 Subject: [PATCH 112/484] mm: remove struct page from get_shadow_from_swap_cache commit 4c773a44257f1f8a88ebc2c1bf67799bb4d6f409 upstream Conflicts: none Backport-reason: SWAP cleanup We don't actually use any parts of struct page; all we do is check the value of the pointer. So give the pointer the appropriate name & type. 
Link: https://lkml.kernel.org/r/20240402201659.918308-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index f50001224108..ada7d1396666 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -71,11 +71,11 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - struct page *page; + void *shadow; - page = xa_load(&address_space->i_pages, idx); - if (xa_is_value(page)) - return page; + shadow = xa_load(&address_space->i_pages, idx); + if (xa_is_value(shadow)) + return shadow; return NULL; } -- Gitee From ce1bf8c38db8339070f7d4976d63df70a34fb8df Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 17 Jan 2024 18:39:54 +0800 Subject: [PATCH 113/484] mm: memory: move mem_cgroup_charge() into alloc_anon_folio() commit 085ff35e76368455c629b194bf3cb62dd82eadf6 upstream Conflicts: none Backport-reason: Memory Cgroup minor cleanup The GFP flags from vma_thp_gfp_mask() according to user configuration only used for large folio allocation but not for memory cgroup charge, and GFP_KERNEL is used for both order-0 and large order folio when memory cgroup charge at present. However, mem_cgroup_charge() uses the GFP flags in a fairly sophisticated way. In addition to checking gfpflags_allow_blocking(), it pays attention to __GFP_NORETRY and __GFP_RETRY_MAYFAIL to ensure that processes within this memcg do not exceed their quotas. So we'd better to move mem_cgroup_charge() into alloc_anon_folio(), 1) it will make us to allocate as much as possible large order folio, because we could try the next order if mem_cgroup_charge() fails, although the memcg's memory usage is close to its limits. 2) using same GFP flags for allocation and charge is to be consistent with PMD THP firstly, in addition, according to GFP flag returned from vma_thp_gfp_mask(), GFP_TRANSHUGE_LIGHT could make us skip direct reclaim, _GFP_NORETRY will make us skip mem_cgroup_oom() and won't trigger memory cgroup oom from large order(order <= COSTLY_ORDER) folio charging. 
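For illustration, a condensed sketch of the resulting allocate-then-charge loop (simplified from the hunk below; the THP eligibility checks and CONFIG_TRANSPARENT_HUGEPAGE guards are omitted):

	gfp = vma_thp_gfp_mask(vma);
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio) {
			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
				/* charge failed: retry the next smaller order */
				folio_put(folio);
				goto next;
			}
			folio_throttle_swaprate(folio, gfp);
			clear_huge_page(&folio->page, vmf->address, 1 << order);
			return folio;
		}
next:
		order = next_order(&orders, order);
	}
	/* every large order failed or was rejected: fall back to order-0 */
	return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
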
Link: https://lkml.kernel.org/r/20240122011612.501029-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240117103954.2756050-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5b22656da571..eec3ad8f520f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4293,8 +4293,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages) static struct folio *alloc_anon_folio(struct vm_fault *vmf) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma = vmf->vma; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; @@ -4346,15 +4346,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); + goto next; + } + folio_throttle_swaprate(folio, gfp); clear_huge_page(&folio->page, vmf->address, 1 << order); return folio; } +next: order = next_order(&orders, order); } fallback: #endif - return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); + return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* @@ -4421,10 +4427,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); - if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) - goto oom_free_page; - folio_throttle_swaprate(folio, GFP_KERNEL); - /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before @@ -4478,8 +4480,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) release: folio_put(folio); goto unlock; -oom_free_page: - folio_put(folio); oom: return VM_FAULT_OOM; } -- Gitee From 17aa9533599f35b06413b959e1eb73b89e5528ff Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 29 Mar 2024 14:59:33 +0800 Subject: [PATCH 114/484] mm: huge_memory: add the missing folio_test_pmd_mappable() for THP split statistics commit 835c3a25aa373d486514e4e0f5a7450ea82ae489 upstream Conflicts: none Backport-reason: mTHP splitting Now the mTHP can also be split or added into the deferred list, so add folio_test_pmd_mappable() validation for PMD mapped THP, to avoid confusion with PMD mapped THP related statistics. 
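One detail worth spelling out (sketch only, see the hunk below): folio_test_pmd_mappable() has to be sampled before the split is attempted, because after a successful split the folio is order-0 and the test would no longer identify it as a PMD-sized THP:

	bool is_thp = folio_test_pmd_mappable(folio);	/* sample before splitting */
	...
	/* only PMD-mapped THPs feed the THP_SPLIT_PAGE* counters */
	if (is_thp)
		count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
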
[baolin.wang@linux.alibaba.com: check THP earlier in case folio is split, per Lance] Link: https://lkml.kernel.org/r/b99f8cb14bc85fdb6ab43721d1331cb5ebed2581.1713771041.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/a5341defeef27c9ac7b85c97f030f93e4368bbc1.1711694852.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Lance Yang Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0b026daa1c93..01b681097eab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2981,6 +2981,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; + bool is_thp = folio_test_pmd_mappable(folio); int extra_pins, ret; pgoff_t end; bool is_hzp; @@ -3124,7 +3125,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); + if (is_thp) + count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } @@ -3186,7 +3188,8 @@ void deferred_split_folio(struct folio *folio) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(&folio->_deferred_list)) { - count_vm_event(THP_DEFERRED_SPLIT_PAGE); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG -- Gitee From 98b5d3de39aead403bf5f96fa0c8fa4febfb815c Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 15:24:42 -0800 Subject: [PATCH 115/484] mm/swap_state: update zswap LRU's protection range with the folio locked commit 16e96ba5e92ce06b54f0862d44bb27bfa00d6c23 upstream Conflicts: none Backport-reason: ZSWAP mass udpate When a folio is swapped in, the protection size of the corresponding zswap LRU is incremented, so that the zswap shrinker is more conservative with its reclaiming action. This field is embedded within the struct lruvec, so updating it requires looking up the folio's memcg and lruvec. However, currently this lookup can happen after the folio is unlocked, for instance if a new folio is allocated, and swap_read_folio() unlocks the folio before returning. In this scenario, there is no stability guarantee for the binding between a folio and its memcg and lruvec: * A folio's memcg and lruvec can be freed between the lookup and the update, leading to a UAF. * Folio migration can clear the now-unlocked folio's memcg_data, which directs the zswap LRU protection size update towards the root memcg instead of the original memcg. This was recently picked up by the syzbot thanks to a warning in the inlined folio_lruvec() call. Move the zswap LRU protection range update above the swap_read_folio() call, and only when a new page is allocated, to prevent this. 
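A sketch of the corrected ordering (condensed from the two readahead hunks below); the important property is that the lruvec lookup now happens while the freshly allocated folio is still locked, and only for folios we actually allocated:

	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
					&page_allocated, false);
	if (unlikely(page_allocated)) {
		/* new folio, still locked: its memcg/lruvec binding is stable */
		zswap_folio_swapin(folio);
		swap_read_folio(folio, false, NULL);	/* may unlock the folio */
	}
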
[nphamcs@gmail.com: add VM_WARN_ON_ONCE() to zswap_folio_swapin()] Link: https://lkml.kernel.org/r/20240206180855.3987204-1-nphamcs@gmail.com [nphamcs@gmail.com: remove unneeded if (folio) checks] Link: https://lkml.kernel.org/r/20240206191355.83755-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240205232442.3240571-1-nphamcs@gmail.com Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Reported-by: syzbot+17a611d10af7d18a7092@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000ae47f90610803260@google.com/ Signed-off-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 10 ++++++---- mm/zswap.c | 7 +++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index ada7d1396666..f1d0d34be865 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -629,9 +629,10 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (unlikely(page_allocated)) + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); - zswap_folio_swapin(folio); + } return folio; } @@ -809,9 +810,10 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); - if (unlikely(page_allocated)) + if (unlikely(page_allocated)) { + zswap_folio_swapin(folio); swap_read_folio(folio, false, NULL); - zswap_folio_swapin(folio); + } return folio; } diff --git a/mm/zswap.c b/mm/zswap.c index 8c6c776826f7..a8ee59269fb4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -370,10 +370,9 @@ void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; - if (folio) { - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } /********************************* -- Gitee From 4119b0ba31f6f0d7caaacd29e620e1f0c6efa11f Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 2 May 2024 09:28:51 -0400 Subject: [PATCH 116/484] mm/rmap: do not add fully unmapped large folio to deferred split list commit 7491f3f34891ef8baf5418f6856af91b58f7d200 upstream Conflicts: none Backport-reason: mTHP splitting In __folio_remove_rmap(), a large folio is added to deferred split list if any page in a folio loses its final mapping. But it is possible that the folio is fully unmapped and adding it to deferred split list is unnecessary. For PMD-mapped THPs, that was not really an issue, because removing the last PMD mapping in the absence of PTE mappings would not have added the folio to the deferred split queue. However, for PTE-mapped THPs, which are now more prominent due to mTHP, they are always added to the deferred split queue. One side effect is that the THP_DEFERRED_SPLIT_PAGE stat for a PTE-mapped folio can be unintentionally increased, making it look like there are many partially mapped folios -- although the whole folio is fully unmapped stepwise. 
Core-mm now tries batch-unmapping consecutive PTEs of PTE-mapped THPs where possible starting from commit b06dc281aa99 ("mm/rmap: introduce folio_remove_rmap_[pte|ptes|pmd]()"). When it happens, a whole PTE-mapped folio is unmapped in one go and can avoid being added to deferred split list, reducing the THP_DEFERRED_SPLIT_PAGE noise. But there will still be noise when we cannot batch-unmap a complete PTE-mapped folio in one go -- or where this type of batching is not implemented yet, e.g., migration. To avoid the unnecessary addition, folio->_nr_pages_mapped is checked to tell if the whole folio is unmapped. If the folio is already on deferred split list, it will be skipped, too. Note: commit 98046944a159 ("mm: huge_memory: add the missing folio_test_pmd_mappable() for THP split statistics") tried to exclude mTHP deferred split stats from THP_DEFERRED_SPLIT_PAGE, but it does not fix the above issue. A fully unmapped PTE-mapped order-9 THP was still added to deferred split list and counted as THP_DEFERRED_SPLIT_PAGE, since nr is 512 (non zero), level is RMAP_LEVEL_PTE, and inside deferred_split_folio() the order-9 folio is folio_test_pmd_mappable(). Link: https://lkml.kernel.org/r/20240502132852.862138-1-zi.yan@sent.com Signed-off-by: Zi Yan Suggested-by: David Hildenbrand Reviewed-by: Yang Shi Reviewed-by: David Hildenbrand Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Alexander Gordeev Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/rmap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index cba599b00597..7c1cb10d0115 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1474,6 +1474,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, { atomic_t *mapped = &folio->_nr_pages_mapped; int last, nr = 0, nr_pmdmapped = 0; + bool partially_mapped = false; enum node_stat_item idx; __folio_rmap_sanity_checks(folio, page, nr_pages, level); @@ -1490,6 +1491,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, if (last) nr++; } while (page++, --nr_pages > 0); + + partially_mapped = nr && atomic_read(mapped); break; case RMAP_LEVEL_PMD: last = atomic_add_negative(-1, &folio->_entire_mapcount); @@ -1506,6 +1509,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = 0; } } + + partially_mapped = nr < nr_pmdmapped; break; } @@ -1526,10 +1531,12 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * Queue anon large folio for deferred split if at least one * page of the folio is unmapped and at least one page * is still mapped. + * + * Check partially_mapped first to ensure it is a large folio. */ - if (folio_test_large(folio) && folio_test_anon(folio)) - if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped) - deferred_split_folio(folio); + if (folio_test_anon(folio) && partially_mapped && + list_empty(&folio->_deferred_list)) + deferred_split_folio(folio); } /* -- Gitee From ef32a988d7daecf6803e33801eda4d3c1afddc4c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Mon, 6 May 2024 19:29:24 +0000 Subject: [PATCH 117/484] mm: do not update memcg stats for NR_{FILE/SHMEM}_PMDMAPPED commit 4f687281012e2da1a34cfe08ab10ea1361c600d2 upstream Conflicts: none Backport-reason: mTHP statistic: clean up and prepare. We don't have the 514462bbe927b but our NR_*_PMDMAPPED it not exposed via memcg eigher. Previously, all NR_VM_EVENT_ITEMS stats were maintained per-memcg, although some of those fields are not exposed anywhere. 
Commit 14e0f6c957e39 ("memcg: reduce memory for the lruvec and memcg stats") changed this such that we only maintain the stats we actually expose per-memcg via a translation table. Additionally, commit 514462bbe927b ("memcg: warn for unexpected events and stats") added a warning if a per-memcg stat update is attempted for a stat that is not in the translation table. The warning started firing for the NR_{FILE/SHMEM}_PMDMAPPED stat updates in the rmap code. These stats are not maintained per-memcg, and hence are not in the translation table. Do not use __lruvec_stat_mod_folio() when updating NR_FILE_PMDMAPPED and NR_SHMEM_PMDMAPPED. Use __mod_node_page_state() instead, which updates the global per-node stats only. Link: https://lkml.kernel.org/r/20240506192924.271999-1-yosryahmed@google.com Fixes: 514462bbe927 ("memcg: warn for unexpected events and stats") Signed-off-by: Yosry Ahmed Reported-by: syzbot+9319a4268a640e26b72b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/0000000000001b9d500617c8b23c@google.com Acked-by: Shakeel Butt Acked-by: David Hildenbrand Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/rmap.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7c1cb10d0115..f8cdc2045d05 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1415,13 +1415,14 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) { + pg_data_t *pgdat = folio_pgdat(folio); int nr, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); if (nr_pmdmapped) - __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? + __mod_node_page_state(pgdat, folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); @@ -1473,6 +1474,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; + pg_data_t *pgdat = folio_pgdat(folio); int last, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; enum node_stat_item idx; @@ -1515,13 +1517,14 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, } if (nr_pmdmapped) { + /* NR_{FILE/SHMEM}_PMDMAPPED are not maintained per-memcg */ if (folio_test_anon(folio)) - idx = NR_ANON_THPS; - else if (folio_test_swapbacked(folio)) - idx = NR_SHMEM_PMDMAPPED; + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, -nr_pmdmapped); else - idx = NR_FILE_PMDMAPPED; - __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); + __mod_node_page_state(pgdat, + folio_test_swapbacked(folio) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, + -nr_pmdmapped); } if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; -- Gitee From 0db3b7530d8a31872a944327cd255c07997978be Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Mon, 6 May 2024 21:13:33 +0000 Subject: [PATCH 118/484] mm: rmap: abstract updating per-node and per-memcg stats commit 15c0536fb57fd989e24335020a443486bac01dac upstream Conflicts: none Backport-reason: mTHP statistic: clean up and prepare A lot of intricacies go into updating the stats when adding or removing mappings: which stat index to use and which function. Abstract this away into a new static helper in rmap.c, __folio_mod_stat(). 
This adds an unnecessary call to folio_test_anon() in __folio_add_anon_rmap() and __folio_add_file_rmap(). However, the folio struct should already be in the cache at this point, so it shouldn't cause any noticeable overhead. No functional change intended. [hughd@google.com: fix /proc/meminfo] Link: https://lkml.kernel.org/r/49914517-dfc7-e784-fde0-0e08fafbecc2@google.com Link: https://lkml.kernel.org/r/20240506211333.346605-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Signed-off-by: Hugh Dickins Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/rmap.c | 56 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index f8cdc2045d05..7b345ae409d3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1253,6 +1253,28 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page, page); } +static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) +{ + int idx; + + if (nr) { + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; + __lruvec_stat_mod_folio(folio, idx, nr); + } + if (nr_pmdmapped) { + if (folio_test_anon(folio)) { + idx = NR_ANON_THPS; + __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + } else { + /* NR_*_PMDMAPPED are not maintained per-memcg */ + idx = folio_test_swapbacked(folio) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; + __mod_node_page_state(folio_pgdat(folio), idx, + nr_pmdmapped); + } + } +} + static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum rmap_level level) @@ -1260,10 +1282,6 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, int i, nr, nr_pmdmapped = 0; nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); - if (nr_pmdmapped) - __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); - if (nr) - __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); if (unlikely(!folio_test_anon(folio))) { VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); @@ -1281,6 +1299,8 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, __page_check_anon_rmap(folio, page, vma, address); } + __folio_mod_stat(folio, nr, nr_pmdmapped); + if (flags & RMAP_EXCLUSIVE) { switch (level) { case RMAP_LEVEL_PTE: @@ -1377,6 +1397,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { int nr = folio_nr_pages(folio); + int nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || @@ -1405,27 +1426,22 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); SetPageAnonExclusive(&folio->page); - __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); + nr_pmdmapped = nr; } - __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); + __folio_mod_stat(folio, nr, nr_pmdmapped); } static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) { - pg_data_t *pgdat = folio_pgdat(folio); int nr, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); - if (nr_pmdmapped) - __mod_node_page_state(pgdat, folio_test_swapbacked(folio) ? 
- NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); - if (nr) - __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); + __folio_mod_stat(folio, nr, nr_pmdmapped); /* See comments in folio_add_anon_rmap_*() */ if (!folio_test_large(folio)) @@ -1474,10 +1490,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; - pg_data_t *pgdat = folio_pgdat(folio); int last, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; - enum node_stat_item idx; __folio_rmap_sanity_checks(folio, page, nr_pages, level); @@ -1516,20 +1530,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, break; } - if (nr_pmdmapped) { - /* NR_{FILE/SHMEM}_PMDMAPPED are not maintained per-memcg */ - if (folio_test_anon(folio)) - __lruvec_stat_mod_folio(folio, NR_ANON_THPS, -nr_pmdmapped); - else - __mod_node_page_state(pgdat, - folio_test_swapbacked(folio) ? - NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, - -nr_pmdmapped); - } if (nr) { - idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; - __lruvec_stat_mod_folio(folio, idx, -nr); - /* * Queue anon large folio for deferred split if at least one * page of the folio is unmapped and at least one page @@ -1541,6 +1542,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, list_empty(&folio->_deferred_list)) deferred_split_folio(folio); } + __folio_mod_stat(folio, -nr, -nr_pmdmapped); /* * It would be tidy to reset folio_test_anon mapping when fully -- Gitee From b2197f21252fac35094cccd91296597471fd98f9 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:44 +0800 Subject: [PATCH 119/484] nilfs2: drop usage of page_index commit 1f49c1476d09168eeaed462e587ed9161784a561 upstream Conflicts: none Backport-reason: SWAP update: dropping page_index Patch series "mm/swap: clean up and optimize swap cache index", v6. Currently we use one swap_address_space for every 64M chunk to reduce lock contention, this is like having a set of smaller files inside a swap device. But when doing swap cache look up or insert, we are still using the offset of the whole large swap device. This is OK for correctness, as the offset (key) is unique. But Xarray is specially optimized for small indexes, it creates the redix tree levels lazily to be just enough to fit the largest key stored in one Xarray. So we are wasting tree nodes unnecessarily. For 64M chunk it should only take at most 3 level to contain everything. But if we are using the offset from the whole swap device, the offset (key) value will be way beyond 64M, and so will the tree level. Optimize this by reduce the swap cache search space into 64M scope. Test with `time memhog 128G` inside a 8G memcg using 128G swap (ramdisk with SWP_SYNCHRONOUS_IO dropped, tested 3 times, results are stable. 
The test result is similar but the improvement is smaller if SWP_SYNCHRONOUS_IO is enabled, as swap out path can never skip swap cache): Before: 6.07user 250.74system 4:17.26elapsed 99%CPU (0avgtext+0avgdata 8373376maxresident)k 0inputs+0outputs (55major+33555018minor)pagefaults 0swaps After (+1.8% faster): 6.08user 246.09system 4:12.58elapsed 99%CPU (0avgtext+0avgdata 8373248maxresident)k 0inputs+0outputs (54major+33555027minor)pagefaults 0swaps Similar result with MySQL and sysbench using swap: Before: 94055.61 qps After (+0.8% faster): 94834.91 qps There is alse a very slight drop of radix tree node slab usage: Before: 303952K After: 302224K For this series: There are multiple places that expect mixed type of pages (page cache or swap cache), eg. migration, huge memory split; There are four helpers for that: - page_index - page_file_offset - folio_index - folio_file_pos To keep the code clean and compatible, this series first cleaned up usage of them. page_file_offset and folio_file_pos are historical helpes that can be simply dropped after clean up. And page_index can be all converted to folio_index or folio->index. Then introduce two new helpers swap_cache_index and swap_dev_pos for swap. Replace swp_offset with swap_cache_index when used to retrieve folio from swap cache, and use swap_dev_pos when needed to retrieve the device position of a swap entry. This way, swap_cache_index can return the optimized value with no compatibility issue. The result is better performance and reduced LOC. Idealy, in the future, we may want to reduce SWAP_ADDRESS_SPACE_SHIFT from 14 to 12: Default Xarray chunk offset is 6, so we have 3 level trees instead of 2 level trees just for 2 extra bits. But swap cache is based on address_space struct, with 4 times more metadata sparsely distributed in memory it waste more cacheline, the performance gain from this series is almost canceled according to my test. So first, just have a cleaner seperation of offsets and smaller search space. This patch (of 10): page_index is only for mixed usage of page cache and swap cache, for pure page cache usage, the caller can just use page->index instead. It can't be a swap cache page here (being part of buffer head), so just drop it. And while we are at it, optimize the code by retrieving the offset of the buffer head within the folio directly using bh_offset, and get rid of the loop and usage of page helpers. 
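To put rough numbers on the tree-level argument above (back-of-the-envelope, using only values already mentioned in this series): one swap_address_space covers 64M of swap, i.e. 2^14 page slots with SWAP_ADDRESS_SPACE_SHIFT = 14. With the default Xarray chunk shift of 6, a tree of height h holds indexes below 2^(6*h), so keys confined to [0, 2^14) need only ceil(14/6) = 3 levels. If the raw device offset is used as the key instead, the address spaces covering the upper part of a 128G device (2^25 pages) store keys of up to about 2^25 and grow to ceil(25/6) = 5 levels for the same number of entries, which is where the extra radix tree nodes come from.
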
Link: https://lkml.kernel.org/r/20240521175854.96038-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20240521175854.96038-3-ryncsn@gmail.com Suggested-by: Matthew Wilcox Signed-off-by: Kairui Song Acked-by: Ryusuke Konishi Cc: Ryusuke Konishi Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/nilfs2/bmap.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 7a8f166f2c8d..b6a0a6ef9f9f 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -450,15 +450,9 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, const struct buffer_head *bh) { - struct buffer_head *pbh; - __u64 key; + loff_t pos = folio_pos(bh->b_folio) + bh_offset(bh); - key = page_index(bh->b_page) << (PAGE_SHIFT - - bmap->b_inode->i_blkbits); - for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page) - key++; - - return key; + return pos >> bmap->b_inode->i_blkbits; } __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) -- Gitee From 707a35117ff3027cb3ac845838dc16a86f494f43 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 29 Aug 2025 10:54:34 +0800 Subject: [PATCH 120/484] nilfs2: drop more usage of page_index Upstream: no Downstream have a few more page_index callers, remove them. Signed-off-by: Kairui Song --- fs/nilfs2/btnode.c | 2 +- fs/nilfs2/page.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 3dda7c39089b..07baac9d06f4 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -160,7 +160,7 @@ void nilfs_btnode_delete(struct buffer_head *bh) { struct address_space *mapping; struct page *page = bh->b_page; - pgoff_t index = page_index(page); + pgoff_t index = page->index; int still_dirty; get_page(page); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 144e200c4909..077b85ebbecd 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -465,7 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page) if (mapping) { xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - __xa_clear_mark(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page->index, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); -- Gitee From bddb9b6a3b15799448ab298c081a2361f6c4626a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:45 +0800 Subject: [PATCH 121/484] ceph: drop usage of page_index commit 5e425300af15f8fcf6d0fec188c8bf5207c1456d upstream Conflicts: none Backport-reason: SWAP update: dropping page_index page_index is needed for mixed usage of page cache and swap cache, for pure page cache usage, the caller can just use page->index instead. It can't be a swap cache page here, so just drop it. 
Link: https://lkml.kernel.org/r/20240521175854.96038-4-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Xiubo Li Cc: Ilya Dryomov Cc: Jeff Layton Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jaegeuk Kim Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/ceph/dir.c | 2 +- fs/ceph/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1395b71df5cc..b78fc6b62701 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -138,7 +138,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, if (ptr_pos >= i_size_read(dir)) return NULL; - if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) { ceph_readdir_cache_release(cache_ctl); cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); if (!cache_ctl->page) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f0befbeb6cb8..f9f1b0e4d587 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1830,7 +1830,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, unsigned idx = ctl->index % nsize; pgoff_t pgoff = ctl->index / nsize; - if (!ctl->page || pgoff != page_index(ctl->page)) { + if (!ctl->page || pgoff != ctl->page->index) { ceph_readdir_cache_release(ctl); if (idx == 0) ctl->page = grab_cache_page(&dir->i_data, pgoff); -- Gitee From cb1d85db3811f0c2d93748353b609e69f058a989 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:46 +0800 Subject: [PATCH 122/484] NFS: remove nfs_page_lengthg and usage of page_index commit 8586be3ddf9a6133ec9f5915b35cd4e704d4131b upstream Conflicts: none Backport-reason: SWAP update: dropping page_index This function is no longer used after commit 4fa7a717b432 ("NFS: Fix up nfs_vm_page_mkwrite() for folios"), all users have been converted to use folio instead, just delete it to remove usage of page_index. 
Link: https://lkml.kernel.org/r/20240521175854.96038-5-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Trond Myklebust Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/nfs/internal.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c29ad2e1d416..ce194656ae52 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -790,25 +790,6 @@ static inline void nfs_folio_mark_unstable(struct folio *folio, } } -/* - * Determine the number of bytes of data the page contains - */ -static inline -unsigned int nfs_page_length(struct page *page) -{ - loff_t i_size = i_size_read(page_file_mapping(page)->host); - - if (i_size > 0) { - pgoff_t index = page_index(page); - pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT; - if (index < end_index) - return PAGE_SIZE; - if (index == end_index) - return ((i_size - 1) & ~PAGE_MASK) + 1; - } - return 0; -} - /* * Determine the number of bytes of data the page contains */ -- Gitee From cbd306bd8024e29429729ee7ad7e0109fad12896 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 29 Aug 2025 10:56:40 +0800 Subject: [PATCH 123/484] f2fs: drop usage of page_index Upstream: no Removing page_index, it's for mixed usage of swap cache and page cache but we will never want to mix them together. Signed-off-by: Kairui Song --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d863515ca1d6..b39b13ef02c5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2045,7 +2045,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, sector_t block_nr; int ret = 0; - block_in_file = (sector_t)page_index(page); + block_in_file = (sector_t)page->index; last_block = block_in_file + nr_pages; last_block_in_file = bytes_to_blks(inode, f2fs_readpage_limit(inode) + blocksize - 1); @@ -4081,7 +4081,7 @@ void f2fs_clear_page_cache_dirty_tag(struct page *page) unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - __xa_clear_mark(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page->index, PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } -- Gitee From 3aa76cd12308a759ca76f9670d3f7911d7aae01a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 29 Aug 2025 10:58:36 +0800 Subject: [PATCH 124/484] btrfs: drop usage of page_index Upstream: page_index is also dropped but in a different approach page_index is for mixed usage of swap cache and page cache but we will never want to mix them up. 
Signed-off-by: Kairui Song --- fs/btrfs/defrag.c | 2 +- fs/btrfs/extent_io.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index e1475dfdf7a8..dc82fd6c4741 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1028,7 +1028,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, const u64 len = target->len; unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); + unsigned long first_index = pages[0]->index; int ret = 0; int i; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 48b06459bc48..57f96131c4bd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3918,7 +3918,7 @@ static void btree_clear_page_dirty(struct page *page) xa_lock_irq(&page->mapping->i_pages); if (!PageDirty(page)) __xa_clear_mark(&page->mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); + page->index, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&page->mapping->i_pages); } -- Gitee From 8323a0ba7e59992241688b0b098c5f0d7c359d64 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:47 +0800 Subject: [PATCH 125/484] afs: drop usage of folio_file_pos commit d4f439865f97492c1931962c8a90fc9d7204d655 upstream Conflicts: none Backport-reason: SWAP update: dropping page_index folio_file_pos is only needed for mixed usage of page cache and swap cache, for pure page cache usage, the caller can just use folio_pos instead. It can't be a swap cache page here. Swap mapping may only call into fs through swap_rw and that is not supported for afs. So just drop it and use folio_pos instead. Link: https://lkml.kernel.org/r/20240521175854.96038-6-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: David Howells Cc: Marc Dionne Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/afs/dir.c | 6 +++--- fs/afs/dir_edit.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index cdd2abdc8975..e57e71e2b18f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -534,14 +534,14 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, break; } - offset = round_down(ctx->pos, sizeof(*dblock)) - folio_file_pos(folio); + offset = round_down(ctx->pos, sizeof(*dblock)) - folio_pos(folio); size = min_t(loff_t, folio_size(folio), - req->actual_len - folio_file_pos(folio)); + req->actual_len - folio_pos(folio)); do { dblock = kmap_local_folio(folio, offset); ret = afs_dir_iterate_block(dvnode, ctx, dblock, - folio_file_pos(folio) + offset); + folio_pos(folio) + offset); kunmap_local(dblock); if (ret != 1) goto out; diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 1dcc75fd0cee..fe223fb78111 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -256,7 +256,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, folio = folio0; } - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio)); + block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); /* Abandon the edit if we got a callback break. 
*/ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) @@ -417,7 +417,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, folio = folio0; } - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio)); + block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) -- Gitee From 25773752331a079e8c9de1a3f6767b4ccaf5f6c0 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:49 +0800 Subject: [PATCH 126/484] nfs: drop usage of folio_file_pos commit 237d29075ca71feeadf38e801ef657858d9e9598 upstream Conflicts: resolved Backport-reason: SWAP update: dropping page_index folio_file_pos is only needed for mixed usage of page cache and swap cache, for pure page cache usage, the caller can just use folio_pos instead. After commit e1209d3a7a67 ("mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space"), swap cache should never be exposed to nfs. So remove the usage of folio_file_pos in following NFS functions / helpers: - nfs_vm_page_mkwrite It's only used by nfs_file_vm_ops.page_mkwrite - trace event helper: nfs_folio_event - trace event helper: nfs_folio_event_done These two are used through DEFINE_NFS_FOLIO_EVENT and DEFINE_NFS_FOLIO_EVENT_DONE, which defined following events: - trace_nfs_aop_readpage{_done}: only called by nfs_read_folio - trace_nfs_writeback_folio: only called by nfs_wb_folio - trace_nfs_invalidate_folio: only called by nfs_invalidate_folio - trace_nfs_launder_folio_done: only called by nfs_launder_folio None of them could possibly be used on swap cache folio, nfs_read_folio only called by: .write_begin -> nfs_read_folio .read_folio nfs_wb_folio only called by nfs mapping: .release_folio -> nfs_wb_folio .launder_folio -> nfs_wb_folio .write_begin -> nfs_read_folio -> nfs_wb_folio .read_folio -> nfs_wb_folio .write_end -> nfs_update_folio -> nfs_writepage_setup -> nfs_setup_write_request -> nfs_try_to_update_request -> nfs_wb_folio .page_mkwrite -> nfs_update_folio -> nfs_writepage_setup -> nfs_setup_write_request -> nfs_try_to_update_request -> nfs_wb_folio .write_begin -> nfs_flush_incompatible -> nfs_wb_folio .page_mkwrite -> nfs_vm_page_mkwrite -> nfs_flush_incompatible -> nfs_wb_folio nfs_invalidate_folio is only called by .invalidate_folio. nfs_launder_folio is only called by .launder_folio - nfs_grow_file - nfs_update_folio nfs_grow_file is only called by nfs_update_folio, and all possible callers of them are: .write_end -> nfs_update_folio .page_mkwrite -> nfs_update_folio - nfs_wb_folio_cancel .invalidate_folio -> nfs_wb_folio_cancel Also, seeing from the swap side, swap_rw is now the only interface calling into fs, the offset info is always in iocb.ki_pos now. So we can remove all these folio_file_pos call safely. 
Link: https://lkml.kernel.org/r/20240521175854.96038-8-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Trond Myklebust Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/nfs/file.c | 2 +- fs/nfs/write.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4994b2873970..5ebd077d7551 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -591,7 +591,7 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, - (long long)folio_file_pos(folio)); + (long long)folio_pos(folio)); sb_start_pagefault(inode->i_sb); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6bc36cef14cd..a7110d3f0221 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -281,7 +281,7 @@ static void nfs_grow_file(struct folio *folio, unsigned int offset, end_index = ((i_size - 1) >> folio_shift(folio)) << folio_order(folio); if (i_size > 0 && folio_index(folio) < end_index) goto out; - end = folio_file_pos(folio) + (loff_t)offset + (loff_t)count; + end = folio_pos(folio) + (loff_t)offset + (loff_t)count; if (i_size >= end) goto out; trace_nfs_size_grow(inode, end); @@ -1377,7 +1377,7 @@ int nfs_update_folio(struct file *file, struct folio *folio, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, - (long long)(folio_file_pos(folio) + offset)); + (long long)(folio_pos(folio) + offset)); if (!count) goto out; -- Gitee From ab270e1960f017b2c14e3b1b05555b74bb630a53 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:50 +0800 Subject: [PATCH 127/484] mm/swap: get the swap device offset directly commit 545ebe71d38c13c76c20dfd84d3397dc14a129e3 upstream Conflicts: none Backport-reason: SWAP update: dropping page_index folio_file_pos and page_file_offset are for mixed usage of swap cache and page cache, it can't be page cache here, so introduce a new helper to get the swap offset in swap device directly. Need to include swapops.h in mm/swap.h to ensure swp_offset is always defined before use. Link: https://lkml.kernel.org/r/20240521175854.96038-9-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 6 +++--- mm/swap.h | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 92e329007eba..c3cbdfe4af75 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -274,7 +274,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) * be temporary. 
*/ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", - ret, page_file_offset(page)); + ret, swap_dev_pos(page_swap_entry(page))); for (p = 0; p < sio->pages; p++) { page = sio->bvec[p].bv_page; set_page_dirty(page); @@ -293,7 +293,7 @@ static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc struct swap_iocb *sio = NULL; struct swap_info_struct *sis = swp_swap_info(folio->swap); struct file *swap_file = sis->swap_file; - loff_t pos = folio_file_pos(folio); + loff_t pos = swap_dev_pos(folio->swap); count_swpout_vm_event(folio); folio_start_writeback(folio); @@ -436,7 +436,7 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_iocb *sio = NULL; - loff_t pos = folio_file_pos(folio); + loff_t pos = swap_dev_pos(folio->swap); if (plug) sio = *plug; diff --git a/mm/swap.h b/mm/swap.h index 81b1d887c98d..dae5a7594c17 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -5,6 +5,7 @@ struct mempolicy; #ifdef CONFIG_SWAP +#include /* for swp_offset */ #include /* for bio_end_io_t */ /* linux/mm/page_io.c */ @@ -31,6 +32,14 @@ extern struct address_space *swapper_spaces[]; (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ >> SWAP_ADDRESS_SPACE_SHIFT]) +/* + * Return the swap device position of the swap entry. + */ +static inline loff_t swap_dev_pos(swp_entry_t entry) +{ + return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; +} + void show_swap_cache_info(void); void *get_shadow_from_swap_cache(swp_entry_t entry); int add_to_swap_cache(struct folio *folio, swp_entry_t entry, -- Gitee From 0c169f954b1686ba8cce37da4846a0d2da6cf658 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 29 Aug 2025 01:24:54 +0800 Subject: [PATCH 128/484] fs: drop the last two users of folio_file_pos / page_file_offset Upstream: no These two helpers are for mixed usage of swap cache and page cache. We no longer have any fs that accesses the swap cache space directly, so remove them. 
Signed-off-by: Kairui Song --- fs/netfs/buffered_read.c | 4 ++-- fs/smb/client/file.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 2cd3ccf4c439..a4bde65bc1a5 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -231,7 +231,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) _enter("%lx", folio_index(folio)); rreq = netfs_alloc_request(mapping, file, - folio_file_pos(folio), folio_size(folio), + folio_pos(folio), folio_size(folio), NETFS_READPAGE); if (IS_ERR(rreq)) { ret = PTR_ERR(rreq); @@ -381,7 +381,7 @@ int netfs_write_begin(struct netfs_inode *ctx, } rreq = netfs_alloc_request(mapping, file, - folio_file_pos(folio), folio_size(folio), + folio_pos(folio), folio_size(folio), NETFS_READ_FOR_WRITE); if (IS_ERR(rreq)) { ret = PTR_ERR(rreq); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index d883ed75022c..4d9c6d4b37b0 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -4852,7 +4852,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, static int cifs_read_folio(struct file *file, struct folio *folio) { struct page *page = &folio->page; - loff_t offset = page_file_offset(page); + loff_t offset = page_offset(page); int rc = -EACCES; unsigned int xid; -- Gitee From 283b44be69e80d54e84193981272136a2a8bda85 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:51 +0800 Subject: [PATCH 129/484] mm: remove page_file_offset and folio_file_pos commit 564a2ee9f9f66ceef135b7208cff705d48ad76c4 upstream Conflicts: none Backport-reason: SWAP update: dropping page_index These two helpers were useful for mixed usage of swap cache and page cache, which help retrieve the corresponding file or swap device offset of a page or folio. They were introduced in commit f981c5950fa8 ("mm: methods for teaching filesystems about PG_swapcache pages") and used in commit d56b4ddf7781 ("nfs: teach the NFS client how to treat PG_swapcache pages"), suppose to be used with direct_IO for swap over fs. But after commit e1209d3a7a67 ("mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space"), swap with direct_IO is no more, and swap cache mapping is never exposed to fs. Now we have dropped all users of page_file_offset and folio_file_pos, so they can be deleted. Link: https://lkml.kernel.org/r/20240521175854.96038-10-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pagemap.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 863d0dd1eec3..5f8877d0e9ea 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -930,11 +930,6 @@ static inline loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_SHIFT; } -static inline loff_t page_file_offset(struct page *page) -{ - return ((loff_t)page_index(page)) << PAGE_SHIFT; -} - /** * folio_pos - Returns the byte position of this folio in its file. * @folio: The folio. 
@@ -944,18 +939,6 @@ static inline loff_t folio_pos(struct folio *folio) return page_offset(&folio->page); } -/** - * folio_file_pos - Returns the byte position of this folio in its file. - * @folio: The folio. - * - * This differs from folio_pos() for folios which belong to a swap file. - * NFS is the only filesystem today which needs to use folio_file_pos(). - */ -static inline loff_t folio_file_pos(struct folio *folio) -{ - return page_file_offset(&folio->page); -} - /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ -- Gitee From 7aec7eb75b4691e752f406c9261a188a25d7f37a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:52 +0800 Subject: [PATCH 130/484] mm: drop page_index and simplify folio_index commit 05b0c7edad9b8a5ccf1b46b01e1b96fcd10b50d8 upstream Conflicts: none Backport-reason: SWAP update: dropping page_index There are two helpers for retrieving the index within address space for mixed usage of swap cache and page cache: - page_index - folio_index This commit drops page_index, as we have eliminated all users, and converts folio_index's helper __page_file_index to use folio to avoid the page conversion. Link: https://lkml.kernel.org/r/20240521175854.96038-11-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mm.h | 13 ------------- include/linux/pagemap.h | 8 ++++---- mm/swapfile.c | 7 +++---- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d20c51baa3f..e97c091acf5a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2250,19 +2250,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern pgoff_t __page_file_index(struct page *page); - -/* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use swp_offset(->private) - */ -static inline pgoff_t page_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); - return page->index; -} - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 5f8877d0e9ea..9c169c9d09cb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -790,7 +790,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -#define swapcache_index(folio) __page_file_index(&(folio)->page) +extern pgoff_t __folio_swap_cache_index(struct folio *folio); /** * folio_index - File index of a folio. 
@@ -805,9 +805,9 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, */ static inline pgoff_t folio_index(struct folio *folio) { - if (unlikely(folio_test_swapcache(folio))) - return swapcache_index(folio); - return folio->index; + if (unlikely(folio_test_swapcache(folio))) + return __folio_swap_cache_index(folio); + return folio->index; } /** diff --git a/mm/swapfile.c b/mm/swapfile.c index 0bad74825a83..7e8e859e3067 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3554,12 +3554,11 @@ struct address_space *swapcache_mapping(struct folio *folio) } EXPORT_SYMBOL_GPL(swapcache_mapping); -pgoff_t __page_file_index(struct page *page) +pgoff_t __folio_swap_cache_index(struct folio *folio) { - swp_entry_t swap = page_swap_entry(page); - return swp_offset(swap); + return swp_offset(folio->swap); } -EXPORT_SYMBOL_GPL(__page_file_index); +EXPORT_SYMBOL_GPL(__folio_swap_cache_index); /* * add_swap_count_continuation - called when a swap count is duplicated -- Gitee From 26f8a44a9c28d3f370dcf27ebcf6f0366d32af04 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 1 Sep 2025 15:40:33 +0800 Subject: [PATCH 131/484] mm/swap: reduce swap cache search space commit 7aad25b4b47ea5b67e1eb8be0db211b899dce60d upstream Conflicts: minor Backport-reason: SWAP update: dropping page_index Currently we use one swap_address_space for every 64M chunk to reduce lock contention, this is like having a set of smaller swap files inside one swap device. But when doing swap cache look up or insert, we are still using the offset of the whole large swap device. This is OK for correctness, as the offset (key) is unique. But Xarray is specially optimized for small indexes, it creates the radix tree levels lazily to be just enough to fit the largest key stored in one Xarray. So we are wasting tree nodes unnecessarily. For 64M chunk it should only take at most 3 levels to contain everything. But if we are using the offset from the whole swap device, the offset (key) value will be way beyond 64M, and so will the tree level. Optimize this by using a new helper swap_cache_index to get a swap entry's unique offset in its own 64M swap_address_space. I see a ~1% performance gain in benchmark and actual workload with high memory pressure. Test with `time memhog 128G` inside a 8G memcg using 128G swap (ramdisk with SWP_SYNCHRONOUS_IO dropped, tested 3 times, results are stable. The test result is similar but the improvement is smaller if SWP_SYNCHRONOUS_IO is enabled, as swap out path can never skip swap cache): Before: 6.07user 250.74system 4:17.26elapsed 99%CPU (0avgtext+0avgdata 8373376maxresident)k 0inputs+0outputs (55major+33555018minor)pagefaults 0swaps After (1.8% faster): 6.08user 246.09system 4:12.58elapsed 99%CPU (0avgtext+0avgdata 8373248maxresident)k 0inputs+0outputs (54major+33555027minor)pagefaults 0swaps Similar result with MySQL and sysbench using swap: Before: 94055.61 qps After (0.8% faster): 94834.91 qps Radix tree slab usage is also very slightly lower. 
Link: https://lkml.kernel.org/r/20240521175854.96038-12-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 2 +- mm/memcontrol.c | 2 +- mm/mincore.c | 2 +- mm/shmem.c | 2 +- mm/swap.h | 15 +++++++++++++++ mm/swap_state.c | 17 +++++++++-------- mm/swapfile.c | 6 +++--- 7 files changed, 31 insertions(+), 15 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 01b681097eab..67900a500c9c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2864,7 +2864,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, split_page_memcg(head, nr); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { - offset = swp_offset(folio->swap); + offset = swap_cache_index(folio->swap); swap_cache = swap_address_space(folio->swap); xa_lock(&swap_cache->i_pages); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3f3031a92be7..7077841754bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7741,7 +7741,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because swap_cache_get_folio() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ - page = find_get_page(swap_address_space(ent), swp_offset(ent)); + page = find_get_page(swap_address_space(ent), swap_cache_index(ent)); entry->val = ent.val; return page; diff --git a/mm/mincore.c b/mm/mincore.c index dad3622cc963..e31cf1bde614 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -139,7 +139,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { #ifdef CONFIG_SWAP *vec = mincore_page(swap_address_space(entry), - swp_offset(entry)); + swap_cache_index(entry)); #else WARN_ON(1); *vec = 1; diff --git a/mm/shmem.c b/mm/shmem.c index 1c30cc3ae1af..ec2c1a86d7e6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1771,7 +1771,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, old = *foliop; entry = old->swap; - swap_index = swp_offset(entry); + swap_index = swap_cache_index(entry); swap_mapping = swap_address_space(entry); /* diff --git a/mm/swap.h b/mm/swap.h index dae5a7594c17..27468f7cc101 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -27,6 +27,7 @@ int __swap_writepage(struct folio *folio, struct writeback_control *wbc); /* One swap address space for each 64M swap space */ #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) +#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) extern struct address_space *swapper_spaces[]; #define swap_address_space(entry) \ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ @@ -40,6 +41,15 @@ static inline loff_t swap_dev_pos(swp_entry_t entry) return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } +/* + * Return the swap cache index of the swap entry. 
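As an editorial illustration (the `entry` below is any valid swp_entry_t; the locals are hypothetical), the lookup key now splits into an address-space selector plus a small in-space index, so each XArray only ever holds keys below SWAP_ADDRESS_SPACE_PAGES:

	struct address_space *space = swap_address_space(entry);	/* offset >> 14 picks the 64M space */
	pgoff_t idx = swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;	/* low 14 bits index inside it */
	struct folio *folio = filemap_get_folio(space, idx);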
+ */ +static inline pgoff_t swap_cache_index(swp_entry_t entry) +{ + BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK); + return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; +} + void show_swap_cache_info(void); void *get_shadow_from_swap_cache(swp_entry_t entry); int add_to_swap_cache(struct folio *folio, swp_entry_t entry, @@ -85,6 +95,11 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } +static inline pgoff_t swap_cache_index(swp_entry_t entry) +{ + return 0; +} + static inline void show_swap_cache_info(void) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index f1d0d34be865..f67b59cc6185 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -70,7 +70,7 @@ void show_swap_cache_info(void) void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swp_offset(entry); + pgoff_t idx = swap_cache_index(entry); void *shadow; shadow = xa_load(&address_space->i_pages, idx); @@ -87,7 +87,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swp_offset(entry); + pgoff_t idx = swap_cache_index(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); unsigned long i, nr = folio_nr_pages(folio); void *old; @@ -143,7 +143,7 @@ void __delete_from_swap_cache(struct folio *folio, struct address_space *address_space = swap_address_space(entry); int i; long nr = folio_nr_pages(folio); - pgoff_t idx = swp_offset(entry); + pgoff_t idx = swap_cache_index(entry); XA_STATE(xas, &address_space->i_pages, idx); xas_set_update(&xas, workingset_update_node); @@ -192,13 +192,14 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, for (;;) { swp_entry_t entry = swp_entry(type, curr); + unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; struct address_space *address_space = swap_address_space(entry); - XA_STATE(xas, &address_space->i_pages, curr); + XA_STATE(xas, &address_space->i_pages, index); xas_set_update(&xas, workingset_update_node); xa_lock_irq(&address_space->i_pages); - xas_for_each(&xas, old, end) { + xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { if (!xa_is_value(old)) continue; xas_store(&xas, NULL); @@ -285,7 +286,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, { struct folio *folio; - folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -355,7 +356,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, si = get_swap_device(swp); if (!si) return ERR_PTR(-ENOENT); - index = swp_offset(swp); + index = swap_cache_index(swp); folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); return folio; @@ -379,7 +380,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * that would confuse statistics. 
*/ folio = filemap_get_folio(swap_address_space(entry), - swp_offset(entry)); + swap_cache_index(entry)); if (!IS_ERR(folio)) goto got_folio; diff --git a/mm/swapfile.c b/mm/swapfile.c index 7e8e859e3067..190660b7a010 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -201,8 +201,8 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, int ret, nr_pages; bool need_reclaim; - folio = filemap_get_folio(address_space, offset); again: + folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0; @@ -2255,7 +2255,7 @@ static int try_to_unuse(unsigned int type) (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - folio = filemap_get_folio(swap_address_space(entry), i); + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (IS_ERR(folio)) continue; @@ -3556,7 +3556,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __folio_swap_cache_index(struct folio *folio) { - return swp_offset(folio->swap); + return swap_cache_index(folio->swap); } EXPORT_SYMBOL_GPL(__folio_swap_cache_index); -- Gitee From f2b6279c8df172e2b3fde5dc17672e65504d55ef Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Wed, 29 May 2024 20:28:19 +1200 Subject: [PATCH 132/484] mm: swap: introduce swap_free_nr() for batched swap_free() commit ebfba0045176cb013f49cb3e5bd9f0b16eba203c upstream Conflicts: major, rewrite cluster_swap_free_nr completely Backport-reason: SWAP update: mTHP swapin support Patch series "large folios swap-in: handle refault cases first", v5. This patchset is extracted from the large folio swapin series[1], primarily addressing the handling of scenarios involving large folios in the swap cache. Currently, it is particularly focused on addressing the refaulting of mTHP, which is still undergoing reclamation. This approach aims to streamline code review and expedite the integration of this segment into the MM tree. It relies on Ryan's swap-out series[2], leveraging the helper function swap_pte_batch() introduced by that series. Presently, do_swap_page only encounters a large folio in the swap cache before the large folio is released by vmscan. However, the code should remain equally useful once we support large folio swap-in via swapin_readahead(). This approach can effectively reduce page faults and eliminate most redundant checks and early exits for MTE restoration in recent MTE patchset[3]. The large folio swap-in for SWP_SYNCHRONOUS_IO and swapin_readahead() will be split into separate patch sets and sent at a later time. [1] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/ [2] https://lore.kernel.org/linux-mm/20240408183946.2991168-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-mm/20240322114136.61386-1-21cnbao@gmail.com/ This patch (of 6): While swapping in a large folio, we need to free swaps related to the whole folio. To avoid frequently acquiring and releasing swap locks, it is better to introduce an API for batched free. Furthermore, this new function, swap_free_nr(), is designed to efficiently handle various scenarios for releasing a specified number, nr, of swap entries. 
Link: https://lkml.kernel.org/r/20240529082824.150954-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240529082824.150954-2-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: Chris Li Reviewed-by: "Huang, Ying" Cc: Baolin Wang Cc: David Hildenbrand Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Cc: Andreas Larsson Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Khalid Aziz Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 5 +++++ mm/swapfile.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/linux/swap.h b/include/linux/swap.h index 60cfa10fc2ef..d85808b46dc4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -518,6 +518,7 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); +extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); @@ -598,6 +599,10 @@ static inline void swap_free(swp_entry_t swp) { } +static inline void swap_free_nr(swp_entry_t entry, int nr_pages) +{ +} + static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } diff --git a/mm/swapfile.c b/mm/swapfile.c index 190660b7a010..1820c828af23 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1494,6 +1494,43 @@ void swap_free(swp_entry_t entry) } EXPORT_SYMBOL(swap_free); +static void cluster_swap_free_nr(struct swap_info_struct *si, + unsigned long offset, int nr_pages, + unsigned char usage) +{ + struct swap_cluster_info *ci; + unsigned long end = offset + nr_pages; + + ci = lock_cluster(si, offset); + do { + if (!__swap_entry_free_locked(si, offset, usage)) + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + } while (++offset < end); + unlock_cluster(ci); +} + +/* + * Caller has made sure that the swap device corresponding to entry + * is still around or has not been recycled. + */ +void swap_free_nr(swp_entry_t entry, int nr_pages) +{ + int nr; + struct swap_info_struct *sis; + unsigned long offset = swp_offset(entry); + + sis = _swap_info_get(entry); + if (!sis) + return; + + while (nr_pages) { + nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); + cluster_swap_free_nr(sis, offset, nr, 1); + offset += nr; + nr_pages -= nr; + } +} + /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -- Gitee From 56114d7e8b72b2d3fa4d0289483b01d22c999cb8 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:20 +1200 Subject: [PATCH 133/484] mm: remove the implementation of swap_free() and always use swap_free_nr() commit 54f7a49c20ebb5189980c53e6e66709d22bee572 upstream Conflicts: minor, swap_free is EXPORT_SYMBOL in downstream, we'll update the module that is using it then. Backport-reason: SWAP update: mTHP swapin support To streamline maintenance efforts, we propose removing the implementation of swap_free(). Instead, we can simply invoke swap_free_nr() with nr set to 1. swap_free_nr() is designed with a bitmap consisting of only one long, resulting in overhead that can be ignored for cases where nr equals 1. 
A prime candidate for leveraging swap_free_nr() lies within kernel/power/swap.c. Implementing this change facilitates the adoption of batch processing for hibernation. Link: https://lkml.kernel.org/r/20240529082824.150954-3-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Acked-by: Chris Li Reviewed-by: Ryan Roberts Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Andreas Larsson Cc: Baolin Wang Cc: Chuanhua Han Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 10 +++++----- kernel/power/swap.c | 5 ++--- mm/swapfile.c | 14 -------------- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d85808b46dc4..43201dcee18b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -517,7 +517,6 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); -extern void swap_free(swp_entry_t); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -595,10 +594,6 @@ static inline int swapcache_prepare(swp_entry_t swp) return 0; } -static inline void swap_free(swp_entry_t swp) -{ -} - static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } @@ -645,6 +640,11 @@ static inline void free_swap_and_cache(swp_entry_t entry) free_swap_and_cache_nr(entry, 1); } +static inline void swap_free(swp_entry_t entry) +{ + swap_free_nr(entry, 1); +} + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d71c590550d2..b1896346fde1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -201,12 +201,11 @@ void free_all_swap_pages(int swap) while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; - unsigned long offset; ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); + swap_free_nr(swp_entry(swap, ext->start), + ext->end - ext->start + 1); kfree(ext); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 1820c828af23..af617aa03180 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1480,20 +1480,6 @@ static void swap_entry_range_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -/* - * Caller has made sure that the swap device corresponding to entry - * is still around or has not been recycled. 
- */ -void swap_free(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) - __swap_entry_free(p, entry); -} -EXPORT_SYMBOL(swap_free); - static void cluster_swap_free_nr(struct swap_info_struct *si, unsigned long offset, int nr_pages, unsigned char usage) -- Gitee From c2ee23bdf71a57b814c61b2236305fff66e8055a Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:21 +1200 Subject: [PATCH 134/484] mm: introduce pte_move_swp_offset() helper which can move offset bidirectionally commit 3f9abcaa3e9c3910893ccbe6085aa0452e72896d upstream Conflicts: none Backport-reason: SWAP update: mTHP swapin support There could arise a necessity to obtain the first pte_t from a swap pte_t located in the middle. For instance, this may occur within the context of do_swap_page(), where a page fault can potentially occur in any PTE of a large folio. To address this, the following patch introduces pte_move_swp_offset(), a function capable of bidirectional movement by a specified delta argument. Consequently, pte_next_swp_offset() will directly invoke it with delta = 1. Link: https://lkml.kernel.org/r/20240529082824.150954-4-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: "Huang, Ying" Reviewed-by: Ryan Roberts Reviewed-by: "Huang, Ying" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: Chuanhua Han Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/internal.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index d6d174353eaa..b1be18013b40 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -231,18 +231,21 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, } /** - * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. + * pte_move_swp_offset - Move the swap entry offset field of a swap pte + * forward or backward by delta * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. + * @delta: The direction and the offset we are moving; forward if delta + * is positive; backward if delta is negative * - * Increments the swap offset, while maintaining all other fields, including + * Moves the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ -static inline pte_t pte_next_swp_offset(pte_t pte) +static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { swp_entry_t entry = pte_to_swp_entry(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), - (swp_offset(entry) + 1))); + (swp_offset(entry) + delta))); if (pte_swp_soft_dirty(pte)) new = pte_swp_mksoft_dirty(new); @@ -254,6 +257,20 @@ static inline pte_t pte_next_swp_offset(pte_t pte) return new; } + +/** + * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. + * @pte: The initial pte state; is_swap_pte(pte) must be true and + * non_swap_entry() must be false. + * + * Increments the swap offset, while maintaining all other fields, including + * swap type, and any swp pte bits. The resulting pte is returned. 
+ */ +static inline pte_t pte_next_swp_offset(pte_t pte) +{ + return pte_move_swp_offset(pte, 1); +} + /** * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries * @start_ptep: Page table pointer for the first entry. -- Gitee From 0be6bb1d1789286352776d01c8c7a68a6fc71b05 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:22 +1200 Subject: [PATCH 135/484] mm: introduce arch_do_swap_page_nr() which allows restore metadata for nr pages commit 29f252cdc293f4a50b5d3dcbed53701d8444614d upstream Conflicts: none Backport-reason: SWAP update: mTHP swapin support Should do_swap_page() have the capability to directly map a large folio, metadata restoration becomes necessary for a specified number of pages denoted as nr. It's important to highlight that metadata restoration is solely required by the SPARC platform, which, however, does not enable THP_SWAP. Consequently, in the present kernel configuration, there exists no practical scenario where users necessitate the restoration of nr metadata. Platforms implementing THP_SWAP might invoke this function with nr values exceeding 1, subsequent to do_swap_page() successfully mapping an entire large folio. Nonetheless, their arch_do_swap_page_nr() functions remain empty. Link: https://lkml.kernel.org/r/20240529082824.150954-5-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: Khalid Aziz Cc: "David S. Miller" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: Chuanhua Han Cc: David Hildenbrand Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pgtable.h | 26 ++++++++++++++++++++------ mm/memory.c | 3 ++- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 7741274366f4..ff43f1b1c406 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1065,6 +1065,15 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + +} +#else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be @@ -1073,12 +1082,17 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. 
*/ -static inline void arch_do_swap_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, - pte_t pte, pte_t oldpte) -{ - +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + for (int i = 0; i < nr; i++) { + arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, + pte_advance_pfn(pte, i), + pte_advance_pfn(oldpte, i)); + } } #endif diff --git a/mm/memory.c b/mm/memory.c index eec3ad8f520f..e0ea88eae1a2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4226,7 +4226,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + arch_do_swap_page_nr(vma->vm_mm, vma, vmf->address, + pte, vmf->orig_pte, 1); folio_unlock(folio); if (folio != swapcache && swapcache) { -- Gitee From 7868157bd35bbf959797be1f37dc45324f0e232a Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Wed, 29 May 2024 20:28:23 +1200 Subject: [PATCH 136/484] mm: swap: make should_try_to_free_swap() support large-folio commit 4c3f966436873435600b00e5c2c6c8933607e236 upstream Conflicts: none Backport-reason: SWAP update: mTHP swapin support The function should_try_to_free_swap() operates under the assumption that swap-in always occurs at the normal page granularity, i.e., folio_nr_pages() = 1. However, in reality, for large folios, add_to_swap_cache() will invoke folio_ref_add(folio, nr). To accommodate large folio swap-in, this patch eliminates this assumption. Link: https://lkml.kernel.org/r/20240529082824.150954-6-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Acked-by: Chris Li Reviewed-by: Ryan Roberts Reviewed-by: "Huang, Ying" Reviewed-by: David Hildenbrand Cc: Andreas Larsson Cc: Baolin Wang Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e0ea88eae1a2..9a1f8ea880ef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3837,7 +3837,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, * reference only in case it's likely that we'll be the exlusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && - folio_ref_count(folio) == 2; + folio_ref_count(folio) == (1 + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) -- Gitee From 09e5a00f65e3a884e9f4112c7582b87ec1724c02 Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Wed, 29 May 2024 20:28:24 +1200 Subject: [PATCH 137/484] mm: swap: entirely map large folios found in swapcache commit 508758960b8d89fa464abce2f9897973c8e8d4f0 upstream Conflicts: none Backport-reason: SWAP update: mTHP swapin support When a large folio is found in the swapcache, the current implementation requires calling do_swap_page() nr_pages times, resulting in nr_pages page faults. This patch opts to map the entire large folio at once to minimize page faults. Additionally, redundant checks and early exits for ARM64 MTE restoring are removed. 
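Roughly, the new path reduces to the editorial sketch below; the real hunk that follows also checks the folio against the VMA and PMD boundaries and falls back to mapping a single page when the PTE batch does not cover the whole folio:

	nr = swap_pte_batch(folio_ptep, folio_nr_pages(folio), folio_pte);
	if (nr == folio_nr_pages(folio)) {
		folio_ref_add(folio, nr - 1);
		swap_free_nr(entry, nr);
		set_ptes(vma->vm_mm, folio_start, folio_ptep, pte, nr);
	}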
Link: https://lkml.kernel.org/r/20240529082824.150954-7-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: "Huang, Ying" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 59 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 9a1f8ea880ef..7434f21192ee 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3928,6 +3928,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; + int nr_pages; + unsigned long page_idx; + unsigned long address; + pte_t *ptep; if (!pte_unmap_same(vmf)) goto out; @@ -4126,6 +4130,38 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } + nr_pages = 1; + page_idx = 0; + address = vmf->address; + ptep = vmf->pte; + if (folio_test_large(folio) && folio_test_swapcache(folio)) { + int nr = folio_nr_pages(folio); + unsigned long idx = folio_page_idx(folio, page); + unsigned long folio_start = address - idx * PAGE_SIZE; + unsigned long folio_end = folio_start + nr * PAGE_SIZE; + pte_t *folio_ptep; + pte_t folio_pte; + + if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) + goto check_folio; + if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) + goto check_folio; + + folio_ptep = vmf->pte - idx; + folio_pte = ptep_get(folio_ptep); + if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || + swap_pte_batch(folio_ptep, nr, folio_pte) != nr) + goto check_folio; + + page_idx = idx; + address = folio_start; + ptep = folio_ptep; + nr_pages = nr; + entry = folio->swap; + page = &folio->page; + } + +check_folio: /* * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte * must never point at an anonymous page in the swapcache that is @@ -4185,12 +4221,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * We're already holding a reference on the page but haven't mapped it * yet. 
*/ - swap_free(entry); + swap_free_nr(entry, nr_pages); if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); /* @@ -4207,27 +4243,28 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } rmap_flags |= RMAP_EXCLUSIVE; } - flush_icache_page(vma, page); + folio_ref_add(folio, nr_pages - 1); + flush_icache_pages(vma, page, nr_pages); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(vmf->orig_pte)) pte = pte_mkuffd_wp(pte); - vmf->orig_pte = pte; + vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_add_new_anon_rmap(folio, vma, address); folio_add_lru_vma(folio, vma); } else { - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); } VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page_nr(vma->vm_mm, vma, vmf->address, - pte, vmf->orig_pte, 1); + set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); + arch_do_swap_page_nr(vma->vm_mm, vma, address, + pte, pte, nr_pages); folio_unlock(folio); if (folio != swapcache && swapcache) { @@ -4251,7 +4288,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + update_mmu_cache_range(vmf, vma, address, ptep, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); -- Gitee From dba98cdb93cf093f8e26bcba6f0b56e6be6919f8 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 12 Apr 2024 23:48:55 +1200 Subject: [PATCH 138/484] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters commit ec33687c674934dfefd782a8ffd58370b080b503 upstream Conflicts: none Backport-reason: mTHP statistic Patch series "mm: add per-order mTHP alloc and swpout counters", v6. The patchset introduces a framework to facilitate mTHP counters, starting with the allocation and swap-out counters. Currently, only four new nodes are appended to the stats directory for each mTHP size. /sys/kernel/mm/transparent_hugepage/hugepages-/stats anon_fault_alloc anon_fault_fallback anon_fault_fallback_charge anon_swpout anon_swpout_fallback These nodes are crucial for us to monitor the fragmentation levels of both the buddy system and the swap partitions. In the future, we may consider adding additional nodes for further insights. This patch (of 4): Profiling a system blindly with mTHP has become challenging due to the lack of visibility into its operations. Presenting the success rate of mTHP allocations appears to be pressing need. Recently, I've been experiencing significant difficulty debugging performance improvements and regressions without these figures. It's crucial for us to understand the true effectiveness of mTHP in real-world scenarios, especially in systems with fragmented memory. This patch establishes the framework for per-order mTHP counters. It begins by introducing the anon_fault_alloc and anon_fault_fallback counters. 
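For illustration only (editorial addition, not upstream text): with the helper introduced below, a fault path records a per-order event with a single call, and the totals surface under /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats:

	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);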
Additionally, to maintain consistency with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP. Incorporating additional counters should now be straightforward as well. Link: https://lkml.kernel.org/r/20240412114858.407208-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240412114858.407208-2-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: David Hildenbrand Cc: Chris Li Cc: Domenico Cerasuolo Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 21 +++++++++++++++++ mm/huge_memory.c | 52 +++++++++++++++++++++++++++++++++++++++++ mm/memory.c | 5 ++++ 3 files changed, 78 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 842bb8f69197..0cd63d80cee9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -272,6 +272,27 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, enforce_sysfs, orders); } +enum mthp_stat_item { + MTHP_STAT_ANON_FAULT_ALLOC, + MTHP_STAT_ANON_FAULT_FALLBACK, + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + __MTHP_STAT_COUNT +}; + +struct mthp_stat { + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; +}; + +DECLARE_PER_CPU(struct mthp_stat, mthp_stats); + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + if (order <= 0 || order > PMD_ORDER) + return; + + this_cpu_inc(mthp_stats.stats[order][item]); +} + #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<stats[order][item]; + } + + return sum; +} + +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + int order = to_thpsize(kobj)->order; \ + \ + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \ +} \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + +static struct attribute *stats_attrs[] = { + &anon_fault_alloc_attr.attr, + &anon_fault_fallback_attr.attr, + &anon_fault_fallback_charge_attr.attr, + NULL, +}; + +static struct attribute_group stats_attr_group = { + .name = "stats", + .attrs = stats_attrs, +}; + static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; @@ -633,6 +675,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent) return ERR_PTR(ret); } + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); + if (ret) { + kobject_put(&thpsize->kobj); + return ERR_PTR(ret); + } + thpsize->order = order; return thpsize; } @@ -992,6 +1040,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } folio_throttle_swaprate(folio, gfp); @@ -1041,6 +1091,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); 
count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } @@ -1161,6 +1212,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); diff --git a/mm/memory.c b/mm/memory.c index 7434f21192ee..fe58bfc2b1df 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4385,6 +4385,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } @@ -4393,6 +4394,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) return folio; } next: + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } @@ -4502,6 +4504,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); +#endif folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); setpte: -- Gitee From 2ff4bb1bfd46286f2e7c127ee7edbde360f97cb3 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 12 Apr 2024 23:48:56 +1200 Subject: [PATCH 139/484] mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters commit d0f048ac39f6a71566d3f49a5922dfd7fa0d585b upstream Conflicts: trivial Backport-reason: mTHP statistic This helps to display the fragmentation situation of the swapfile, knowing the proportion of how much we haven't split large folios. So far, we only support non-split swapout for anon memory, with the possibility of expanding to shmem in the future. So, we add the "anon" prefix to the counter names. 
Link: https://lkml.kernel.org/r/20240412114858.407208-3-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Chris Li Cc: Domenico Cerasuolo Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 4 ++++ mm/page_io.c | 1 + mm/vmscan.c | 2 ++ 4 files changed, 9 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0cd63d80cee9..1edc627de7f1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -276,6 +276,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_ANON_SWPOUT, + MTHP_STAT_ANON_SWPOUT_FALLBACK, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c931ade47843..391f6c648fa2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -639,11 +639,15 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT); +DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, + &anon_swpout_attr.attr, + &anon_swpout_fallback_attr.attr, NULL, }; diff --git a/mm/page_io.c b/mm/page_io.c index c3cbdfe4af75..fb68d28d8219 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -212,6 +212,7 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT); #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 4659cb111b78..a290c6bde5d3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1330,6 +1330,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, goto activate_locked; } if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { + int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) goto activate_locked_split; @@ -1342,6 +1343,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } + count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK); #endif if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) goto activate_locked_split; -- Gitee From f32fc312fe8d231255a643263899e951b9fab971 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 12 Apr 2024 23:48:57 +1200 Subject: [PATCH 140/484] mm: add docs for per-order mTHP counters and transhuge_page ABI commit 42248b9d34ea1be1b959d343fff465906cb787fc upstream Conflicts: none Backport-reason: mTHP statistic This patch includes documentation for mTHP counters and an ABI file for sys-kernel-mm-transparent-hugepage, which appears to have been missing for some time. 
[v-songbaohua@oppo.com: fix the name and unexpected indentation] Link: https://lkml.kernel.org/r/20240415054538.17071-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240412114858.407208-4-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Chris Li Cc: Domenico Cerasuolo Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- .../sysfs-kernel-mm-transparent-hugepage | 18 ++++++++++++ Documentation/admin-guide/mm/transhuge.rst | 28 +++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage b/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage new file mode 100644 index 000000000000..7bfbb9cc2c11 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage @@ -0,0 +1,18 @@ +What: /sys/kernel/mm/transparent_hugepage/ +Date: April 2024 +Contact: Linux memory management mailing list +Description: + /sys/kernel/mm/transparent_hugepage/ contains a number of files and + subdirectories, + + - defrag + - enabled + - hpage_pmd_size + - khugepaged + - shmem_enabled + - use_zero_page + - subdirectories of the form hugepages-kB, where + is the page size of the hugepages supported by the kernel/CPU + combination. + + See Documentation/admin-guide/mm/transhuge.rst for details. diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 04eb45a2f940..e0fe17affeb3 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -447,6 +447,34 @@ thp_swpout_fallback Usually because failed to allocate some continuous swap space for the huge page. +In /sys/kernel/mm/transparent_hugepage/hugepages-kB/stats, There are +also individual counters for each huge page size, which can be utilized to +monitor the system's effectiveness in providing huge pages for usage. Each +counter has its own corresponding file. + +anon_fault_alloc + is incremented every time a huge page is successfully + allocated and charged to handle a page fault. + +anon_fault_fallback + is incremented if a page fault fails to allocate or charge + a huge page and instead falls back to using huge pages with + lower orders or small pages. + +anon_fault_fallback_charge + is incremented if a page fault fails to charge a huge page and + instead falls back to using huge pages with lower orders or + small pages even though the allocation was successful. + +anon_swpout + is incremented every time a huge page is swapped out in one + piece without splitting. + +anon_swpout_fallback + is incremented if a huge page has to be split before swapout. + Usually because failed to allocate some continuous swap space + for the huge page. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. 
There are some counters in ``/proc/vmstat`` to help -- Gitee From dd34a86d34440bca6dc4a55e74e9c79fcd825dcb Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 12 Apr 2024 23:48:58 +1200 Subject: [PATCH 141/484] mm: correct the docs for thp_fault_alloc and thp_fault_fallback commit a14421ae2a99378c4103bb03606465ab13e75509 upstream Conflicts: none Backport-reason: mTHP statistic The documentation does not align with the code. In __do_huge_pmd_anonymous_page(), THP_FAULT_FALLBACK is incremented when mem_cgroup_charge() fails, despite the allocation succeeding, whereas THP_FAULT_ALLOC is only incremented after a successful charge. Link: https://lkml.kernel.org/r/20240412114858.407208-5-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Chris Li Cc: Domenico Cerasuolo Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index e0fe17affeb3..f82300b9193f 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -369,7 +369,7 @@ monitor how successfully the system is providing huge pages for use. thp_fault_alloc is incremented every time a huge page is successfully - allocated to handle a page fault. + allocated and charged to handle a page fault. thp_collapse_alloc is incremented by khugepaged when it has found @@ -377,7 +377,7 @@ thp_collapse_alloc successfully allocated a new huge page to store the data. thp_fault_fallback - is incremented if a page fault fails to allocate + is incremented if a page fault fails to allocate or charge a huge page and instead falls back to using small pages. 
thp_fault_fallback_charge -- Gitee From a117eb10eb066e43e479d98ab01386f65f54f2e5 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 24 May 2024 08:50:48 +1200 Subject: [PATCH 142/484] mm: huge_mm: fix undefined reference to `mthp_stats' for CONFIG_SYSFS=n commit 94d46bf17916965e918bd2f3d2eec057f7c5578d upstream Conflicts: none Backport-reason: mTHP statistic if CONFIG_SYSFS is not enabled in config, we get the below error, All errors (new ones prefixed by >>): s390-linux-ld: mm/memory.o: in function `count_mthp_stat': >> include/linux/huge_mm.h:285:(.text+0x191c): undefined reference to `mthp_stats' s390-linux-ld: mm/huge_memory.o:(.rodata+0x10): undefined reference to `mthp_stats' vim +285 include/linux/huge_mm.h 279 280 static inline void count_mthp_stat(int order, enum mthp_stat_item item) 281 { 282 if (order <= 0 || order > PMD_ORDER) 283 return; 284 > 285 this_cpu_inc(mthp_stats.stats[order][item]); 286 } 287 Link: https://lkml.kernel.org/r/20240523210045.40444-1-21cnbao@gmail.com Fixes: ec33687c6749 ("mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405231728.tCAogiSI-lkp@intel.com/ Signed-off-by: Barry Song Tested-by: Yujie Liu Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1edc627de7f1..0ba1d8959ab3 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -285,6 +285,7 @@ struct mthp_stat { unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; }; +#ifdef CONFIG_SYSFS DECLARE_PER_CPU(struct mthp_stat, mthp_stats); static inline void count_mthp_stat(int order, enum mthp_stat_item item) @@ -294,6 +295,11 @@ static inline void count_mthp_stat(int order, enum mthp_stat_item item) this_cpu_inc(mthp_stats.stats[order][item]); } +#else +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ +} +#endif #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ -- Gitee From 342ff46d1c67eabd177c1bc0bc5ed7c8449a12b3 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 11:11:35 +1200 Subject: [PATCH 143/484] mm: extend rmap flags arguments for folio_add_new_anon_rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 15bde4abab734c687c1f81704886aba3a70c268e upstream Conflicts: none Backport-reason: mTHP swapin support: rmap preparing Patch series "mm: clarify folio_add_new_anon_rmap() and __folio_add_anon_rmap()", v2. This patchset is preparatory work for mTHP swapin. folio_add_new_anon_rmap() assumes that new anon rmaps are always exclusive. However, this assumption doesn’t hold true for cases like do_swap_page(), where a new anon might be added to the swapcache and is not necessarily exclusive. The patchset extends the rmap flags to allow folio_add_new_anon_rmap() to handle both exclusive and non-exclusive new anon folios. The do_swap_page() function is updated to use this extended API with rmap flags. Consequently, all new anon folios now consistently use folio_add_new_anon_rmap(). The special case for !folio_test_anon() in __folio_add_anon_rmap() can be safely removed. In conclusion, new anon folios always use folio_add_new_anon_rmap(), regardless of exclusivity. Old anon folios continue to use __folio_add_anon_rmap() via folio_add_anon_rmap_pmd() and folio_add_anon_rmap_ptes(). 
This patch (of 3): In the case of a swap-in, a new anonymous folio is not necessarily exclusive. This patch updates the rmap flags to allow a new anonymous folio to be treated as either exclusive or non-exclusive. To maintain the existing behavior, we always use EXCLUSIVE as the default setting. [akpm@linux-foundation.org: cleanup and constifications per David and akpm] [v-songbaohua@oppo.com: fix missing doc for flags of folio_add_new_anon_rmap()] Link: https://lkml.kernel.org/r/20240619210641.62542-1-21cnbao@gmail.com [v-songbaohua@oppo.com: enhance doc for extend rmap flags arguments for folio_add_new_anon_rmap] Link: https://lkml.kernel.org/r/20240622030256.43775-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240617231137.80726-2-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/rmap.h | 2 +- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 10 +++++----- mm/migrate_device.c | 2 +- mm/rmap.c | 25 ++++++++++++++++--------- mm/swapfile.c | 2 +- mm/userfaultfd.c | 2 +- 9 files changed, 28 insertions(+), 21 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3e093c29021a..480f4440e7c2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -236,7 +236,7 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bc7e912735c6..861cb46fba7a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr); + folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 391f6c648fa2..af946fe73095 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1086,7 +1086,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4881c4e22921..962eb2d3cdb9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1256,7 +1256,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - folio_add_new_anon_rmap(folio, vma, address); + 
folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); diff --git a/mm/memory.c b/mm/memory.c index fe58bfc2b1df..3cd2c634c10d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -918,7 +918,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma *prealloc = NULL; copy_user_highpage(&new_folio->page, page, addr, src_vma); __folio_mark_uptodate(new_folio); - folio_add_new_anon_rmap(new_folio, dst_vma, addr); + folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; @@ -3342,7 +3342,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. */ ptep_clear_flush(vma, vmf->address, vmf->pte); - folio_add_new_anon_rmap(new_folio, vma, vmf->address); + folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); /* * We call the notify macro here because, when using secondary @@ -4253,7 +4253,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - folio_add_new_anon_rmap(folio, vma, address); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, @@ -4507,7 +4507,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); #endif - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (uffd_wp) @@ -4715,7 +4715,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index b6c27c76e1a0..1bebdfae2286 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -656,7 +656,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); if (!folio_is_zone_device(folio)) folio_add_lru_vma(folio, vma); folio_get(folio); diff --git a/mm/rmap.c b/mm/rmap.c index 7b345ae409d3..e38047254fda 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1385,30 +1385,35 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @flags: The rmap flags * * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. - * The folio does not have to be locked. + * The folio doesn't necessarily need to be locked while it's exclusive + * unless two threads map it concurrently. However, the folio must be + * locked if it's shared. * - * If the folio is pmd-mappable, it is accounted as a THP. As the folio - * is new, it's assumed to be mapped exclusively by a single process. 
+ * If the folio is pmd-mappable, it is accounted as a THP. */ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, - unsigned long address) + unsigned long address, rmap_t flags) { - int nr = folio_nr_pages(folio); + const int nr = folio_nr_pages(folio); + const bool exclusive = flags & RMAP_EXCLUSIVE; int nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); __folio_set_swapbacked(folio); - __folio_set_anon(folio, vma, address, true); + __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); - SetPageAnonExclusive(&folio->page); + if (exclusive) + SetPageAnonExclusive(&folio->page); } else if (!folio_test_pmd_mappable(folio)) { int i; @@ -1417,7 +1422,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); - SetPageAnonExclusive(page); + if (exclusive) + SetPageAnonExclusive(page); } atomic_set(&folio->_nr_pages_mapped, nr); @@ -1425,7 +1431,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); - SetPageAnonExclusive(&folio->page); + if (exclusive) + SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } diff --git a/mm/swapfile.c b/mm/swapfile.c index af617aa03180..e4f897f5afe3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1993,7 +1993,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 952366a17898..dcf24184f985 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -116,7 +116,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { - folio_add_new_anon_rmap(folio, dst_vma, dst_addr); + folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, dst_vma); } -- Gitee From 6bed571e5cbc45ea9c23bfb289b019ad7ff8b107 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 11:11:36 +1200 Subject: [PATCH 144/484] mm: use folio_add_new_anon_rmap() if folio_test_anon(folio)==false commit 9ae2feacedde16067014f11414675f385c68eedc upstream Conflicts: none Backport-reason: SWAP update: mTHP swapin support rmap preparing For the !folio_test_anon(folio) case, we can now invoke folio_add_new_anon_rmap() with the rmap flags set to either EXCLUSIVE or non-EXCLUSIVE. This action will suppress the VM_WARN_ON_FOLIO check within __folio_add_anon_rmap() while initiating the process of bringing up mTHP swapin. static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum rmap_level level) { ... if (unlikely(!folio_test_anon(folio))) { VM_WARN_ON_FOLIO(folio_test_large(folio) && level != RMAP_LEVEL_PMD, folio); } ... } It also improves the code's readability. 
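The call-site pattern this enables at the swap-in paths (do_swap_page() and unuse_pte(); condensed from the hunks below) is roughly:

        if (!folio_test_anon(folio)) {
                /* small folios only for now: fully exclusive or fully shared */
                VM_WARN_ON_ONCE(folio_test_large(folio));
                VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
                folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
        } else {
                folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
        }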
Currently, all new anonymous folios calling folio_add_anon_rmap_ptes() are order-0. This ensures that new folios cannot be partially exclusive; they are either entirely exclusive or entirely shared. A useful comment from Hugh's fix: : Commit "mm: use folio_add_new_anon_rmap() if folio_test_anon(folio)== : false" has extended folio_add_new_anon_rmap() to use on non-exclusive : folios, already visible to others in swap cache and on LRU. : : That renders its non-atomic __folio_set_swapbacked() unsafe: it risks : overwriting concurrent atomic operations on folio->flags, losing bits : added or restoring bits cleared. Since it's only used in this risky way : when folio_test_locked and !folio_test_anon, many such races are excluded; : but, for example, isolations by folio_test_clear_lru() are vulnerable, and : setting or clearing active. : : It could just use the atomic folio_set_swapbacked(); but this function : does try to avoid atomics where it can, so use a branch instead: just : avoid setting swapbacked when it is already set, that is good enough. : (Swapbacked is normally stable once set: lazyfree can undo it, but only : later, when found anon in a page table.) : : This fixes a lot of instability under compaction and swapping loads: : assorted "Bad page"s, VM_BUG_ON_FOLIO()s, apparently even page double : frees - though I've not worked out what races could lead to the latter. [akpm@linux-foundation.org: comment fixes, per David and akpm] [v-songbaohua@oppo.com: lock the folio to avoid race] Link: https://lkml.kernel.org/r/20240622032002.53033-1-21cnbao@gmail.com [hughd@google.com: folio_add_new_anon_rmap() careful __folio_set_swapbacked()] Link: https://lkml.kernel.org/r/f3599b1d-8323-0dc5-e9e0-fdb3cfc3dd5a@google.com Link: https://lkml.kernel.org/r/20240617231137.80726-3-21cnbao@gmail.com Signed-off-by: Barry Song Signed-off-by: Hugh Dickins Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 9 +++++++++ mm/rmap.c | 4 +++- mm/swapfile.c | 14 ++++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3cd2c634c10d..069d54fe7cc8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4255,6 +4255,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(folio != swapcache && swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); + } else if (!folio_test_anon(folio)) { + /* + * We currently only expect small !anon folios, which are either + * fully exclusive or fully shared. If we ever get large folios + * here, we have to be careful. 
+ */ + VM_WARN_ON_ONCE(folio_test_large(folio)); + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); diff --git a/mm/rmap.c b/mm/rmap.c index e38047254fda..4936f082928e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1406,7 +1406,9 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); - __folio_set_swapbacked(folio); + + if (!folio_test_swapbacked(folio)) + __folio_set_swapbacked(folio); __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { diff --git a/mm/swapfile.c b/mm/swapfile.c index e4f897f5afe3..9dbd88fa497d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1990,8 +1990,18 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; - - folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); + /* + * We currently only expect small !anon folios, which are either + * fully exclusive or fully shared. If we ever get large folios + * here, we have to be careful. + */ + if (!folio_test_anon(folio)) { + VM_WARN_ON_ONCE(folio_test_large(folio)); + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); + } else { + folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); + } } else { /* ksm created a completely new copy */ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); -- Gitee From 5b68c21d702db572544393d9db7eee1d5116f690 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 18 Jun 2024 11:11:37 +1200 Subject: [PATCH 145/484] mm: remove folio_test_anon(folio)==false path in __folio_add_anon_rmap() commit 4c1171f1d22484f2419b07ab688548350db521cb upstream Conflicts: none Backport-reason: SWAP update: mTHP swapin support rmap preparing The folio_test_anon(folio)==false cases has been relocated to folio_add_new_anon_rmap(). Additionally, four other callers consistently pass anonymous folios. stack 1: remove_migration_pmd -> folio_add_anon_rmap_pmd -> __folio_add_anon_rmap stack 2: __split_huge_pmd_locked -> folio_add_anon_rmap_ptes -> __folio_add_anon_rmap stack 3: remove_migration_pmd -> folio_add_anon_rmap_pmd -> __folio_add_anon_rmap (RMAP_LEVEL_PMD) stack 4: try_to_merge_one_page -> replace_page -> folio_add_anon_rmap_pte -> __folio_add_anon_rmap __folio_add_anon_rmap() only needs to handle the cases folio_test_anon(folio)==true now. We can remove the !folio_test_anon(folio)) path within __folio_add_anon_rmap() now. 
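With that branch gone, the core of __folio_add_anon_rmap() reduces to roughly the following (a condensed view of the hunk below):

        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
        if (likely(!folio_test_ksm(folio)))
                __page_check_anon_rmap(folio, page, vma, address);

        __folio_mod_stat(folio, nr, nr_pmdmapped);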
Link: https://lkml.kernel.org/r/20240617231137.80726-4-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: David Hildenbrand Tested-by: Shuai Yuan Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/rmap.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 4936f082928e..be410371e3d3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1281,23 +1281,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, { int i, nr, nr_pmdmapped = 0; + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); - if (unlikely(!folio_test_anon(folio))) { - VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); - /* - * For a PTE-mapped large folio, we only know that the single - * PTE is exclusive. Further, __folio_set_anon() might not get - * folio->index right when not given the address of the head - * page. - */ - VM_WARN_ON_FOLIO(folio_test_large(folio) && - level != RMAP_LEVEL_PMD, folio); - __folio_set_anon(folio, vma, address, - !!(flags & RMAP_EXCLUSIVE)); - } else if (likely(!folio_test_ksm(folio))) { + if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); - } __folio_mod_stat(folio, nr, nr_pmdmapped); -- Gitee From e968de4486745d35afb1dada43839974ec2caf46 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 15 May 2024 15:07:06 +0800 Subject: [PATCH 146/484] mm: add folio_alloc_mpol() commit a19621ed4e0ac9e1d296d07dc8ff8c27d5c03b43 upstream Conflicts: major, it's quite different from upstream due to missing page allocation tagging and b951aaff5035 Backport-reason: Memory folio allocation helper Patch series "mm: convert to folio_alloc_mpol()". This patch (of 4): This adds a new folio_alloc_mpol() like folio_alloc() but allocate folio according to NUMA mempolicy. 
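It wraps alloc_pages_mpol() and hands back a rmappable folio, so callers can write, roughly:

        struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order,
                                       struct mempolicy *mpol, pgoff_t ilx, int nid);

        folio = folio_alloc_mpol(gfp, order, pol, ilx, numa_node_id());

instead of open-coding alloc_pages_mpol() plus page_rmappable_folio().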
Link: https://lkml.kernel.org/r/20240515070709.78529-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240515070709.78529-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/gfp.h | 7 +++++++ mm/mempolicy.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3e4c0c536a3d..b20f51dbc3cc 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -266,6 +266,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order); struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc(gfp_t gfp, unsigned int order); +struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -282,6 +284,11 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } +static inline struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return folio_alloc(gfp, order); +} #define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ folio_alloc(gfp, order) #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1fe8f837f9cd..67039850a532 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2147,6 +2147,13 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, return page; } +struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) +{ + return page_rmappable_folio(alloc_pages_mpol(gfp | __GFP_COMP, + order, pol, ilx, nid)); +} + /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. -- Gitee From 5d736cdb92d974a3e7dd8568dea485931705f036 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 15 May 2024 15:07:07 +0800 Subject: [PATCH 147/484] mm: mempolicy: use folio_alloc_mpol_noprof() in vma_alloc_folio_noprof() commit 3174d70cf694c7c1c506fecffdefa0d26a78cf60 upstream Conflicts: major, just do it differently. Backport-reason: Memory folio allocation helper Convert to use folio_alloc_mpol_noprof() to make vma_alloc_folio_noprof() to use folio throughout. Link: https://lkml.kernel.org/r/20240515070709.78529-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mempolicy.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 67039850a532..82953333c6c1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2171,17 +2171,17 @@ struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order, * Return: The folio on success or NULL if allocation fails. 
*/ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) + unsigned long addr, bool hugepage) { struct mempolicy *pol; pgoff_t ilx; - struct page *page; + struct folio *folio; pol = get_vma_policy(vma, addr, order, &ilx); - page = alloc_pages_mpol(gfp | __GFP_COMP, order, - pol, ilx, numa_node_id()); + folio = folio_alloc_mpol(gfp | __GFP_COMP, order, + pol, ilx, numa_node_id()); mpol_cond_put(pol); - return page_rmappable_folio(page); + return folio; } EXPORT_SYMBOL(vma_alloc_folio); -- Gitee From 303518d184ec1151dd73d46cb025638328529fef Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 15 May 2024 15:07:08 +0800 Subject: [PATCH 148/484] mm: mempolicy: use folio_alloc_mpol() in alloc_migration_target_by_mpol() commit 1d9cb7852bae9884eef413c213f82158ea84bebf upstream Conflicts: none Backport-reason: Memory folio allocation helper Convert to use folio_alloc_mpol() to make vma_alloc_folio_noprof() to use folio throughout. Link: https://lkml.kernel.org/r/20240515070709.78529-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mempolicy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 82953333c6c1..87e6e8d9fc59 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1170,7 +1170,6 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src, struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; - struct page *page; unsigned int order; int nid = numa_node_id(); gfp_t gfp; @@ -1193,8 +1192,7 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src, else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; - page = alloc_pages_mpol(gfp, order, pol, ilx, nid); - return page_rmappable_folio(page); + return folio_alloc_mpol(gfp, order, pol, ilx, nid); } #else -- Gitee From 78646b5fe8854ce354b7e20d91e5e99399869419 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 15 May 2024 15:07:09 +0800 Subject: [PATCH 149/484] mm: shmem: use folio_alloc_mpol() in shmem_alloc_folio() commit 6f775463d0027733caebfb75d62ec7c4f807f834 upstream Conflicts: none Backport-reason: Memory folio allocation helper Let's change shmem_alloc_folio() to take a order and use folio_alloc_mpol() helper, then directly use it for normal or large folio to cleanup code. 
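With the extra order argument the huge and normal allocation paths share one helper, and the call sites just pass the order (as in the hunks below):

        folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index);  /* huge */
        folio = shmem_alloc_folio(gfp, 0, info, index);                /* order-0 */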
Link: https://lkml.kernel.org/r/20240515070709.78529-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ec2c1a86d7e6..111233bd2ee3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1618,32 +1618,18 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) return result; } -static struct folio *shmem_alloc_hugefolio(gfp_t gfp, +static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; - struct page *page; - - mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); - page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); - mpol_cond_put(mpol); - - return page_rmappable_folio(page); -} - -static struct folio *shmem_alloc_folio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - struct mempolicy *mpol; - pgoff_t ilx; - struct page *page; + struct folio *folio; - mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); - page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); + mpol = shmem_get_pgoff_policy(info, index, order, &ilx); + folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); mpol_cond_put(mpol); - return (struct folio *)page; + return folio; } static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, @@ -1675,12 +1661,12 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, index + HPAGE_PMD_NR - 1, XA_PRESENT)) return ERR_PTR(-E2BIG); - folio = shmem_alloc_hugefolio(gfp, info, index); + folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index); if (!folio) count_vm_event(THP_FILE_FALLBACK); } else { pages = 1; - folio = shmem_alloc_folio(gfp, info, index); + folio = shmem_alloc_folio(gfp, 0, info, index); } if (!folio) return ERR_PTR(-ENOMEM); @@ -1780,7 +1766,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, */ gfp &= ~GFP_CONSTRAINT_MASK; VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, info, index); + new = shmem_alloc_folio(gfp, 0, info, index); if (!new) return -ENOMEM; @@ -2628,7 +2614,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (!*foliop) { ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, info, pgoff); + folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks; -- Gitee From b468cad3bbf3ab21fe6449d73f25093b8623e2db Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 9 Jul 2024 18:55:08 +0800 Subject: [PATCH 150/484] mm: swap_state: use folio_alloc_mpol() in __read_swap_cache_async() commit 2ef52d5bb78dc2d27d8141578e8095989ad5ca35 upstream Conflicts: none Backport-reason: SWAP update: minor cleanup Convert to use folio_alloc_mpol() helper() in __read_swap_cache_async(). Link: https://lkml.kernel.org/r/20240709105508.3933823-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index f67b59cc6185..c6db00db789f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -395,8 +395,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. 
*/ - folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, - mpol, ilx, numa_node_id()); + folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); if (!folio) goto fail_put_swap; -- Gitee From dfb195e8c7f43acf081f038aebe80a2382769b28 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 24 Jul 2024 14:00:56 +1200 Subject: [PATCH 151/484] mm: extend 'usage' parameter so that cluster_swap_free_nr() can be reused commit d2539ed7ee3b042e4503c304603d0eaa50c9c476 upstream Conflicts: major, don't touch cluster_swap_free_nr Backport-reason: SWAP update: minor cleanup Extend a usage parameter so that cluster_swap_free_nr() can be reused by both swapcache_clear() and swap_free(). __swap_entry_free() is quite similar but more tricky as it requires the return value of __swap_entry_free_locked() which cluster_swap_free_nr() doesn't support. Link: https://lkml.kernel.org/r/20240724020056.65838-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: Chris Li Cc: "Huang, Ying" Cc: Kairui Song Cc: David Hildenbrand Cc: Chuanhua Han Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9dbd88fa497d..1b3ebf9f5ea3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3561,15 +3561,9 @@ int swapcache_prepare(swp_entry_t entry) void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char usage; - ci = lock_cluster(si, offset); - usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); - if (!usage) - swap_entry_range_free(si, ci, entry, 1); - unlock_cluster(ci); + cluster_swap_free_nr(si, offset, 1, SWAP_HAS_CACHE); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) -- Gitee From 98930d39c901aebd9636231125f6dcb07274b2bb Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 30 Jul 2024 19:13:39 +1200 Subject: [PATCH 152/484] mm: swap: add nr argument in swapcache_prepare and swapcache_clear to support large folios commit 9f101bef408a3f70c44b6e4de44d3d4e2655ed10 upstream Conflicts: resolved Backport-reason: SWAP update: mTHP swapin support Right now, swapcache_prepare() and swapcache_clear() supports one entry only, to support large folios, we need to handle multiple swap entries. To optimize stack usage, we iterate twice in __swap_duplicate(): the first time to verify that all entries are valid, and the second time to apply the modifications to the entries. Currently, we're using nr=1 for the existing users. 
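The resulting prototypes, with existing callers keeping their behaviour by passing nr = 1 (a summary of the hunks below):

        int swapcache_prepare(swp_entry_t entry, int nr);
        void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);

        err = swapcache_prepare(entry, 1);
        ...
        swapcache_clear(si, entry, 1);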
[v-songbaohua@oppo.com: clarify swap_count_continued and improve readability for __swap_duplicate] Link: https://lkml.kernel.org/r/20240802071817.47081-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240730071339.107447-2-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Tested-by: Baolin Wang Cc: Chris Li Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Nhat Pham Cc: Ryan Roberts Cc: Sergey Senozhatsky Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 4 +- mm/memory.c | 6 +-- mm/swap.h | 5 ++- mm/swap_state.c | 2 +- mm/swapfile.c | 95 +++++++++++++++++++++++++------------------- 5 files changed, 63 insertions(+), 49 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 43201dcee18b..40fb4e600064 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -516,7 +516,7 @@ extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t); +extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -589,7 +589,7 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } -static inline int swapcache_prepare(swp_entry_t swp) +static inline int swapcache_prepare(swp_entry_t swp, int nr) { return 0; } diff --git a/mm/memory.c b/mm/memory.c index 069d54fe7cc8..c2e3bddde8f7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4002,7 +4002,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * reusing the same entry. It's undetectable as * pte_same() returns true due to entry reuse. 
*/ - if (swapcache_prepare(entry)) { + if (swapcache_prepare(entry, 1)) { /* Relax a bit to prevent rapid repeated page faults */ schedule_timeout_uninterruptible(1); goto out; @@ -4304,7 +4304,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) - swapcache_clear(si, entry); + swapcache_clear(si, entry, 1); if (si) put_swap_device(si); return ret; @@ -4320,7 +4320,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_put(swapcache); } if (need_clear_cache) - swapcache_clear(si, entry); + swapcache_clear(si, entry, 1); if (si) put_swap_device(si); return ret; diff --git a/mm/swap.h b/mm/swap.h index 27468f7cc101..c668591d1e10 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -59,7 +59,7 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); struct folio *filemap_get_incore_folio(struct address_space *mapping, @@ -121,7 +121,7 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } -static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { } @@ -168,4 +168,5 @@ static inline unsigned int folio_swap_flags(struct folio *folio) return 0; } #endif /* CONFIG_SWAP */ + #endif /* _MM_SWAP_H */ diff --git a/mm/swap_state.c b/mm/swap_state.c index c6db00db789f..0db2a57ff534 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -402,7 +402,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - err = swapcache_prepare(entry); + err = swapcache_prepare(entry, 1); if (!err) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 1b3ebf9f5ea3..67cb96133272 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3451,7 +3451,7 @@ void si_swapinfo(struct sysinfo *val) } /* - * Verify that a swap entry is valid and increment its swap map count. + * Verify that nr swap entries are valid and increment their swap map counts. * * Returns error code in following case. * - success -> 0 @@ -3460,60 +3460,73 @@ void si_swapinfo(struct sysinfo *val) * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, unsigned char usage) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; - int err; + int err, i; si = swp_swap_info(entry); offset = swp_offset(entry); + VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); + VM_WARN_ON(usage == 1 && nr > 1); ci = lock_cluster(si, offset); - count = si->swap_map[offset]; - - /* - * swapin_readahead() doesn't check if a swap entry is valid, so the - * swap entry could be SWAP_MAP_BAD. Check here with lock held. 
- */ - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { - err = -ENOENT; - goto unlock_out; - } - - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; err = 0; + for (i = 0; i < nr; i++) { + count = si->swap_map[offset + i]; - if (usage == SWAP_HAS_CACHE) { + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. + */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } - /* set SWAP_HAS_CACHE if there is no cache and entry is used */ - if (!has_cache && count) - has_cache = SWAP_HAS_CACHE; - else if (has_cache) /* someone else added cache */ - err = -EEXIST; - else /* no users remaining */ + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + + if (!count && !has_cache) { err = -ENOENT; + } else if (usage == SWAP_HAS_CACHE) { + if (has_cache) + err = -EEXIST; + } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { + err = -EINVAL; + } - } else if (count || has_cache) { + if (err) + goto unlock_out; + } + + for (i = 0; i < nr; i++) { + count = si->swap_map[offset + i]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + if (usage == SWAP_HAS_CACHE) + has_cache = SWAP_HAS_CACHE; + else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; - else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) - err = -EINVAL; - else if (swap_count_continued(si, offset, count)) + else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; - else + else { + /* + * Don't need to rollback changes, because if + * usage == 1, there must be nr == 1. + */ err = -ENOMEM; - } else - err = -ENOENT; /* unused swap entry */ + goto unlock_out; + } - if (!err) - WRITE_ONCE(si->swap_map[offset], count | has_cache); + WRITE_ONCE(si->swap_map[offset + i], count | has_cache); + } unlock_out: unlock_cluster(ci); @@ -3526,7 +3539,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) */ void swap_shmem_alloc(swp_entry_t entry) { - __swap_duplicate(entry, SWAP_MAP_SHMEM); + __swap_duplicate(entry, SWAP_MAP_SHMEM, 1); } /* @@ -3540,30 +3553,30 @@ int swap_duplicate(swp_entry_t entry) { int err = 0; - while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } EXPORT_SYMBOL(swap_duplicate); /* - * @entry: swap entry for which we allocate swap cache. + * @entry: first swap entry from which we allocate nr swap cache. * - * Called when allocating swap cache for existing swap entry, + * Called when allocating swap cache for existing swap entries, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). 
*/ -int swapcache_prepare(swp_entry_t entry) +int swapcache_prepare(swp_entry_t entry, int nr) { - return __swap_duplicate(entry, SWAP_HAS_CACHE); + return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); - cluster_swap_free_nr(si, offset, 1, SWAP_HAS_CACHE); + cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) -- Gitee From 2cf7d61c55ba6a3f6dcc5f4db85d2d643e55070d Mon Sep 17 00:00:00 2001 From: Zhaoyu Liu Date: Wed, 31 Jul 2024 21:31:01 +0800 Subject: [PATCH 153/484] mm: swap: allocate folio only first time in __read_swap_cache_async() commit 1d3440305e07c166c3752edb5e212bf324cf3252 upstream Conflicts: minor Backport-reason: SWAP update: minor optimize It should be checked by filemap_get_folio() if SWAP_HAS_CACHE was marked while reading a share swap page. It would re-allocate a folio if the swap cache was not ready now. We save the new folio to avoid page allocating again. Link: https://lkml.kernel.org/r/20240731133101.GA2096752@bytedance Signed-off-by: Zhaoyu Liu Cc: Domenico Cerasuolo Cc: Kairui Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 57 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 0db2a57ff534..09e8de4da384 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -368,6 +368,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, { struct swap_info_struct *si = swp_swap_info(entry); struct folio *folio; + struct folio *new_folio = NULL; + struct folio *result = NULL; void *shadow = NULL; *new_page_allocated = false; @@ -388,16 +390,19 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Just skip read ahead for unused swap slot. */ if (!swap_swapcount(si, entry)) - goto fail_put_swap; + goto put_and_return; /* - * Get a new folio to read into from swap. Allocate it now, - * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will - * cause any racers to loop around until we add it to cache. + * Get a new folio to read into from swap. Allocate it now if + * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, + * when -EEXIST will cause any racers to loop around until we + * add it to cache. */ - folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!folio) - goto fail_put_swap; + if (!new_folio) { + new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); + if (!new_folio) + goto put_and_return; + } /* * Swap entry may have been freed since our caller observed it. @@ -405,10 +410,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, err = swapcache_prepare(entry, 1); if (!err) break; - - folio_put(folio); - if (err != -EEXIST) - goto fail_put_swap; + else if (err != -EEXIST) + goto put_and_return; /* * Protect against a recursive call to __read_swap_cache_async() @@ -419,7 +422,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * __read_swap_cache_async() in the writeback path. */ if (skip_if_exists) - goto fail_put_swap; + goto put_and_return; /* * We might race against __delete_from_swap_cache(), and @@ -434,34 +437,36 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * The swap entry is ours to swap in. 
Prepare the new folio. */ + __folio_set_locked(new_folio); + __folio_set_swapbacked(new_folio); - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - - if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); if (shadow) - workingset_refault(folio, shadow); + workingset_refault(new_folio, shadow); - /* Caller will initiate read into locked folio */ - folio_add_lru(folio); + /* Caller will initiate read into locked new_folio */ + folio_add_lru(new_folio); *new_page_allocated = true; + folio = new_folio; got_folio: - return folio; + result = folio; + goto put_and_return; fail_unlock: - put_swap_folio(folio, entry); - folio_unlock(folio); - folio_put(folio); -fail_put_swap: - return NULL; + put_swap_folio(new_folio, entry); + folio_unlock(new_folio); +put_and_return: + if (!(*new_page_allocated) && new_folio) + folio_put(new_folio); + return result; } /* -- Gitee From a20fcf975b21105a077e68a3ccbf1a1ee6f5ccec Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 7 Dec 2023 11:24:06 -0800 Subject: [PATCH 154/484] zswap: memcontrol: implement zswap writeback disabling commit 501a06fe8e4c185bbda371b8cedbdf1b23a633d8 upstream Conflicts: minor, __swap_writepage may fail in downstream. Backport-reason: ZSWAP mass udpate During our experiment with zswap, we sometimes observe swap IOs due to occasional zswap store failures and writebacks-to-swap. These swapping IOs prevent many users who cannot tolerate swapping from adopting zswap to save memory and improve performance where possible. This patch adds the option to disable this behavior entirely: do not writeback to backing swapping device when a zswap store attempt fail, and do not write pages in the zswap pool back to the backing swap device (both when the pool is full, and when the new zswap shrinker is called). This new behavior can be opted-in/out on a per-cgroup basis via a new cgroup file. By default, writebacks to swap device is enabled, which is the previous behavior. Initially, writeback is enabled for the root cgroup, and a newly created cgroup will inherit the current setting of its parent. Note that this is subtly different from setting memory.swap.max to 0, as it still allows for pages to be stored in the zswap pool (which itself consumes swap space in its current form). This patch should be applied on top of the zswap shrinker series: https://lore.kernel.org/linux-mm/20231130194023.4102148-1-nphamcs@gmail.com/ as it also disables the zswap shrinker, a major source of zswap writebacks. For the most part, this feature is motivated by internal parties who have already established their opinions regarding swapping - the workloads that are highly sensitive to IO, and especially those who are using servers with really slow disk performance (for instance, massive but slow HDDs). For these folks, it's impossible to convince them to even entertain zswap if swapping also comes as a packaged deal. Writeback disabling is quite a useful feature in these situations - on a mixed workloads deployment, they can disable writeback for the more IO-sensitive workloads, and enable writeback for other background workloads. 
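Mechanically, the setting gates the swap-out path: swap_writepage() bails out instead of falling back to the backing device, roughly as in the mm/page_io.c hunk below:

        if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) {
                folio_mark_dirty(folio);
                return AOP_WRITEPAGE_ACTIVATE;
        }

and the zswap shrinker likewise skips memcgs that have writeback disabled.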
For instance, on a server with HDD, I allocate memories and populate them with random values (so that zswap store will always fail), and specify memory.high low enough to trigger reclaim. The time it takes to allocate the memories and just read through it a couple of times (doing silly things like computing the values' average etc.): zswap.writeback disabled: real 0m30.537s user 0m23.687s sys 0m6.637s 0 pages swapped in 0 pages swapped out zswap.writeback enabled: real 0m45.061s user 0m24.310s sys 0m8.892s 712686 pages swapped in 461093 pages swapped out (the last two lines are from vmstat -s). [nphamcs@gmail.com: add a comment about recurring zswap store failures leading to reclaim inefficiency] Link: https://lkml.kernel.org/r/20231221005725.3446672-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231207192406.3809579-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Reviewed-by: Yosry Ahmed Acked-by: Chris Li Cc: Dan Streetman Cc: David Heidelberg Cc: Domenico Cerasuolo Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Shakeel Butt Cc: Tejun Heo Cc: Vitaly Wool Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/cgroup-v2.rst | 15 ++++++++++ Documentation/admin-guide/mm/zswap.rst | 10 +++++++ include/linux/memcontrol.h | 12 ++++++++ include/linux/zswap.h | 7 +++++ mm/memcontrol.c | 38 +++++++++++++++++++++++++ mm/page_io.c | 6 ++++ mm/shmem.c | 3 +- mm/zswap.c | 13 +++++++-- 8 files changed, 100 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index a6f051fab212..f6ae7fa4d60f 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1714,6 +1714,21 @@ PAGE_SIZE multiple when read back. limit, it will refuse to take any more stores before existing entries fault back in or are written out to disk. + memory.zswap.writeback + A read-write single value file. The default value is "1". The + initial value of the root cgroup is 1, and when a new cgroup is + created, it inherits the current value of its parent. + + When this is set to 0, all swapping attempts to swapping devices + are disabled. This included both zswap writebacks, and swapping due + to zswap store failures. If the zswap store failures are recurring + (for e.g if the pages are incompressible), users can observe + reclaim inefficiency after disabling writeback (because the same + pages might be rejected again and again). + + Note that this is subtly different from setting memory.swap.max to + 0, as it still allows for pages to be written to the zswap pool. + memory.pressure A read-only nested-keyed file. diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 62fc244ec702..b42132969e31 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,16 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +Some users cannot tolerate the swapping that comes with zswap store failures +and zswap writebacks. 
Swapping can be disabled entirely (without disabling +zswap itself) on a cgroup-basis as follows: + + echo 0 > /sys/fs/cgroup//memory.zswap.writeback + +Note that if the store failures are recurring (for e.g if the pages are +incompressible), users can observe reclaim inefficiency after disabling +writeback (because the same pages might be rejected again and again). + When there is a sizable amount of cold memory residing in the zswap pool, it can be advantageous to proactively write these cold pages to swap and reclaim the memory for other use cases. By default, the zswap shrinker is disabled. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b9cd40d162ac..de9e3f6a0034 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -267,6 +267,12 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) unsigned long zswap_max; + + /* + * Prevent pages from this memcg from being written back from zswap to + * swap, and from being swapped out on zswap store failures. + */ + bool zswap_writeback; #endif #ifdef CONFIG_MEMCG_ZRAM @@ -2106,6 +2112,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); +bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { @@ -2119,6 +2126,11 @@ static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } +static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) +{ + /* if zswap is disabled, do not block pages going to the swapping device */ + return true; +} #endif #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e88572d4c720..0b709f5bc65f 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -35,6 +35,7 @@ void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); +bool is_zswap_enabled(void); #else struct zswap_lruvec_state {}; @@ -55,6 +56,12 @@ static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} static inline void zswap_folio_swapin(struct folio *folio) {} + +static inline bool is_zswap_enabled(void) +{ + return false; +} + #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7077841754bb..6f5bbadef31e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7294,6 +7294,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg->pagecache_max_ratio = PAGECACHE_MAX_RATIO_MAX; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) memcg->zswap_max = PAGE_COUNTER_MAX; + WRITE_ONCE(memcg->zswap_writeback, + !parent || READ_ONCE(parent->zswap_writeback)); #endif #ifdef CONFIG_MEMCG_ZRAM memcg->zram_max = PAGE_COUNTER_MAX; @@ -10150,6 +10152,12 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) rcu_read_unlock(); } +bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) +{ + /* if zswap is disabled, do not block pages going to the swapping device */ + return !is_zswap_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); +} + static u64 zswap_current_read(struct 
cgroup_subsys_state *css, struct cftype *cft) { @@ -10182,6 +10190,31 @@ static ssize_t zswap_max_write(struct kernfs_open_file *of, return nbytes; } +static int zswap_writeback_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback)); + return 0; +} + +static ssize_t zswap_writeback_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int zswap_writeback; + ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback); + + if (parse_ret) + return parse_ret; + + if (zswap_writeback != 0 && zswap_writeback != 1) + return -EINVAL; + + WRITE_ONCE(memcg->zswap_writeback, zswap_writeback); + return nbytes; +} + static struct cftype zswap_files[] = { { .name = "zswap.current", @@ -10194,6 +10227,11 @@ static struct cftype zswap_files[] = { .seq_show = zswap_max_show, .write = zswap_max_write, }, + { + .name = "zswap.writeback", + .seq_show = zswap_writeback_show, + .write = zswap_writeback_write, + }, { } /* terminate */ }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ diff --git a/mm/page_io.c b/mm/page_io.c index fb68d28d8219..8a3e9e6d59c3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -200,6 +200,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_end_writeback(folio); return 0; } + + if (!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { + folio_mark_dirty(folio); + return AOP_WRITEPAGE_ACTIVATE; + } + ret = __swap_writepage(folio, wbc); return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 111233bd2ee3..53e6a417f69f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1529,8 +1529,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - swap_writepage(&folio->page, wbc); - return 0; + return swap_writepage(&folio->page, wbc); } if (!info->swapped) list_del_init(&info->swaplist); diff --git a/mm/zswap.c b/mm/zswap.c index a8ee59269fb4..7cfe7b47643f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -151,6 +151,11 @@ static bool zswap_shrinker_enabled = IS_ENABLED( CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); +bool is_zswap_enabled(void) +{ + return zswap_enabled; +} + /********************************* * data structures **********************************/ @@ -593,7 +598,8 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct zswap_pool *pool = shrinker->private_data; bool encountered_page_in_swapcache = false; - if (!zswap_shrinker_enabled) { + if (!zswap_shrinker_enabled || + !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { sc->nr_scanned = 0; return SHRINK_STOP; } @@ -634,7 +640,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; - if (!zswap_shrinker_enabled) + if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; #ifdef CONFIG_MEMCG_KMEM @@ -920,6 +926,9 @@ static int shrink_memcg(struct mem_cgroup *memcg) struct zswap_pool *pool; int nid, shrunk = 0; + if (!mem_cgroup_zswap_writeback_enabled(memcg)) + return -EINVAL; + /* * Skip zombies because their LRUs are reparented and we would be * reclaiming from the parent instead of the dead memcg. 
-- Gitee From 00f8631419f78147116be076b1409eb37cfe68f0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:34:38 -0500 Subject: [PATCH 155/484] mm: zswap: fix objcg use-after-free in entry destruction commit 2e601e1e8e4b330020a346c55ba111d49e0b188e upstream Conflicts: none Backport-reason: ZSWAP mass udpate In the per-memcg LRU universe, LRU removal uses entry->objcg to determine which list count needs to be decreased. Drop the objcg reference after updating the LRU, to fix a possible use-after-free. Link: https://lkml.kernel.org/r/20240130013438.565167-1-hannes@cmpxchg.org Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware") Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7cfe7b47643f..25cb7d329b79 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -533,10 +533,6 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - if (entry->objcg) { - obj_cgroup_uncharge_zswap(entry->objcg, entry->length); - obj_cgroup_put(entry->objcg); - } if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { @@ -545,6 +541,10 @@ static void zswap_free_entry(struct zswap_entry *entry) atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); -- Gitee From 707c0c78a024c852daf540448651a47896816d18 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:49 +0000 Subject: [PATCH 156/484] mm/zswap: don't return LRU_SKIP if we have dropped lru lock commit 27d3969b47cc38810b3fd65d72231940e8671e6c upstream Conflicts: none Backport-reason: ZSWAP mass udpate LRU_SKIP can only be returned if we don't ever dropped lru lock, or we need to return LRU_RETRY to restart from the head of lru list. Otherwise, the iteration might continue from a cursor position that was freed while the locks were dropped. Actually we may need to introduce another LRU_STOP to really terminate the ongoing shrinking scan process, when we encounter a warm page already in the swap cache. The current list_lru implementation doesn't have this function to early break from __list_lru_walk_one. Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-1-b10479847099@bytedance.com Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 25cb7d329b79..317b821694f7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -892,10 +892,8 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). 
*/ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) { - ret = LRU_SKIP; + if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; - } goto put_unlock; } -- Gitee From f2b968fb086eff5fc28655bf73e0bdc8f93244bb Mon Sep 17 00:00:00 2001 From: Ronald Monthero Date: Tue, 16 Jan 2024 23:31:45 +1000 Subject: [PATCH 157/484] mm/zswap: improve with alloc_workqueue() call commit 8409a385a6b41b7042aae1a4e4053963da997ffe upstream Conflicts: none Backport-reason: ZSWAP mass udpate The core-api create_workqueue is deprecated, this patch replaces the create_workqueue with alloc_workqueue. The previous implementation workqueue of zswap was a bounded workqueue, this patch uses alloc_workqueue() to create an unbounded workqueue. The WQ_UNBOUND attribute is desirable making the workqueue not localized to a specific cpu so that the scheduler is free to exercise improvisations in any demanding scenarios for offloading cpu time slices for workqueues. For example if any other workqueues of the same primary cpu had to be served which are WQ_HIGHPRI and WQ_CPU_INTENSIVE. Also Unbound workqueue happens to be more efficient in a system during memory pressure scenarios in comparison to a bounded workqueue. shrink_wq = alloc_workqueue("zswap-shrink", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); Overall the change suggested in this patch should be seamless and does not alter the existing behavior, other than the improvisation to be an unbounded workqueue. Link: https://lkml.kernel.org/r/20240116133145.12454-1-debug.penguin32@gmail.com Signed-off-by: Ronald Monthero Acked-by: Nhat Pham Acked-by: Johannes Weiner Cc: Chris Li Cc: Dan Streetman Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 317b821694f7..2f5285b20099 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1888,7 +1888,8 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = create_workqueue("zswap-shrink"); + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!shrink_wq) goto fallback_fail; -- Gitee From d2467d7ced21c1eadfa7cc1f07c5d4520f6403e9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:22 +0000 Subject: [PATCH 158/484] mm/zswap: make sure each swapfile always have zswap rb-tree commit bb29fd7760ae39905127afd31fc83294625ff704 upstream Conflicts: resolved Backport-reason: ZSWAP mass update Patch series "mm/zswap: optimize the scalability of zswap rb-tree", v2. When testing the zswap performance by using kernel build -j32 in a tmpfs directory, I found the scalability of zswap rb-tree is not good, which is protected by the only spinlock. That would cause heavy lock contention if multiple tasks zswap_store/load concurrently. So a simple solution is to split the only one zswap rb-tree into multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M). This idea is from the commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"). Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below is the results of kernel build in tmpfs with zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clearly improvements. And it's complementary with the ongoing zswap xarray conversion by Chris. 
Anyway, I think we can also merge this first, it's complementary IMHO. So I just refresh and resend this for further discussion. This patch (of 2): Not all zswap interfaces can handle the absence of the zswap rb-tree, actually only zswap_store() has handled it for now. To make things simple, we make sure each swapfile always have the zswap rb-tree prepared before being enabled and used. The preparation is unlikely to fail in practice, this patch just make it explicit. Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-0-b5cc55479090@bytedance.com Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-1-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zswap.h | 7 +++++-- mm/swapfile.c | 10 +++++++--- mm/zswap.c | 8 +++----- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 0b709f5bc65f..eca388229d9a 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -void zswap_swapon(int type); +int zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,10 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline void zswap_swapon(int type) {} +static inline int zswap_swapon(int type) +{ + return 0; +} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} diff --git a/mm/swapfile.c b/mm/swapfile.c index 67cb96133272..b446b8d3fc4a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2546,8 +2546,6 @@ static void enable_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - zswap_swapon(si->type); - spin_lock(&swap_lock); spin_lock(&si->lock); setup_swap_info(si, prio, swap_map, cluster_info); @@ -3369,6 +3367,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; + error = zswap_swapon(p->type); + if (error) + goto free_swap_address_space; + /* * Flush any pending IO and dirty mappings before we start using this * swap device. 
@@ -3377,7 +3379,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; - goto free_swap_address_space; + goto free_swap_zswap; } mutex_lock(&swapon_mutex); @@ -3401,6 +3403,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; +free_swap_zswap: + zswap_swapoff(p->type); free_swap_address_space: exit_swap_address_space(si->type); bad_swap_unlock_inode: diff --git a/mm/zswap.c b/mm/zswap.c index 2f5285b20099..77dcc0f435e1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1523,9 +1523,6 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!tree) - return false; - /* * If this is a duplicate, it must be removed before attempting to store * it, otherwise, if the store fails the old page won't be removed from @@ -1778,19 +1775,20 @@ void zswap_invalidate(int type, pgoff_t offset) spin_unlock(&tree->lock); } -void zswap_swapon(int type) +int zswap_swapon(int type) { struct zswap_tree *tree; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; + return -ENOMEM; } tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; + return 0; } void zswap_swapoff(int type) -- Gitee From af99f008d94b6b9f964a2799241158aad72fcfbf Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:23 +0000 Subject: [PATCH 159/484] mm/zswap: split zswap rb-tree commit 44c7c734a5132fc02f5584c7207f1d0c483f3ccd upstream Conflicts: trivial, rename p to si. Backport-reason: ZSWAP mass update Each swapfile has one rb-tree to search the mapping of swp_entry_t to zswap_entry, protected by a single spinlock, which can cause heavy lock contention if multiple tasks zswap_store/load concurrently. Optimize the scalability problem by splitting the zswap rb-tree into multiple rb-trees, each corresponding to SWAP_ADDRESS_SPACE_PAGES (64M), just like we did in the swap cache address_space splitting. Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below are the results of a kernel build in tmpfs with the zswap shrinker enabled:

             linux-next    zswap-lock-optimize
real         1m9.181s      1m3.820s
user         17m44.036s    17m40.100s
sys          7m37.297s     4m54.622s

So there are clearly improvements.
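As a rough sketch of the resulting layout (this only mirrors the swap_zswap_tree() helper added in the diff below, it introduces nothing new): each swapfile gets an array of trees, one per SWAP_ADDRESS_SPACE_PAGES (64M) worth of swap slots, and an entry is routed to its sub-tree by offset, so concurrent stores/loads only contend on the lock covering their own 64M range:

	static struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
	{
		return &zswap_trees[swp_type(swp)][swp_offset(swp)
			>> SWAP_ADDRESS_SPACE_SHIFT];
	}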
Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-2-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zswap.h | 4 +-- mm/swapfile.c | 4 +-- mm/zswap.c | 71 ++++++++++++++++++++++++++++--------------- 3 files changed, 50 insertions(+), 29 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index eca388229d9a..91895ce1fdbc 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -int zswap_swapon(int type); +int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,7 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline int zswap_swapon(int type) +static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index b446b8d3fc4a..fcbaa1bbc747 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3367,7 +3367,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - error = zswap_swapon(p->type); + error = zswap_swapon(si->type, maxpages); if (error) goto free_swap_address_space; @@ -3404,7 +3404,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; free_swap_zswap: - zswap_swapoff(p->type); + zswap_swapoff(si->type); free_swap_address_space: exit_swap_address_space(si->type); bad_swap_unlock_inode: diff --git a/mm/zswap.c b/mm/zswap.c index 77dcc0f435e1..d5686c5b510b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -237,6 +237,7 @@ struct zswap_tree { }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); @@ -263,6 +264,12 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +{ + return &zswap_trees[swp_type(swp)][swp_offset(swp) + >> SWAP_ADDRESS_SPACE_SHIFT]; +} + #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) @@ -862,7 +869,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * until the entry is verified to still be alive in the tree. 
*/ swpoffset = swp_offset(entry->swpentry); - tree = zswap_trees[swp_type(entry->swpentry)]; + tree = swap_zswap_tree(entry->swpentry); list_lru_isolate(l, item); /* * It's safe to drop the lock here because we return either @@ -1498,10 +1505,9 @@ static void zswap_fill_page(void *ptr, unsigned long value) bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; @@ -1574,7 +1580,7 @@ bool zswap_store(struct folio *folio) src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp_entry(type, offset); + entry->swpentry = swp; entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1654,7 +1660,7 @@ bool zswap_store(struct folio *folio) mutex_unlock(&acomp_ctx->mutex); /* populate entry */ - entry->swpentry = swp_entry(type, offset); + entry->swpentry = swp; entry->handle = handle; entry->length = dlen; @@ -1714,10 +1720,9 @@ bool zswap_store(struct folio *folio) bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; @@ -1760,7 +1765,7 @@ bool zswap_load(struct folio *folio) void zswap_invalidate(int type, pgoff_t offset) { - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); struct zswap_entry *entry; /* find */ @@ -1775,37 +1780,53 @@ void zswap_invalidate(int type, pgoff_t offset) spin_unlock(&tree->lock); } -int zswap_swapon(int type) +int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *tree; + struct zswap_tree *trees, *tree; + unsigned int nr, i; - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); + if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return -ENOMEM; } - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; + for (i = 0; i < nr; i++) { + tree = trees + i; + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + } + + nr_zswap_trees[type] = nr; + zswap_trees[type] = trees; return 0; } void zswap_swapoff(int type) { - struct zswap_tree *tree = zswap_trees[type]; - struct zswap_entry *entry, *n; + struct zswap_tree *trees = zswap_trees[type]; + unsigned int i; - if (!tree) + if (!trees) return; - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - kfree(tree); + for (i = 0; i < nr_zswap_trees[type]; i++) { + struct zswap_tree *tree = trees + i; + struct zswap_entry *entry, *n; + + /* walk the tree and free everything */ + spin_lock(&tree->lock); + rbtree_postorder_for_each_entry_safe(entry, n, + &tree->rbroot, + rbnode) + zswap_free_entry(entry); + tree->rbroot = RB_ROOT; + spin_unlock(&tree->lock); + } + + kvfree(trees); + nr_zswap_trees[type] = 0; zswap_trees[type] = NULL; } -- Gitee From 
9761a32da2241ca9a1cfbe02963f24d0accc5a93 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 04:51:12 +0000 Subject: [PATCH 160/484] mm: zswap: remove unnecessary trees cleanups in zswap_swapoff() commit 83e68f25decdc8d133dbfedd18b93f5267430049 upstream Conflicts: none Backport-reason: ZSWAP mass udpate During swapoff, try_to_unuse() makes sure that zswap_invalidate() is called for all swap entries before zswap_swapoff() is called. This means that all zswap entries should already be removed from the tree. Simplify zswap_swapoff() by removing the trees cleanup code, and leave an assertion in its place. Link: https://lkml.kernel.org/r/20240124045113.415378-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Chris Li Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index d5686c5b510b..70ba3a0af140 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1811,19 +1811,9 @@ void zswap_swapoff(int type) if (!trees) return; - for (i = 0; i < nr_zswap_trees[type]; i++) { - struct zswap_tree *tree = trees + i; - struct zswap_entry *entry, *n; - - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, - &tree->rbroot, - rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - } + /* try_to_unuse() invalidated all the entries already */ + for (i = 0; i < nr_zswap_trees[type]; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); kvfree(trees); nr_zswap_trees[type] = 0; -- Gitee From 64610977aa80f33cd44c0b5864dc09897cf279c9 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 25 Jan 2024 08:14:23 +0000 Subject: [PATCH 161/484] mm: zswap: remove unused tree argument in zswap_entry_put() commit db128f5fdee9d9298c19d7137add7f7af840d9f7 upstream Conflicts: none Backport-reason: ZSWAP mass udpate Commit 7310895779624 ("mm: zswap: tighten up entry invalidation") removed the usage of tree argument, delete it. 
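As a rough sketch of the call pattern after this cleanup (it only mirrors the hunks below, nothing new): the caller that erases from the tree still knows the tree, while the put itself only needs the entry:

	static void zswap_invalidate_entry(struct zswap_tree *tree,
					   struct zswap_entry *entry)
	{
		if (zswap_rb_erase(&tree->rbroot, entry))
			zswap_entry_put(entry);
	}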
Link: https://lkml.kernel.org/r/20240125081423.1200336-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 70ba3a0af140..7fa4961d0db1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -566,8 +566,7 @@ static void zswap_entry_get(struct zswap_entry *entry) /* caller must hold the tree lock * remove from the tree and free it, if nobody reference the entry */ -static void zswap_entry_put(struct zswap_tree *tree, - struct zswap_entry *entry) +static void zswap_entry_put(struct zswap_entry *entry) { int refcount = --entry->refcount; @@ -850,7 +849,7 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(tree, entry); + zswap_entry_put(entry); } static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, @@ -919,7 +918,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o put_unlock: /* Drop local reference */ - zswap_entry_put(tree, entry); + zswap_entry_put(entry); unlock: spin_unlock(&tree->lock); spin_lock(lock); @@ -1757,7 +1756,7 @@ bool zswap_load(struct folio *folio) zswap_lru_del(&entry->pool->list_lru, entry); zswap_lru_add(&entry->pool->list_lru, entry); } - zswap_entry_put(tree, entry); + zswap_entry_put(entry); spin_unlock(&tree->lock); return true; -- Gitee From 19a934cc746b9915b4e18fa9effc30a43ef959a5 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:50 +0000 Subject: [PATCH 162/484] mm/zswap: fix race between lru writeback and swapoff commit 5878303c5353bdd9b7949db9edbaded53c150173 upstream Conflicts: trivial, resolved Backport-reason: ZSWAP mass update LRU writeback has a race problem with swapoff, as spotted by Yosry [1]:

CPU1                           CPU2
shrink_memcg_cb                swap_off
  list_lru_isolate
                               zswap_invalidate
                               zswap_swapoff
                                 kfree(tree)
  // UAF
  spin_lock(&tree->lock)

The problem is that the entry in the lru list can't protect the tree from being swapped off and freed, and the entry can also be invalidated and freed concurrently after we unlock the lru lock. We can fix it by moving the swap cache allocation ahead of referencing the tree, then checking for the invalidate race under the tree lock; only after that can we safely deref the entry. Note we can't deref the entry or tree anymore after we unlock the folio, since we depend on this to hold off swapoff. So this patch moves all tree and entry usage to zswap_writeback_entry(); we only use the copied swpentry on the stack to allocate the swap cache, and if it returns with the folio locked we can reference the tree safely. Then we can check the invalidate race with the tree lock; the rest is much the same as zswap_load(). Since we can't deref the entry after zswap_writeback_entry(), we can't use zswap_lru_putback() anymore; instead we rotate the entry at the beginning. It will be unlinked and freed when invalidated if writeback succeeds. Another change is that we no longer update the memcg nr_zswap_protected in the -ENOMEM and -EEXIST cases. The -EEXIST case means we raced with swapin or a concurrent shrinker action; since swapin has already updated the memcg's nr_zswap_protected, we don't need to double count here. For a concurrent shrinker, the folio will be written back and freed anyway.
-ENOMEM case is extremely rare and doesn't happen spuriously either, so don't bother distinguishing this case. [1] https://lore.kernel.org/all/CAJD7tkasHsRnT_75-TXsEe58V9_OW6m3g6CF7Kmsvz8CKRG_EA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-2-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 114 +++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 65 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7fa4961d0db1..a5317a1aca4d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -275,7 +275,7 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree); + swp_entry_t swpentry); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); @@ -442,27 +442,6 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) rcu_read_unlock(); } -static void zswap_lru_putback(struct list_lru *list_lru, - struct zswap_entry *entry) -{ - int nid = entry_to_nid(entry); - spinlock_t *lock = &list_lru->node[nid].lock; - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - spin_lock(lock); - /* we cannot use list_lru_add here, because it increments node's lru count */ - list_lru_putback(list_lru, &entry->lru, nid, memcg); - spin_unlock(lock); - - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); - /* increment the protection area to account for the LRU rotation. */ - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - rcu_read_unlock(); -} - /********************************* * rbtree functions **********************************/ @@ -857,40 +836,47 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; - struct zswap_tree *tree; - pgoff_t swpoffset; + swp_entry_t swpentry; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; + /* + * Rotate the entry to the tail before unlocking the LRU, + * so that in case of an invalidation race concurrent + * reclaimers don't waste their time on it. + * + * If writeback succeeds, or failure is due to the entry + * being invalidated by the swap subsystem, the invalidation + * will unlink and free it. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + * + * But since they do exist in theory, the entry cannot just + * be unlinked, or we could leak it. Hence, rotate. + */ + list_move_tail(item, &l->list); + /* * Once the lru lock is dropped, the entry might get freed. The - * swpoffset is copied to the stack, and entry isn't deref'd again + * swpentry is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree. 
*/ - swpoffset = swp_offset(entry->swpentry); - tree = swap_zswap_tree(entry->swpentry); - list_lru_isolate(l, item); + swpentry = entry->swpentry; + /* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ spin_unlock(lock); - /* Check for invalidate() race */ - spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) - goto unlock; - - /* Hold a reference to prevent a free during writeback */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); + writeback_result = zswap_writeback_entry(entry, swpentry); - writeback_result = zswap_writeback_entry(entry, tree); - - spin_lock(&tree->lock); if (writeback_result) { zswap_reject_reclaim_fail++; - zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; /* @@ -900,27 +886,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o */ if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; - - goto put_unlock; + } else { + zswap_written_back_pages++; } - zswap_written_back_pages++; - - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - count_vm_event(ZSWPWB); - /* - * Writeback started successfully, the page now belongs to the - * swapcache. Drop the entry from zswap - unless invalidate already - * took it out while we had the tree->lock released for IO. - */ - zswap_invalidate_entry(tree, entry); - -put_unlock: - /* Drop local reference */ - zswap_entry_put(entry); -unlock: - spin_unlock(&tree->lock); spin_lock(lock); return ret; } @@ -1405,10 +1374,10 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree) + swp_entry_t swpentry) { int ret; - swp_entry_t swpentry = entry->swpentry; + struct zswap_tree *tree; struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1430,9 +1399,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -ENOMEM; /* - * Found an existing folio, we raced with load/swapin. We generally - * writeback cold folios from zswap, and swapin means the folio just - * became hot. Skip this folio and let the caller find another one. + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { folio_put(folio); @@ -1446,18 +1417,31 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * backs (our zswap_entry reference doesn't prevent that), to * avoid overwriting a new swap folio with old compressed data. */ + tree = swap_zswap_tree(swpentry); spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } + + /* Safe to deref entry after the entry is verified above. 
*/ + zswap_entry_get(entry); spin_unlock(&tree->lock); __zswap_load(entry, &folio->page); + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + spin_lock(&tree->lock); + zswap_invalidate_entry(tree, entry); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + /* folio is up to date */ folio_mark_uptodate(folio); -- Gitee From ca1c7c7855864ed8aea6ea74c240a020f8a315fd Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:51 +0000 Subject: [PATCH 163/484] mm/list_lru: remove list_lru_putback() commit 3f798aa6121ab3eb572f96ab2d8558894d979a4c upstream Conflicts: none Backport-reason: ZSWAP mass udpate Since the only user zswap_lru_putback() has gone, remove list_lru_putback() too. Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-3-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/list_lru.h | 16 ---------------- mm/list_lru.c | 14 -------------- mm/zswap.c | 2 +- 3 files changed, 1 insertion(+), 31 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 7675a48a0701..9e184364a656 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -170,22 +170,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); -/** - * list_lru_putback: undo list_lru_isolate - * @lru: the lru pointer. - * @item: the item to put back. - * @nid: the node id of the sublist to put the item back to. - * @memcg: the cgroup of the sublist to put the item back to. - * - * Put back an isolated item into its original LRU. Note that unlike - * list_lru_add, this does not increment the node LRU count (as - * list_lru_isolate does not originally decrement this count). - * - * Since we might have dropped the LRU lock in between, recompute list_lru_one - * from the node's id and memcg. - */ -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); diff --git a/mm/list_lru.c b/mm/list_lru.c index fcca67ac26ec..b8910e99d986 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -193,20 +193,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) -{ - struct list_lru_one *list = - list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - - if (list_empty(item)) { - list_add_tail(item, &list->list); - if (!list->nr_items++) - set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - } -} -EXPORT_SYMBOL_GPL(list_lru_putback); - unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { diff --git a/mm/zswap.c b/mm/zswap.c index a5317a1aca4d..f0e80b817265 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -408,7 +408,7 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The * new entry will be added directly to memcg's parent's list_lru. * - * Similar reasoning holds for list_lru_del() and list_lru_putback(). 
+ * Similar reasoning holds for list_lru_del(). */ rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); -- Gitee From 736a996f40631e754b0d978ebfb0ce85ad8fcb9c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:37 -0500 Subject: [PATCH 164/484] mm: zswap: rename zswap_free_entry to zswap_entry_free commit 42398be2adb12096500e61f9585f05d0a479bf57 upstream Conflicts: none Backport-reason: ZSWAP mass udpate There is a zswap_entry_ namespace with multiple functions already. Link: https://lkml.kernel.org/r/20240130014208.565554-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f0e80b817265..f1594ce76365 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -517,7 +517,7 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry) * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ -static void zswap_free_entry(struct zswap_entry *entry) +static void zswap_entry_free(struct zswap_entry *entry) { if (!entry->length) atomic_dec(&zswap_same_filled_pages); @@ -552,7 +552,7 @@ static void zswap_entry_put(struct zswap_entry *entry) WARN_ON_ONCE(refcount < 0); if (refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_free_entry(entry); + zswap_entry_free(entry); } } -- Gitee From df0617d23792d7d5c764d2ff06ad60d748809279 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:38 -0500 Subject: [PATCH 165/484] mm: zswap: inline and remove zswap_entry_find_get() commit 5b297f70bb26b9041d5badd0d51d9227e25fc1eb upstream Conflicts: none Backport-reason: ZSWAP mass udpate There is only one caller and the function is trivial. Inline it. 
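As a rough sketch of what the inlined code looks like in zswap_load() (it only mirrors the hunk below, nothing new): the lookup and the reference grab now sit back to back under the tree lock:

	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		spin_unlock(&tree->lock);
		return false;
	}
	zswap_entry_get(entry);
	spin_unlock(&tree->lock);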
Link: https://lkml.kernel.org/r/20240130014208.565554-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f1594ce76365..b79a7a3ede7b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -556,19 +556,6 @@ static void zswap_entry_put(struct zswap_entry *entry) } } -/* caller must hold the tree lock */ -static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, - pgoff_t offset) -{ - struct zswap_entry *entry; - - entry = zswap_rb_search(root, offset); - if (entry) - zswap_entry_get(entry); - - return entry; -} - /********************************* * shrinker functions **********************************/ @@ -1711,13 +1698,13 @@ bool zswap_load(struct folio *folio) VM_WARN_ON_ONCE(!folio_test_locked(folio)); - /* find */ spin_lock(&tree->lock); - entry = zswap_entry_find_get(&tree->rbroot, offset); + entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } + zswap_entry_get(entry); spin_unlock(&tree->lock); if (entry->length) -- Gitee From 5818c8c5951bfb44030e4af82a9a3991a9609f39 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:39 -0500 Subject: [PATCH 166/484] mm: zswap: move zswap_invalidate_entry() to related functions commit 7dd1f7f0fc1c98525cbadf954eb6b7672ad4acea upstream Conflicts: none Backport-reason: ZSWAP mass udpate Move it up to the other tree and refcounting functions. Link: https://lkml.kernel.org/r/20240130014208.565554-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index b79a7a3ede7b..ccca3a3e04c8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -556,6 +556,18 @@ static void zswap_entry_put(struct zswap_entry *entry) } } +/* + * If the entry is still valid in the tree, drop the initial ref and remove it + * from the tree. This function must be called with an additional ref held, + * otherwise it may race with another invalidation freeing the entry. + */ +static void zswap_invalidate_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + if (zswap_rb_erase(&tree->rbroot, entry)) + zswap_entry_put(entry); +} + /********************************* * shrinker functions **********************************/ @@ -806,18 +818,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } -/* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. 
- */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(entry); -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { -- Gitee From 29809b460db8aac5d7a526b7b59c08a20a2b2ce2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:40 -0500 Subject: [PATCH 167/484] mm: zswap: warn when referencing a dead entry commit e477559ca602a033e3970ebdbbfb39773345d694 upstream Conflicts: none Backport-reason: ZSWAP mass udpate Put a standard sanity check on zswap_entry_get() for UAF scenario. Link: https://lkml.kernel.org/r/20240130014208.565554-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/zswap.c b/mm/zswap.c index ccca3a3e04c8..93f30c84ce38 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -539,6 +539,7 @@ static void zswap_entry_free(struct zswap_entry *entry) /* caller must hold the tree lock */ static void zswap_entry_get(struct zswap_entry *entry) { + WARN_ON_ONCE(!entry->refcount); entry->refcount++; } -- Gitee From cb3676943fff32c0e4f56ec710232f0efeb09b75 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:41 -0500 Subject: [PATCH 168/484] mm: zswap: clean up zswap_entry_put() commit dab7711fac6dd3c45471bfb3b6b5336e8fa900a2 upstream Conflicts: none Backport-reason: ZSWAP mass udpate Remove stale comment and unnecessary local variable. Link: https://lkml.kernel.org/r/20240130014208.565554-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 93f30c84ce38..cc2d3d1439f3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -543,15 +543,11 @@ static void zswap_entry_get(struct zswap_entry *entry) entry->refcount++; } -/* caller must hold the tree lock -* remove from the tree and free it, if nobody reference the entry -*/ +/* caller must hold the tree lock */ static void zswap_entry_put(struct zswap_entry *entry) { - int refcount = --entry->refcount; - - WARN_ON_ONCE(refcount < 0); - if (refcount == 0) { + WARN_ON_ONCE(!entry->refcount); + if (--entry->refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_entry_free(entry); } -- Gitee From 04255847c3a476d7561848a6c1095527d62f3f81 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:42 -0500 Subject: [PATCH 169/484] mm: zswap: rename __zswap_load() to zswap_decompress() commit ff2972aa1b5d527d982af3925258be78d26c6625 upstream Conflicts: none Backport-reason: ZSWAP mass udpate Link: https://lkml.kernel.org/r/20240130014208.565554-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index cc2d3d1439f3..7a9f23f07fce 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1313,7 +1313,7 @@ static int zswap_enabled_param_set(const char *val, return ret; } -static void __zswap_load(struct zswap_entry *entry, struct page *page) +static void 
zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; @@ -1415,7 +1415,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, zswap_entry_get(entry); spin_unlock(&tree->lock); - __zswap_load(entry, &folio->page); + zswap_decompress(entry, &folio->page); count_vm_event(ZSWPWB); if (entry->objcg) @@ -1705,7 +1705,7 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); if (entry->length) - __zswap_load(entry, page); + zswap_decompress(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); -- Gitee From 808045fcdec53749395d8d9641e94b5f32b3a821 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 24 Oct 2023 16:45:09 -0700 Subject: [PATCH 170/484] zswap: export compression failure stats commit cb61dad80fdc6a8f01b101e823a9335cd6d61071 upstream Conflicts: none Backport-reason: ZSWAP mass udpate During a zswap store attempt, the compression algorithm could fail (for e.g due to the page containing incompressible random data). This is not tracked in any of existing zswap counters, making it hard to monitor for and investigate. We have run into this problem several times in our internal investigations on zswap store failures. This patch adds a dedicated debugfs counter for compression algorithm failures. Link: https://lkml.kernel.org/r/20231024234509.2680539-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7a9f23f07fce..ef7296d9f2f6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -63,6 +63,8 @@ static u64 zswap_pool_limit_hit; static u64 zswap_written_back_pages; /* Store failed due to a reclaim failure after pool limit was reached */ static u64 zswap_reject_reclaim_fail; +/* Store failed due to compression algorithm failure */ +static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; /* Store failed because underlying allocator could not get memory */ @@ -1604,8 +1606,10 @@ bool zswap_store(struct folio *folio) ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (ret) + if (ret) { + zswap_reject_compress_fail++; goto put_dstmem; + } /* store */ zpool = zswap_find_zpool(entry); @@ -1810,6 +1814,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", 0444, zswap_debugfs_root, &zswap_reject_kmemcache_fail); + debugfs_create_u64("reject_compress_fail", 0444, + zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, -- Gitee From 451ace6a52f10b80d9fb7ca853b78f4ba7524638 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:43 -0500 Subject: [PATCH 171/484] mm: zswap: break out zwap_compress() commit fa9ad6e21003691905f6caa307f9e5bf27ac1c23 upstream Conflicts: none Backport-reason: ZSWAP mass udpate zswap_store() is long and mixes work at the zswap layer with work at the backend and compression layer. 
Move compression & backend work to zswap_compress(), mirroring zswap_decompress(). Link: https://lkml.kernel.org/r/20240130014208.565554-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 145 ++++++++++++++++++++++++++++------------------------- 1 file changed, 77 insertions(+), 68 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index ef7296d9f2f6..4316e21962b7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1315,6 +1315,79 @@ static int zswap_enabled_param_set(const char *val, return ret; } +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + int ret; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. 
+ */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { + zswap_reject_compress_fail++; + goto unlock; + } + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto unlock; + } + if (ret) { + zswap_reject_alloc_fail++; + goto unlock; + } + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + mutex_unlock(&acomp_ctx->mutex); + return ret == 0; +} + static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); @@ -1478,18 +1551,11 @@ bool zswap_store(struct folio *folio) struct page *page = &folio->page; struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - struct zpool *zpool; - unsigned int dlen = PAGE_SIZE; - unsigned long handle, value; - char *buf; - u8 *src, *dst; - gfp_t gfp; - int ret; + unsigned long value; + u8 *src; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1575,65 +1641,10 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* compress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + if (!zswap_compress(folio, entry)) + goto put_pool; - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. 
- */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - - if (ret) { - zswap_reject_compress_fail++; - goto put_dstmem; - } - - /* store */ - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto put_dstmem; - } - if (ret) { - zswap_reject_alloc_fail++; - goto put_dstmem; - } - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - mutex_unlock(&acomp_ctx->mutex); - - /* populate entry */ entry->swpentry = swp; - entry->handle = handle; - entry->length = dlen; insert_entry: entry->objcg = objcg; @@ -1670,8 +1681,6 @@ bool zswap_store(struct folio *folio) return true; -put_dstmem: - mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: -- Gitee From 9c10b1807dfa208b67c3570a42cdca08bde8d03f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:44 -0500 Subject: [PATCH 172/484] mm: zswap: further cleanup zswap_store() commit be7fc97c52835b931682d79121530773b67e8307 upstream Conflicts: none Backport-reason: ZSWAP mass udpate - Remove dupentry, reusing entry works just fine. - Rename pool to shrink_pool, as this one actually is confusing. - Remove page, use folio_nid() and kmap_local_folio() directly. - Set entry->swpentry in a common path. - Move value and src to local scope of use. Link: https://lkml.kernel.org/r/20240130014208.565554-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 4316e21962b7..6907c2db7bd3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1548,14 +1548,11 @@ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *pool; - unsigned long value; - u8 *src; + struct zswap_pool *shrink_pool; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1570,10 +1567,10 @@ bool zswap_store(struct folio *folio) * the tree, and it might be written back overriding the new data. 
*/ spin_lock(&tree->lock); - dupentry = zswap_rb_search(&tree->rbroot, offset); - if (dupentry) { + entry = zswap_rb_search(&tree->rbroot, offset); + if (entry) { + zswap_invalidate_entry(tree, entry); zswap_duplicate_entry++; - zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); @@ -1605,17 +1602,19 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); + entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } if (zswap_same_filled_pages_enabled) { - src = kmap_local_page(page); + unsigned long value; + u8 *src; + + src = kmap_local_folio(folio, 0); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp; entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1644,9 +1643,8 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; - entry->swpentry = swp; - insert_entry: + entry->swpentry = swp; entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); @@ -1691,9 +1689,9 @@ bool zswap_store(struct folio *folio) return false; shrink: - pool = zswap_pool_last_get(); - if (pool && !queue_work(shrink_wq, &pool->shrink_work)) - zswap_pool_put(pool); + shrink_pool = zswap_pool_last_get(); + if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work)) + zswap_pool_put(shrink_pool); goto reject; } -- Gitee From 861c3ac83fbd57977b88a6f1209f8714fce4bc63 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:45 -0500 Subject: [PATCH 173/484] mm: zswap: simplify zswap_invalidate() commit 06ed22890cf9749b0e45d59c5cdc6e46dbd7339d upstream Conflicts: none Backport-reason: ZSWAP mass udpate The branching is awkward and duplicates code. The comment about writeback is also misleading: yes, the entry might have been written back. Or it might have never been stored in zswap to begin with due to a rejection - zswap_invalidate() is called on all exiting swap entries. Link: https://lkml.kernel.org/r/20240130014208.565554-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 6907c2db7bd3..157865251a83 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1746,15 +1746,10 @@ void zswap_invalidate(int type, pgoff_t offset) struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); struct zswap_entry *entry; - /* find */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - /* entry was written back */ - spin_unlock(&tree->lock); - return; - } - zswap_invalidate_entry(tree, entry); + if (entry) + zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } -- Gitee From f6dcb74e65bec482570b4c0aeb652ec20caa967d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:46 -0500 Subject: [PATCH 174/484] mm: zswap: function ordering: pool alloc & free commit a984649b5c1f34103638a960b255bd695c2a9445 upstream Conflicts: none Backport-reason: ZSWAP mass udpate The function ordering in zswap.c is a little chaotic, which requires jumping in unexpected directions when following related code. 
This is a series of patches that brings the file into the following order: - pool functions - lru functions - rbtree functions - zswap entry functions - compression/backend functions - writeback & shrinking functions - store, load, invalidate, swapon, swapoff - debugfs - init But it has to be split up such the moving still produces halfway readable diffs. In this patch, move pool allocation and freeing functions. Link: https://lkml.kernel.org/r/20240130014208.565554-11-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 297 +++++++++++++++++++++++++++-------------------------- 1 file changed, 152 insertions(+), 145 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 157865251a83..a01a34fa2254 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -320,6 +320,158 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } +/********************************* +* pool functions +**********************************/ + +static void zswap_alloc_shrinker(struct zswap_pool *pool); +static void shrink_worker(struct work_struct *w); + +static struct zswap_pool *zswap_pool_create(char *type, char *compressor) +{ + int i; + struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + int ret; + + if (!zswap_has_pool) { + /* if either are unset, pool initialization failed, and we + * need both params to be set correctly before trying to + * create a pool. + */ + if (!strcmp(type, ZSWAP_PARAM_UNSET)) + return NULL; + if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; + } + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", + atomic_inc_return(&zswap_pools_count)); + + pool->zpools[i] = zpool_create_pool(type, name, gfp); + if (!pool->zpools[i]) { + pr_err("%s zpool not available\n", type); + goto error; + } + } + pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); + + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + + pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + if (!pool->acomp_ctx) { + pr_err("percpu alloc failed\n"); + goto error; + } + + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, + &pool->node); + if (ret) + goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + + pr_debug("using %s compressor\n", pool->tfm_name); + + /* being the current pool takes 1 ref; this func expects the + * caller to always add the new pool as the current pool + */ + kref_init(&pool->kref); + INIT_LIST_HEAD(&pool->list); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); + INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); + + zswap_pool_debug("created", pool); + + return pool; + +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); +error: + if (pool->acomp_ctx) + free_percpu(pool->acomp_ctx); + while (i--) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); + return NULL; +} + +static struct zswap_pool *__zswap_pool_create_fallback(void) +{ + bool has_comp, has_zpool; + + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + if (!has_comp && strcmp(zswap_compressor, + CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("compressor %s not 
available, using default %s\n", + zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); + param_free_charp(&zswap_compressor); + zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + } + if (!has_comp) { + pr_err("default compressor %s not available\n", + zswap_compressor); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_PARAM_UNSET; + } + + has_zpool = zpool_has_pool(zswap_zpool_type); + if (!has_zpool && strcmp(zswap_zpool_type, + CONFIG_ZSWAP_ZPOOL_DEFAULT)) { + pr_err("zpool %s not available, using default %s\n", + zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; + has_zpool = zpool_has_pool(zswap_zpool_type); + } + if (!has_zpool) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_PARAM_UNSET; + } + + if (!has_comp || !has_zpool) + return NULL; + + return zswap_pool_create(zswap_zpool_type, zswap_compressor); +} + +static void zswap_pool_destroy(struct zswap_pool *pool) +{ + int i; + + zswap_pool_debug("destroying", pool); + + shrinker_free(pool->shrinker); + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + free_percpu(pool->acomp_ctx); + list_lru_destroy(&pool->list_lru); + + spin_lock(&zswap_pools_lock); + mem_cgroup_iter_break(NULL, pool->next_shrink); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -969,151 +1121,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -static struct zswap_pool *zswap_pool_create(char *type, char *compressor) -{ - int i; - struct zswap_pool *pool; - char name[38]; /* 'zswap' + 32 char (max) num + \0 */ - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; - - if (!zswap_has_pool) { - /* if either are unset, pool initialization failed, and we - * need both params to be set correctly before trying to - * create a pool. 
- */ - if (!strcmp(type, ZSWAP_PARAM_UNSET)) - return NULL; - if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) - return NULL; - } - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) - return NULL; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", - atomic_inc_return(&zswap_pools_count)); - - pool->zpools[i] = zpool_create_pool(type, name, gfp); - if (!pool->zpools[i]) { - pr_err("%s zpool not available\n", type); - goto error; - } - } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); - - strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); - if (!pool->acomp_ctx) { - pr_err("percpu alloc failed\n"); - goto error; - } - - ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, - &pool->node); - if (ret) - goto error; - - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - - /* being the current pool takes 1 ref; this func expects the - * caller to always add the new pool as the current pool - */ - kref_init(&pool->kref); - INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); - - zswap_pool_debug("created", pool); - - return pool; - -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); -error: - if (pool->acomp_ctx) - free_percpu(pool->acomp_ctx); - while (i--) - zpool_destroy_pool(pool->zpools[i]); - kfree(pool); - return NULL; -} - -static struct zswap_pool *__zswap_pool_create_fallback(void) -{ - bool has_comp, has_zpool; - - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, - CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { - pr_err("compressor %s not available, using default %s\n", - zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); - param_free_charp(&zswap_compressor); - zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - } - if (!has_comp) { - pr_err("default compressor %s not available\n", - zswap_compressor); - param_free_charp(&zswap_compressor); - zswap_compressor = ZSWAP_PARAM_UNSET; - } - - has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, - CONFIG_ZSWAP_ZPOOL_DEFAULT)) { - pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; - has_zpool = zpool_has_pool(zswap_zpool_type); - } - if (!has_zpool) { - pr_err("default zpool %s not available\n", - zswap_zpool_type); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_PARAM_UNSET; - } - - if (!has_comp || !has_zpool) - return NULL; - - return zswap_pool_create(zswap_zpool_type, zswap_compressor); -} - -static void zswap_pool_destroy(struct zswap_pool *pool) -{ - int i; - - zswap_pool_debug("destroying", pool); - - shrinker_free(pool->shrinker); - cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); - free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - zpool_destroy_pool(pool->zpools[i]); - 
kfree(pool); -} - static int __must_check zswap_pool_get(struct zswap_pool *pool) { if (!pool) -- Gitee From 22c2c5393062574186d3944ae12e71c55bd47ff1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:47 -0500 Subject: [PATCH 175/484] mm: zswap: function ordering: pool refcounting commit 39f3ec8eaa6065873e592857627f18ae17b6c878 upstream Conflicts: none Backport-reason: ZSWAP mass update Move pool refcounting functions into the pool section. First the destroy functions, then the get and put which uses them. __zswap_pool_empty() has an upward reference to the global zswap_pools, to sanity check it's not the currently active pool that's being freed. That gets the forward decl for zswap_pool_current(). This puts the get and put function above all callers, so kill the forward decls as well. Link: https://lkml.kernel.org/r/20240130014208.565554-12-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 94 +++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index a01a34fa2254..1965b75367e6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -278,8 +278,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry); -static int zswap_pool_get(struct zswap_pool *pool); -static void zswap_pool_put(struct zswap_pool *pool); static bool zswap_is_full(void) { @@ -472,6 +470,53 @@ static void zswap_pool_destroy(struct zswap_pool *pool) kfree(pool); } +static void __zswap_pool_release(struct work_struct *work) +{ + struct zswap_pool *pool = container_of(work, typeof(*pool), + release_work); + + synchronize_rcu(); + + /* nobody should have been able to get a kref... */ + WARN_ON(kref_get_unless_zero(&pool->kref)); + + /* pool is now off zswap_pools list and has no references. */ + zswap_pool_destroy(pool); +} + +static struct zswap_pool *zswap_pool_current(void); + +static void __zswap_pool_empty(struct kref *kref) +{ + struct zswap_pool *pool; + + pool = container_of(kref, typeof(*pool), kref); + + spin_lock(&zswap_pools_lock); + + WARN_ON(pool == zswap_pool_current()); + + list_del_rcu(&pool->list); + + INIT_WORK(&pool->release_work, __zswap_pool_release); + schedule_work(&pool->release_work); + + spin_unlock(&zswap_pools_lock); +} + +static int __must_check zswap_pool_get(struct zswap_pool *pool) +{ + if (!pool) + return 0; + + return kref_get_unless_zero(&pool->kref); +} + +static void zswap_pool_put(struct zswap_pool *pool) +{ + kref_put(&pool->kref, __zswap_pool_empty); +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -1121,51 +1166,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -static int __must_check zswap_pool_get(struct zswap_pool *pool) -{ - if (!pool) - return 0; - - return kref_get_unless_zero(&pool->kref); -} - -static void __zswap_pool_release(struct work_struct *work) -{ - struct zswap_pool *pool = container_of(work, typeof(*pool), - release_work); - - synchronize_rcu(); - - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); - - /* pool is now off zswap_pools list and has no references.
*/ - zswap_pool_destroy(pool); -} - -static void __zswap_pool_empty(struct kref *kref) -{ - struct zswap_pool *pool; - - pool = container_of(kref, typeof(*pool), kref); - - spin_lock(&zswap_pools_lock); - - WARN_ON(pool == zswap_pool_current()); - - list_del_rcu(&pool->list); - - INIT_WORK(&pool->release_work, __zswap_pool_release); - schedule_work(&pool->release_work); - - spin_unlock(&zswap_pools_lock); -} - -static void zswap_pool_put(struct zswap_pool *pool) -{ - kref_put(&pool->kref, __zswap_pool_empty); -} - /********************************* * param callbacks **********************************/ -- Gitee From 01b1bbe573c01f5c313d11803394d155fed44402 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:48 -0500 Subject: [PATCH 176/484] mm: zswap: function ordering: zswap_pools commit c1a0ecb82bdc6e9e972b0adbd74eb56ff33776f9 upstream Conflicts: none Backport-reason: ZSWAP mass update Move the operations against the global zswap_pools list (current pool, last, find) to the pool section. Link: https://lkml.kernel.org/r/20240130014208.565554-13-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 150 ++++++++++++++++++++++++++--------------------------- 1 file changed, 73 insertions(+), 77 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 1965b75367e6..621357f2c61f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -517,6 +517,79 @@ static void zswap_pool_put(struct zswap_pool *pool) kref_put(&pool->kref, __zswap_pool_empty); } +static struct zswap_pool *__zswap_pool_current(void) +{ + struct zswap_pool *pool; + + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + + return pool; +} + +static struct zswap_pool *zswap_pool_current(void) +{ + assert_spin_locked(&zswap_pools_lock); + + return __zswap_pool_current(); +} + +static struct zswap_pool *zswap_pool_current_get(void) +{ + struct zswap_pool *pool; + + rcu_read_lock(); + + pool = __zswap_pool_current(); + if (!zswap_pool_get(pool)) + pool = NULL; + + rcu_read_unlock(); + + return pool; +} + +static struct zswap_pool *zswap_pool_last_get(void) +{ + struct zswap_pool *pool, *last = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(pool, &zswap_pools, list) + last = pool; + WARN_ONCE(!last && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + if (!zswap_pool_get(last)) + last = NULL; + + rcu_read_unlock(); + + return last; +} + +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +{ + struct zswap_pool *pool; + + assert_spin_locked(&zswap_pools_lock); + + list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -937,83 +1010,6 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -/********************************* -* pool functions -**********************************/ - -static struct zswap_pool *__zswap_pool_current(void) -{ - struct zswap_pool
*pool; - - pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); - WARN_ONCE(!pool && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - - return pool; -} - -static struct zswap_pool *zswap_pool_current(void) -{ - assert_spin_locked(&zswap_pools_lock); - - return __zswap_pool_current(); -} - -static struct zswap_pool *zswap_pool_current_get(void) -{ - struct zswap_pool *pool; - - rcu_read_lock(); - - pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) - pool = NULL; - - rcu_read_unlock(); - - return pool; -} - -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - -/* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) -{ - struct zswap_pool *pool; - - assert_spin_locked(&zswap_pools_lock); - - list_for_each_entry_rcu(pool, &zswap_pools, list) { - if (strcmp(pool->tfm_name, compressor)) - continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) - continue; - /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) - continue; - return pool; - } - - return NULL; -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { -- Gitee From 30346266b0d1f311fc3dbfd4ab47a60a4a22ecbd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:49 -0500 Subject: [PATCH 177/484] mm: zswap: function ordering: pool params commit abca07c04aa56f9eb41056f4d25aaf029afb00a3 upstream Conflicts: none Backport-reason: ZSWAP mass udpate Patch series "mm: zswap: cleanups". Cleanups and maintenance items that accumulated while reviewing zswap patches. This patch (of 20): The parameters primarily control pool attributes. Move those operations up to the pool section. Link: https://lkml.kernel.org/r/20240130014208.565554-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20240130014208.565554-14-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 312 ++++++++++++++++++++++++++--------------------------- 1 file changed, 156 insertions(+), 156 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 621357f2c61f..9a4af947dad8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -590,6 +590,162 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +/********************************* +* param callbacks +**********************************/ + +static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) +{ + /* no change required */ + if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) + return false; + return true; +} + +/* val must be a null-terminated string */ +static int __zswap_param_set(const char *val, const struct kernel_param *kp, + char *type, char *compressor) +{ + struct zswap_pool *pool, *put_pool = NULL; + char *s = strstrip((char *)val); + int ret = 0; + bool new_pool = false; + + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + /* if this is load-time (pre-init) param setting, + * don't create a pool; that's done during init. 
+ */ + ret = param_set_charp(s, kp); + break; + case ZSWAP_INIT_SUCCEED: + new_pool = zswap_pool_changed(s, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't set param, initialization failed\n"); + ret = -ENODEV; + } + mutex_unlock(&zswap_init_lock); + + /* no need to create a new pool, return directly */ + if (!new_pool) + return ret; + + if (!type) { + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); + return -ENOENT; + } + type = s; + } else if (!compressor) { + if (!crypto_has_acomp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; + } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; + } + + spin_lock(&zswap_pools_lock); + + pool = zswap_pool_find_get(type, compressor); + if (pool) { + zswap_pool_debug("using existing", pool); + WARN_ON(pool == zswap_pool_current()); + list_del_rcu(&pool->list); + } + + spin_unlock(&zswap_pools_lock); + + if (!pool) + pool = zswap_pool_create(type, compressor); + + if (pool) + ret = param_set_charp(s, kp); + else + ret = -EINVAL; + + spin_lock(&zswap_pools_lock); + + if (!ret) { + put_pool = zswap_pool_current(); + list_add_rcu(&pool->list, &zswap_pools); + zswap_has_pool = true; + } else if (pool) { + /* add the possibly pre-existing pool to the end of the pools + * list; if it's new (and empty) then it'll be removed and + * destroyed by the put after we drop the lock + */ + list_add_tail_rcu(&pool->list, &zswap_pools); + put_pool = pool; + } + + spin_unlock(&zswap_pools_lock); + + if (!zswap_has_pool && !pool) { + /* if initial pool creation failed, and this pool creation also + * failed, maybe both compressor and zpool params were bad. + * Allow changing this param, so pool creation will succeed + * when the other param is changed. We already verified this + * param is ok in the zpool_has_pool() or crypto_has_acomp() + * checks above. + */ + ret = param_set_charp(s, kp); + } + + /* drop the ref from either the old current pool, + * or the new pool we failed to add + */ + if (put_pool) + zswap_pool_put(put_pool); + + return ret; +} + +static int zswap_compressor_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, zswap_zpool_type, NULL); +} + +static int zswap_zpool_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, NULL, zswap_compressor); +} + +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) +{ + int ret = -ENODEV; + + /* if this is load-time (pre-init) param setting, only set param. 
*/ + if (system_state != SYSTEM_RUNNING) + return param_set_bool(val, kp); + + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + if (zswap_setup()) + break; + fallthrough; + case ZSWAP_INIT_SUCCEED: + if (!zswap_has_pool) + pr_err("can't enable, no pool configured\n"); + else + ret = param_set_bool(val, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't enable, initialization failed\n"); + } + mutex_unlock(&zswap_init_lock); + + return ret; +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -1162,162 +1318,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -/********************************* -* param callbacks -**********************************/ - -static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) -{ - /* no change required */ - if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) - return false; - return true; -} - -/* val must be a null-terminated string */ -static int __zswap_param_set(const char *val, const struct kernel_param *kp, - char *type, char *compressor) -{ - struct zswap_pool *pool, *put_pool = NULL; - char *s = strstrip((char *)val); - int ret = 0; - bool new_pool = false; - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - /* if this is load-time (pre-init) param setting, - * don't create a pool; that's done during init. - */ - ret = param_set_charp(s, kp); - break; - case ZSWAP_INIT_SUCCEED: - new_pool = zswap_pool_changed(s, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't set param, initialization failed\n"); - ret = -ENODEV; - } - mutex_unlock(&zswap_init_lock); - - /* no need to create a new pool, return directly */ - if (!new_pool) - return ret; - - if (!type) { - if (!zpool_has_pool(s)) { - pr_err("zpool %s not available\n", s); - return -ENOENT; - } - type = s; - } else if (!compressor) { - if (!crypto_has_acomp(s, 0, 0)) { - pr_err("compressor %s not available\n", s); - return -ENOENT; - } - compressor = s; - } else { - WARN_ON(1); - return -EINVAL; - } - - spin_lock(&zswap_pools_lock); - - pool = zswap_pool_find_get(type, compressor); - if (pool) { - zswap_pool_debug("using existing", pool); - WARN_ON(pool == zswap_pool_current()); - list_del_rcu(&pool->list); - } - - spin_unlock(&zswap_pools_lock); - - if (!pool) - pool = zswap_pool_create(type, compressor); - - if (pool) - ret = param_set_charp(s, kp); - else - ret = -EINVAL; - - spin_lock(&zswap_pools_lock); - - if (!ret) { - put_pool = zswap_pool_current(); - list_add_rcu(&pool->list, &zswap_pools); - zswap_has_pool = true; - } else if (pool) { - /* add the possibly pre-existing pool to the end of the pools - * list; if it's new (and empty) then it'll be removed and - * destroyed by the put after we drop the lock - */ - list_add_tail_rcu(&pool->list, &zswap_pools); - put_pool = pool; - } - - spin_unlock(&zswap_pools_lock); - - if (!zswap_has_pool && !pool) { - /* if initial pool creation failed, and this pool creation also - * failed, maybe both compressor and zpool params were bad. - * Allow changing this param, so pool creation will succeed - * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_acomp() - * checks above. 
- */ - ret = param_set_charp(s, kp); - } - - /* drop the ref from either the old current pool, - * or the new pool we failed to add - */ - if (put_pool) - zswap_pool_put(put_pool); - - return ret; -} - -static int zswap_compressor_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, zswap_zpool_type, NULL); -} - -static int zswap_zpool_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, NULL, zswap_compressor); -} - -static int zswap_enabled_param_set(const char *val, - const struct kernel_param *kp) -{ - int ret = -ENODEV; - - /* if this is load-time (pre-init) param setting, only set param. */ - if (system_state != SYSTEM_RUNNING) - return param_set_bool(val, kp); - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - if (zswap_setup()) - break; - fallthrough; - case ZSWAP_INIT_SUCCEED: - if (!zswap_has_pool) - pr_err("can't enable, no pool configured\n"); - else - ret = param_set_bool(val, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't enable, initialization failed\n"); - } - mutex_unlock(&zswap_init_lock); - - return ret; -} - static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; -- Gitee From 3a27345007be6d6a7c7ab14cc54427a21fdf7215 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:50 -0500 Subject: [PATCH 178/484] mm: zswap: function ordering: public lru api commit 506a86c5e2217cfb1884ea4806dc74c82d058733 upstream Conflicts: none Backport-reason: ZSWAP mass udpate The zswap entry section sits awkwardly in the middle of LRU-related functions. Group the external LRU API functions first. Link: https://lkml.kernel.org/r/20240130014208.565554-15-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 9a4af947dad8..c58f6a4743ce 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -746,6 +746,10 @@ static int zswap_enabled_param_set(const char *val, return ret; } +/********************************* +* lru functions +**********************************/ + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -764,6 +768,21 @@ static inline int entry_to_nid(struct zswap_entry *entry) return page_to_nid(virt_to_page(entry)); } +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { struct zswap_pool *pool; @@ -798,23 +817,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } -/********************************* -* zswap lruvec functions -**********************************/ -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - lruvec = folio_lruvec(folio); - 
atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); -} - /********************************* * lru functions **********************************/ -- Gitee From 8a7f855ec7fc5aa639e0ccda21bdd845e0445f23 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:51 -0500 Subject: [PATCH 179/484] mm: zswap: function ordering: move entry sections out of LRU section commit 5182661a11baeda74a547660b11570f5b5124eaf upstream Conflicts: none Backport-reason: ZSWAP mass udpate This completes consolidation of the LRU section. Link: https://lkml.kernel.org/r/20240130014208.565554-16-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 101 ++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index c58f6a4743ce..5876cd2c3b4f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -768,58 +768,6 @@ static inline int entry_to_nid(struct zswap_entry *entry) return page_to_nid(virt_to_page(entry)); } -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - if (folio) { - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } -} - -void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) -{ - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); -} - -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - -/********************************* -* lru functions -**********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { atomic_long_t *nr_zswap_protected; @@ -872,6 +820,55 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) rcu_read_unlock(); } +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + + /* lock out zswap pools list modification */ + spin_lock(&zswap_pools_lock); + list_for_each_entry(pool, &zswap_pools, list) { + if (pool->next_shrink == memcg) + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + } + spin_unlock(&zswap_pools_lock); +} + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct 
zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + /********************************* * rbtree functions **********************************/ -- Gitee From a39938c2f83b6e1d13506b7d16d5c069a88ffcdc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:52 -0500 Subject: [PATCH 180/484] mm: zswap: function ordering: move entry section out of tree section commit 36034bf6fcdb1281e3f8db27d68b6516a3aed4c7 upstream Conflicts: none Backport-reason: ZSWAP mass udpate The higher-level entry operations modify the tree, so move the entry API after the tree section. Link: https://lkml.kernel.org/r/20240130014208.565554-17-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5876cd2c3b4f..e9d11f6a9c1b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -848,27 +848,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) spin_unlock(&zswap_pools_lock); } -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - /********************************* * rbtree functions **********************************/ @@ -930,6 +909,27 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) return false; } +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { int i = 0; -- Gitee From 8ebcc74a1b06d3faeeb61e3c8265b61d1345d29d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:53 -0500 Subject: [PATCH 181/484] mm: zswap: function ordering: compress & decompress functions commit f91e81d31c1ef7561559368f29e4bb6eddf13ea1 upstream Conflicts: none Backport-reason: ZSWAP mass udpate Writeback needs to decompress. Move the (de)compression API above what will be the consolidated shrinking/writeback code. 
Link: https://lkml.kernel.org/r/20240130014208.565554-18-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 207 +++++++++++++++++++++++++++-------------------------- 1 file changed, 105 insertions(+), 102 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index e9d11f6a9c1b..1e2c7250a9dc 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -992,6 +992,111 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, zswap_entry_put(entry); } +/********************************* +* compressed storage functions +**********************************/ +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + int ret; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. 
+ */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { + zswap_reject_compress_fail++; + goto unlock; + } + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto unlock; + } + if (ret) { + zswap_reject_alloc_fail++; + goto unlock; + } + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + mutex_unlock(&acomp_ctx->mutex); + return ret == 0; +} + +static void zswap_decompress(struct zswap_entry *entry, struct page *page) +{ + struct zpool *zpool = zswap_find_zpool(entry); + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + u8 *src; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); + if (!zpool_can_sleep_mapped(zpool)) { + memcpy(acomp_ctx->buffer, src, entry->length); + src = acomp_ctx->buffer; + zpool_unmap_handle(zpool, entry->handle); + } + + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); + BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); + BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); + mutex_unlock(&acomp_ctx->mutex); + + if (zpool_can_sleep_mapped(zpool)) + zpool_unmap_handle(zpool, entry->handle); +} + /********************************* * shrinker functions **********************************/ @@ -1317,108 +1422,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) -{ - struct crypto_acomp_ctx *acomp_ctx; - struct scatterlist input, output; - unsigned int dlen = PAGE_SIZE; - unsigned long handle; - struct zpool *zpool; - char *buf; - gfp_t gfp; - int ret; - u8 *dst; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); - - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. 
- */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - if (ret) { - zswap_reject_compress_fail++; - goto unlock; - } - - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto unlock; - } - if (ret) { - zswap_reject_alloc_fail++; - goto unlock; - } - - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - - entry->handle = handle; - entry->length = dlen; - -unlock: - mutex_unlock(&acomp_ctx->mutex); - return ret == 0; -} - -static void zswap_decompress(struct zswap_entry *entry, struct page *page) -{ - struct zpool *zpool = zswap_find_zpool(entry); - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; - u8 *src; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { - memcpy(acomp_ctx->buffer, src, entry->length); - src = acomp_ctx->buffer; - zpool_unmap_handle(zpool, entry->handle); - } - - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(&acomp_ctx->mutex); - - if (zpool_can_sleep_mapped(zpool)) - zpool_unmap_handle(zpool, entry->handle); -} - /********************************* * writeback code **********************************/ -- Gitee From 15e1cc2bef96b3aaca7154cd48563b30079c1227 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:54 -0500 Subject: [PATCH 182/484] mm: zswap: function ordering: per-cpu compression infra commit 64f200b8304c6b26fbbd7768365850315bd71bc0 upstream Conflicts: none Backport-reason: ZSWAP mass udpate The per-cpu compression init/exit callbacks are awkwardly in the middle of the shrinker code. Move them up to the compression section. 
Link: https://lkml.kernel.org/r/20240130014208.565554-19-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 135 ++++++++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 69 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 1e2c7250a9dc..006e9c5206da 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -995,6 +995,72 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, /********************************* * compressed storage functions **********************************/ +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + ret = PTR_ERR(acomp); + goto acomp_fail; + } + acomp_ctx->acomp = acomp; + + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + ret = -ENOMEM; + goto req_fail; + } + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wakeup + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * won't be called, crypto_wait_req() will return without blocking. 
+ */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; +} + +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); + } + + return 0; +} + static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; @@ -1201,75 +1267,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) pool->shrinker->seeks = DEFAULT_SEEKS; } -/********************************* -* per-cpu code -**********************************/ -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; - int ret; - - mutex_init(&acomp_ctx->mutex); - - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; - - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { - pr_err("could not alloc crypto acomp %s : %ld\n", - pool->tfm_name, PTR_ERR(acomp)); - ret = PTR_ERR(acomp); - goto acomp_fail; - } - acomp_ctx->acomp = acomp; - - req = acomp_request_alloc(acomp_ctx->acomp); - if (!req) { - pr_err("could not alloc crypto acomp_request %s\n", - pool->tfm_name); - ret = -ENOMEM; - goto req_fail; - } - acomp_ctx->req = req; - - crypto_init_wait(&acomp_ctx->wait); - /* - * if the backend of acomp is async zip, crypto_req_done() will wakeup - * crypto_wait_req(); if the backend of acomp is scomp, the callback - * won't be called, crypto_wait_req() will return without blocking. - */ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &acomp_ctx->wait); - - return 0; - -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); - return ret; -} - -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } - - return 0; -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { -- Gitee From d90e8440d48444b313e41eab623187a41b6fff9c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:55 -0500 Subject: [PATCH 183/484] mm: zswap: function ordering: writeback commit 9986d35d4ceb53962f1c3bc378e317b5ca841f73 upstream Conflicts: minor, writeback have return value and shrinked si refcount scope. Backport-reason: ZSWAP mass update Shrinking needs writeback. Naturally, move the writeback code above the shrinking code. Delete the forward decl. 
Link: https://lkml.kernel.org/r/20240130014208.565554-20-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 197 ++++++++++++++++++++++++++--------------------------- 1 file changed, 97 insertions(+), 100 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 006e9c5206da..a41a536ad683 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -276,9 +276,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static int zswap_writeback_entry(struct zswap_entry *entry, - swp_entry_t swpentry); - static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < @@ -1163,6 +1160,103 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) zpool_unmap_handle(zpool, entry->handle); } +/********************************* +* writeback code +**********************************/ +/* + * Attempts to free an entry by adding a folio to the swap cache, + * decompressing the entry data into the folio, and issuing a + * bio write to write the folio back to the swap device. + * + * This can be thought of as a "resumed writeback" of the folio + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the zswap_store() + * in the first place. After the folio has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. + */ +static int zswap_writeback_entry(struct zswap_entry *entry, + swp_entry_t swpentry) +{ + struct zswap_tree *tree; + struct folio *folio; + struct mempolicy *mpol; + bool folio_was_allocated; + struct swap_info_struct *si; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + int ret = 0; + + /* try to allocate swap cache folio */ + si = get_swap_device(swpentry); + if (!si) + return -EEXIST; + + mpol = get_task_policy(current); + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + put_swap_device(si); + if (!folio) + return -ENOMEM; + + /* + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. + */ + if (!folio_was_allocated) { + folio_put(folio); + return -EEXIST; + } + + /* + * folio is locked, and the swapcache is now secured against + * concurrent swapping to and from the slot. Verify that the + * swap entry hasn't been invalidated and recycled behind our + * backs (our zswap_entry reference doesn't prevent that), to + * avoid overwriting a new swap folio with old compressed data. + */ + tree = swap_zswap_tree(swpentry); + spin_lock(&tree->lock); + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { + spin_unlock(&tree->lock); + delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); + return -ENOMEM; + } + + /* Safe to deref entry after the entry is verified above. 
*/ + zswap_entry_get(entry); + spin_unlock(&tree->lock); + + zswap_decompress(entry, &folio->page); + + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + spin_lock(&tree->lock); + zswap_invalidate_entry(tree, entry); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + + /* folio is up to date */ + folio_mark_uptodate(folio); + + /* move it to the tail of the inactive list after end_writeback */ + folio_set_reclaim(folio); + + /* start writeback */ + ret = __swap_writepage(folio, &wbc); + folio_put(folio); + + return ret; +} + /********************************* * shrinker functions **********************************/ @@ -1419,103 +1513,6 @@ static void shrink_worker(struct work_struct *w) zswap_pool_put(pool); } -/********************************* -* writeback code -**********************************/ -/* - * Attempts to free an entry by adding a folio to the swap cache, - * decompressing the entry data into the folio, and issuing a - * bio write to write the folio back to the swap device. - * - * This can be thought of as a "resumed writeback" of the folio - * to the swap device. We are basically resuming the same swap - * writeback path that was intercepted with the zswap_store() - * in the first place. After the folio has been decompressed into - * the swap cache, the compressed version stored by zswap can be - * freed. - */ -static int zswap_writeback_entry(struct zswap_entry *entry, - swp_entry_t swpentry) -{ - int ret; - struct zswap_tree *tree; - struct folio *folio; - struct mempolicy *mpol; - bool folio_was_allocated; - struct swap_info_struct *si; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; - - /* try to allocate swap cache folio */ - si = get_swap_device(swpentry); - if (!si) - return -EEXIST; - - mpol = get_task_policy(current); - folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); - put_swap_device(si); - if (!folio) - return -ENOMEM; - - /* - * Found an existing folio, we raced with swapin or concurrent - * shrinker. We generally writeback cold folios from zswap, and - * swapin means the folio just became hot, so skip this folio. - * For unlikely concurrent shrinker case, it will be unlinked - * and freed when invalidated by the concurrent shrinker anyway. - */ - if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; - } - - /* - * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. - */ - tree = swap_zswap_tree(swpentry); - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { - spin_unlock(&tree->lock); - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; - } - - /* Safe to deref entry after the entry is verified above. 
*/ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - zswap_decompress(entry, &folio->page); - - count_vm_event(ZSWPWB); - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); - - /* folio is up to date */ - folio_mark_uptodate(folio); - - /* move it to the tail of the inactive list after end_writeback */ - folio_set_reclaim(folio); - - /* start writeback */ - ret = __swap_writepage(folio, &wbc); - folio_put(folio); - - return ret; -} - static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned long *page; -- Gitee From afb0229ebdb6c25cb8a5fc15f1568a2d6636bbb3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:56 -0500 Subject: [PATCH 184/484] mm: zswap: function ordering: shrink_memcg_cb commit eb23ee4f96937ca47fe6c88151d998551642a772 upstream Conflicts: none Backport-reason: ZSWAP mass udpate shrink_memcg_cb() is called by the shrinker and is based on zswap_writeback_entry(). Move it in between. Save one fwd decl. Link: https://lkml.kernel.org/r/20240130014208.565554-21-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 125 ++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index a41a536ad683..889901862fc8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1261,7 +1261,67 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * shrinker functions **********************************/ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg); + spinlock_t *lock, void *arg) +{ + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; + swp_entry_t swpentry; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; + + /* + * Rotate the entry to the tail before unlocking the LRU, + * so that in case of an invalidation race concurrent + * reclaimers don't waste their time on it. + * + * If writeback succeeds, or failure is due to the entry + * being invalidated by the swap subsystem, the invalidation + * will unlink and free it. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + * + * But since they do exist in theory, the entry cannot just + * be unlinked, or we could leak it. Hence, rotate. + */ + list_move_tail(item, &l->list); + + /* + * Once the lru lock is dropped, the entry might get freed. The + * swpentry is copied to the stack, and entry isn't deref'd again + * until the entry is verified to still be alive in the tree. + */ + swpentry = entry->swpentry; + + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. + */ + spin_unlock(lock); + + writeback_result = zswap_writeback_entry(entry, swpentry); + + if (writeback_result) { + zswap_reject_reclaim_fail++; + ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). 
+ */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) + *encountered_page_in_swapcache = true; + } else { + zswap_written_back_pages++; + } + + spin_lock(lock); + return ret; +} static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) @@ -1361,69 +1421,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) pool->shrinker->seeks = DEFAULT_SEEKS; } -static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg) -{ - struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); - bool *encountered_page_in_swapcache = (bool *)arg; - swp_entry_t swpentry; - enum lru_status ret = LRU_REMOVED_RETRY; - int writeback_result; - - /* - * Rotate the entry to the tail before unlocking the LRU, - * so that in case of an invalidation race concurrent - * reclaimers don't waste their time on it. - * - * If writeback succeeds, or failure is due to the entry - * being invalidated by the swap subsystem, the invalidation - * will unlink and free it. - * - * Temporary failures, where the same entry should be tried - * again immediately, almost never happen for this shrinker. - * We don't do any trylocking; -ENOMEM comes closest, - * but that's extremely rare and doesn't happen spuriously - * either. Don't bother distinguishing this case. - * - * But since they do exist in theory, the entry cannot just - * be unlinked, or we could leak it. Hence, rotate. - */ - list_move_tail(item, &l->list); - - /* - * Once the lru lock is dropped, the entry might get freed. The - * swpentry is copied to the stack, and entry isn't deref'd again - * until the entry is verified to still be alive in the tree. - */ - swpentry = entry->swpentry; - - /* - * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. - */ - spin_unlock(lock); - - writeback_result = zswap_writeback_entry(entry, swpentry); - - if (writeback_result) { - zswap_reject_reclaim_fail++; - ret = LRU_RETRY; - - /* - * Encountering a page already in swap cache is a sign that we are shrinking - * into the warmer region. We should terminate shrinking (if we're in the dynamic - * shrinker context). - */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) - *encountered_page_in_swapcache = true; - } else { - zswap_written_back_pages++; - } - - spin_lock(lock); - return ret; -} - static int shrink_memcg(struct mem_cgroup *memcg) { struct zswap_pool *pool; -- Gitee From f40510690534351c2b7af557ec79af099a5ab008 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:05:59 +0000 Subject: [PATCH 185/484] mm/zswap: add more comments in shrink_memcg_cb() commit f9c0f1c32cb568e16ef0676d8e7827a3ad443742 upstream Conflicts: none Backport-reason: ZSWAP mass udpate Patch series "mm/zswap: optimize zswap lru list", v2. This series is motivated when observe the zswap lru list shrinking, noted there are some unexpected cases in zswap_writeback_entry(). bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' There are some -ENOMEM because when the swap entry is freed to per-cpu swap pool, it doesn't invalidate/drop zswap entry. Then the shrinker encounter these trashy zswap entries, it can't be reclaimed and return -ENOMEM. So move the invalidation ahead to when swap entry freed to the per-cpu swap pool, since there is no any benefit to leave trashy zswap entries on the zswap tree and lru list. 
Another case is -EEXIST, which is seen more in the case of !zswap_exclusive_loads_enabled, in which case the swapin folio will leave compressed copy on the tree and lru list. And it can't be reclaimed until the folio is removed from swapcache. Changing to zswap_exclusive_loads_enabled mode will invalidate when folio swapin, which has its own drawback if that folio is still clean in swapcache and swapout again, we need to compress it again. Please see the commit for details on why we choose exclusive load as the default for zswap. Another optimization for -EEXIST is that we add LRU_STOP to support terminating the shrinking process to avoid evicting warmer region. Testing using kernel build in tmpfs, one 50GB swapfile and zswap shrinker_enabled, with memory.max set to 2GB. mm-unstable zswap-optimize real 63.90s 63.25s user 1064.05s 1063.40s sys 292.32s 270.94s The main optimization is in sys cpu, about 7% improvement. This patch (of 6): Add more comments in shrink_memcg_cb() to describe the deref dance which is implemented to fix race problem between lru writeback and swapoff, and the reason why we rotate the entry at the beginning. Also fix the stale comments in zswap_writeback_entry(), and add more comments to state that we only deref the tree after we get the swapcache reference. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-0-99d4084260a0@bytedance.com Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-1-99d4084260a0@bytedance.com Signed-off-by: Johannes Weiner Signed-off-by: Chengming Zhou Suggested-by: Yosry Ahmed Suggested-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 889901862fc8..9c5afd2cbf99 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1214,10 +1214,12 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. + * concurrent swapping to and from the slot, and concurrent + * swapoff so we can safely dereference the zswap tree here. + * Verify that the swap entry hasn't been invalidated and recycled + * behind our backs, to avoid overwriting a new swap folio with + * old compressed data. Only when this is successful can the entry + * be dereferenced. */ tree = swap_zswap_tree(swpentry); spin_lock(&tree->lock); @@ -1270,22 +1272,29 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o int writeback_result; /* - * Rotate the entry to the tail before unlocking the LRU, - * so that in case of an invalidation race concurrent - * reclaimers don't waste their time on it. + * As soon as we drop the LRU lock, the entry can be freed by + * a concurrent invalidation. This means the following: * - * If writeback succeeds, or failure is due to the entry - * being invalidated by the swap subsystem, the invalidation - * will unlink and free it. + * 1. We extract the swp_entry_t to the stack, allowing + * zswap_writeback_entry() to pin the swap entry and + * then validate the zwap entry against that swap entry's + * tree using pointer value comparison. 
Only when that + * is successful can the entry be dereferenced. * - * Temporary failures, where the same entry should be tried - * again immediately, almost never happen for this shrinker. - * We don't do any trylocking; -ENOMEM comes closest, - * but that's extremely rare and doesn't happen spuriously - * either. Don't bother distinguishing this case. + * 2. Usually, objects are taken off the LRU for reclaim. In + * this case this isn't possible, because if reclaim fails + * for whatever reason, we have no means of knowing if the + * entry is alive to put it back on the LRU. * - * But since they do exist in theory, the entry cannot just - * be unlinked, or we could leak it. Hence, rotate. + * So rotate it before dropping the lock. If the entry is + * written back or invalidated, the free path will unlink + * it. For failures, rotation is the right thing as well. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. */ list_move_tail(item, &l->list); -- Gitee From bcc2fc03403d40f1d4fa14b9caca6f0200124a2d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:00 +0000 Subject: [PATCH 186/484] mm/zswap: invalidate zswap entry when swap entry free commit 0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 upstream Conflicts: major, we don't have swap_slot.c Backport-reason: ZSWAP mass update During testing I found there are some times the zswap_writeback_entry() return -ENOMEM, which is not we expected: bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' @[-12]: 1563 @[0]: 277221 The reason is that __read_swap_cache_async() return NULL because swapcache_prepare() failed. The reason is that we won't invalidate zswap entry when swap entry freed to the per-cpu pool, these zswap entries are still on the zswap tree and lru list. This patch moves the invalidation ahead to when swap entry freed to the per-cpu pool, since there is no any benefit to leave trashy zswap entry on the tree and lru list. With this patch: bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' @[0]: 259744 Note: large folio can't have zswap entry for now, so don't bother to add zswap entry invalidation in the large folio swap free path. 
Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-2-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zswap.h | 4 ++-- mm/swapfile.c | 5 ++++- mm/zswap.c | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 91895ce1fdbc..341aea490070 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -29,7 +29,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); -void zswap_invalidate(int type, pgoff_t offset); +void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); @@ -50,7 +50,7 @@ static inline bool zswap_load(struct folio *folio) return false; } -static inline void zswap_invalidate(int type, pgoff_t offset) {} +static inline void zswap_invalidate(swp_entry_t swp) {} static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; diff --git a/mm/swapfile.c b/mm/swapfile.c index fcbaa1bbc747..a76bb64789f9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1103,6 +1103,10 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); + unsigned int i; + + for (i = 0; i < nr_entries; i++) + zswap_invalidate(swp_entry(si->type, offset + i)); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = @@ -1111,7 +1115,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); - zswap_invalidate(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; diff --git a/mm/zswap.c b/mm/zswap.c index 9c5afd2cbf99..8933a21faf1d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1746,9 +1746,10 @@ bool zswap_load(struct folio *folio) return true; } -void zswap_invalidate(int type, pgoff_t offset) +void zswap_invalidate(swp_entry_t swp) { - struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); + pgoff_t offset = swp_offset(swp); + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; spin_lock(&tree->lock); -- Gitee From f47e6fafa91b925e3987523c529c2548f278c1a5 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:01 +0000 Subject: [PATCH 187/484] mm/zswap: stop lru list shrinking when encounter warm region commit b49547ade38a63ff39c9fbc53fb38622cb63854a upstream Conflicts: none Backport-reason: ZSWAP mass udpate When the shrinker encounter an existing folio in swap cache, it means we are shrinking into the warmer region. We should terminate shrinking if we're in the dynamic shrinker context. This patch add LRU_STOP to support this, to avoid overshrinking. 
Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-3-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Reviewed-by: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/list_lru.h | 2 ++ mm/list_lru.c | 3 +++ mm/zswap.c | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 9e184364a656..b5eb53b2f72b 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -24,6 +24,8 @@ enum lru_status { LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ + LRU_STOP, /* stop lru list walking. May drop the lock + internally, but has to return locked. */ }; struct list_lru_one { diff --git a/mm/list_lru.c b/mm/list_lru.c index b8910e99d986..61d5707d8f13 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -274,6 +274,9 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, */ assert_spin_locked(&nlru->lock); goto restart; + case LRU_STOP: + assert_spin_locked(&nlru->lock); + goto out; default: BUG(); } diff --git a/mm/zswap.c b/mm/zswap.c index 8933a21faf1d..9bbf44eeca91 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1322,8 +1322,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_STOP; *encountered_page_in_swapcache = true; + } } else { zswap_written_back_pages++; } -- Gitee From 349fa5f2b6f875854a10bc5c403c08bb0471ef03 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:02 +0000 Subject: [PATCH 188/484] mm/zswap: remove duplicate_entry debug value commit 3b631bd06550040b37cd2ebb00e2374404db0360 upstream Conflicts: none Backport-reason: ZSWAP mass udpate cat /sys/kernel/debug/zswap/duplicate_entry 2086447 When testing, the duplicate_entry value is very high, but no warning message in the kernel log. From the comment of duplicate_entry "Duplicate store was encountered (rare)", it seems something goes wrong. Actually it's incremented in the beginning of zswap_store(), which found its zswap entry has already on the tree. And this is a normal case, since the folio could leave zswap entry on the tree after swapin, later it's dirtied and swapout/zswap_store again, found its original zswap entry. So duplicate_entry should be only incremented in the real bug case, which already have "WARN_ON(1)", it looks redundant to count bug case, so this patch just remove it. 
Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-4-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 9bbf44eeca91..1bb1922d8949 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -71,8 +71,6 @@ static u64 zswap_reject_compress_poor; static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; -/* Duplicate store was encountered (rare) */ -static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; @@ -1575,10 +1573,8 @@ bool zswap_store(struct folio *folio) */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); - if (entry) { + if (entry) zswap_invalidate_entry(tree, entry); - zswap_duplicate_entry++; - } spin_unlock(&tree->lock); if (!zswap_enabled) @@ -1669,7 +1665,6 @@ bool zswap_store(struct folio *folio) */ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { WARN_ON(1); - zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } if (entry->length) { @@ -1830,8 +1825,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("duplicate_entry", 0444, - zswap_debugfs_root, &zswap_duplicate_entry); debugfs_create_u64("pool_total_size", 0444, zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", 0444, -- Gitee From aa7022b566ad82f9f8f4b39d185605ee8f5f5c98 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:03 +0000 Subject: [PATCH 189/484] mm/zswap: only support zswap_exclusive_loads_enabled commit c2e2ba770200b379069011a1fdeeb41e4569c486 upstream Conflicts: none Backport-reason: ZSWAP mass udpate The !zswap_exclusive_loads_enabled mode will leave compressed copy in the zswap tree and lru list after the folio swapin. There are some disadvantages in this mode: 1. It's a waste of memory since there are two copies of data, one is folio, the other one is compressed data in zswap. And it's unlikely the compressed data is useful in the near future. 2. If that folio is dirtied, the compressed data must be not useful, but we don't know and don't invalidate the trashy memory in zswap. 3. It's not reclaimable from zswap shrinker since zswap_writeback_entry() will always return -EEXIST and terminate the shrinking process. On the other hand, the only downside of zswap_exclusive_loads_enabled is a little more cpu usage/latency when compression, and the same if the folio is removed from swapcache or dirtied. More explanation by Johannes on why we should consider exclusive load as the default for zswap: Caching "swapout work" is helpful when the system is thrashing. Then recently swapped in pages might get swapped out again very soon. It certainly makes sense with conventional swap, because keeping a clean copy on the disk saves IO work and doesn't cost any additional memory. But with zswap, it's different. It saves some compression work on a thrashing page. But the act of keeping compressed memory contributes to a higher rate of thrashing. And that can cause IO in other places like zswap writeback and file memory. 
And the A/B test results of the kernel build in tmpfs with limited memory can support this theory: !exclusive exclusive real 63.80 63.01 user 1063.83 1061.32 sys 290.31 266.15 workingset_refault_anon 2383084.40 1976397.40 workingset_refault_file 44134.00 45689.40 workingset_activate_anon 837878.00 728441.20 workingset_activate_file 4710.00 4085.20 workingset_restore_anon 732622.60 639428.40 workingset_restore_file 1007.00 926.80 workingset_nodereclaim 0.00 0.00 pgscan 14343003.40 12409570.20 pgscan_kswapd 0.00 0.00 pgscan_direct 14343003.40 12409570.20 pgscan_khugepaged 0.00 0.00 Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-5-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/Kconfig | 16 ---------------- mm/zswap.c | 14 +++----------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 416cc1b98168..d388868d5afe 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -45,22 +45,6 @@ config ZSWAP_DEFAULT_ON The selection made here can be overridden by using the kernel command line 'zswap.enabled=' option. -config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON - bool "Invalidate zswap entries when pages are loaded" - depends on ZSWAP - help - If selected, exclusive loads for zswap will be enabled at boot, - otherwise it will be disabled. - - If exclusive loads are enabled, when a page is loaded from zswap, - the zswap entry is invalidated at once, as opposed to leaving it - in zswap until the swap entry is freed. - - This avoids having two copies of the same page in memory - (compressed and uncompressed) after faulting in a page from zswap. - The cost is that if the page was never dirtied and needs to be - swapped out again, it will be re-compressed. - config ZSWAP_SHRINKER_DEFAULT_ON bool "Shrink the zswap pool on memory pressure" depends on ZSWAP diff --git a/mm/zswap.c b/mm/zswap.c index 1bb1922d8949..7b0ad659ede6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -139,10 +139,6 @@ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); -static bool zswap_exclusive_loads_enabled = IS_ENABLED( - CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); -module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -1730,16 +1726,12 @@ bool zswap_load(struct folio *folio) count_objcg_event(entry->objcg, ZSWPIN); spin_lock(&tree->lock); - if (zswap_exclusive_loads_enabled) { - zswap_invalidate_entry(tree, entry); - folio_mark_dirty(folio); - } else if (entry->length) { - zswap_lru_del(&entry->pool->list_lru, entry); - zswap_lru_add(&entry->pool->list_lru, entry); - } + zswap_invalidate_entry(tree, entry); zswap_entry_put(entry); spin_unlock(&tree->lock); + folio_mark_dirty(folio); + return true; } -- Gitee From 2b222a4bbdd48dd18f3d8ccf073db21f1d25b3f8 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:04 +0000 Subject: [PATCH 190/484] mm/zswap: zswap entry doesn't need refcount anymore commit a230c20e63efef3daab510979898b25a0e446b36 upstream Conflicts: none Backport-reason: ZSWAP mass udpate Since we don't need to leave zswap entry on the zswap tree anymore, we should remove it from tree once we find it from the tree. 
Then after using it, we can directly free it, no concurrent path can find it from tree. Only the shrinker can see it from lru list, which will also double check under tree lock, so no race problem. So we don't need refcount in zswap entry anymore and don't need to take the spinlock for the second time to invalidate it. The side effect is that zswap_entry_free() maybe not happen in tree spinlock, but it's ok since nothing need to be protected by the lock. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-6-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 63 ++++++++++-------------------------------------------- 1 file changed, 11 insertions(+), 52 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7b0ad659ede6..7404cd36156f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -193,12 +193,6 @@ struct zswap_pool { * * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree - * refcount - the number of outstanding reference to the entry. This is needed - * to protect against premature freeing of the entry by code - * concurrent calls to load, invalidate, and writeback. The lock - * for the zswap_tree structure that contains the entry must - * be held while changing the refcount. Since the lock must - * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. @@ -211,7 +205,6 @@ struct zswap_pool { struct zswap_entry { struct rb_node rbnode; swp_entry_t swpentry; - int refcount; unsigned int length; struct zswap_pool *pool; union { @@ -222,11 +215,6 @@ struct zswap_entry { struct list_head lru; }; -/* - * The tree lock in the zswap_tree struct protects a few things: - * - the rbtree - * - the refcount field of each entry in the tree - */ struct zswap_tree { struct rb_root rbroot; spinlock_t lock; @@ -890,14 +878,10 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, return 0; } -static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) { - if (!RB_EMPTY_NODE(&entry->rbnode)) { - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); - return true; - } - return false; + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); } /********************************* @@ -911,7 +895,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; - entry->refcount = 1; RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -954,33 +937,15 @@ static void zswap_entry_free(struct zswap_entry *entry) zswap_update_total_size(); } -/* caller must hold the tree lock */ -static void zswap_entry_get(struct zswap_entry *entry) -{ - WARN_ON_ONCE(!entry->refcount); - entry->refcount++; -} - -/* caller must hold the tree lock */ -static void zswap_entry_put(struct zswap_entry *entry) -{ - WARN_ON_ONCE(!entry->refcount); - if (--entry->refcount == 0) { - WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_entry_free(entry); - } -} - /* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. 
This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. + * The caller hold the tree lock and search the entry from the tree, + * so it must be on the tree, remove it from the tree and free it. */ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(entry); + zswap_rb_erase(&tree->rbroot, entry); + zswap_entry_free(entry); } /********************************* @@ -1226,7 +1191,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* Safe to deref entry after the entry is verified above. */ - zswap_entry_get(entry); + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); zswap_decompress(entry, &folio->page); @@ -1235,10 +1200,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, if (entry->objcg) count_objcg_event(entry->objcg, ZSWPWB); - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); /* folio is up to date */ folio_mark_uptodate(folio); @@ -1710,7 +1672,7 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); return false; } - zswap_entry_get(entry); + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); if (entry->length) @@ -1725,10 +1687,7 @@ bool zswap_load(struct folio *folio) if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); folio_mark_dirty(folio); -- Gitee From ac3e524941977d7a49a044561705110ab466b913 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 9 Feb 2024 04:41:12 +0000 Subject: [PATCH 191/484] mm/zswap: optimize and cleanup the invalidation of duplicate entry commit f576a1e80c3a97ea26b436bc060df42cf6d19026 upstream Conflicts: none Backport-reason: ZSWAP mass udpate We may encounter duplicate entry in the zswap_store(): 1. swap slot that freed to per-cpu swap cache, doesn't invalidate the zswap entry, then got reused. This has been fixed. 2. !exclusive load mode, swapin folio will leave its zswap entry on the tree, then swapout again. This has been removed. 3. one folio can be dirtied again after zswap_store(), so need to zswap_store() again. This should be handled correctly. So we must invalidate the old duplicate entry before inserting the new one, which actually doesn't have to be done at the beginning of zswap_store(). The good point is that we don't need to lock the tree twice in the normal store success path. And cleanup the loop as we are here. Note we still need to invalidate the old duplicate entry when store failed or zswap is disabled , otherwise the new data in swapfile could be overwrite by the old data in zswap pool when lru writeback. 
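The store path thus ends up with an insert-or-replace idiom on the rbtree; condensed from the hunk below (the error and check_old paths are omitted here):

    spin_lock(&tree->lock);
    /*
     * The folio may have been dirtied again, invalidate the
     * possibly stale entry before inserting the new entry.
     */
    if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
            zswap_invalidate_entry(tree, dupentry);
            WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry));
    }
    spin_unlock(&tree->lock);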
Link: https://lkml.kernel.org/r/20240209044112.3883835-1-chengming.zhou@linux.dev Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Acked-by: Chris Li Acked-by: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7404cd36156f..fcabab8b2f29 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1524,19 +1524,8 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - /* - * If this is a duplicate, it must be removed before attempting to store - * it, otherwise, if the store fails the old page won't be removed from - * the tree, and it might be written back overriding the new data. - */ - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); - if (!zswap_enabled) - return false; + goto check_old; objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { @@ -1616,14 +1605,12 @@ bool zswap_store(struct folio *folio) /* map */ spin_lock(&tree->lock); /* - * A duplicate entry should have been removed at the beginning of this - * function. Since the swap entry should be pinned, if a duplicate is - * found again here it means that something went wrong in the swap - * cache. + * The folio may have been dirtied again, invalidate the + * possibly stale entry before inserting the new entry. */ - while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - WARN_ON(1); + if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { zswap_invalidate_entry(tree, dupentry); + WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); } if (entry->length) { INIT_LIST_HEAD(&entry->lru); @@ -1646,6 +1633,17 @@ bool zswap_store(struct folio *folio) reject: if (objcg) obj_cgroup_put(objcg); +check_old: + /* + * If the zswap store fails or zswap is disabled, we must invalidate the + * possibly stale entry which was previously stored at this offset. + * Otherwise, writeback could overwrite the new data in the swapfile. + */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (entry) + zswap_invalidate_entry(tree, entry); + spin_unlock(&tree->lock); return false; shrink: -- Gitee From 4cb61546040372ff9ee4686fac5b074a41ac52f8 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 20 Feb 2024 10:19:35 +1300 Subject: [PATCH 192/484] mm: zswap: increase reject_compress_poor but not reject_compress_fail if compression returns ENOSPC commit 55e78c933d747ed44b2b22879e6a55e5af34b9f7 upstream Conflicts: none Backport-reason: ZSWAP mass udpate We used to rely on the returned -ENOSPC of zpool_malloc() to increase reject_compress_poor. But the code wouldn't get to there after commit 744e1885922a ("crypto: scomp - fix req->dst buffer overflow") as the new code will goto out immediately after the special compression case happens. So there might be no longer a chance to execute zpool_malloc now. We are incorrectly increasing zswap_reject_compress_fail instead. Thus, we need to fix the counters handling right after compressions return ENOSPC. This patch also centralizes the counters handling for all of compress_poor, compress_fail and alloc_fail. 
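The centralized accounting at the end of zswap_compress() then reduces to (condensed from the hunk below):

    if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
            zswap_reject_compress_poor++;   /* page compresses too poorly */
    else if (comp_ret)
            zswap_reject_compress_fail++;   /* compression itself failed */
    else if (alloc_ret)
            zswap_reject_alloc_fail++;      /* zpool allocation failed */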
Link: https://lkml.kernel.org/r/20240219211935.72394-1-21cnbao@gmail.com Fixes: 744e1885922a ("crypto: scomp - fix req->dst buffer overflow") Signed-off-by: Barry Song Cc: Sergey Senozhatsky Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index fcabab8b2f29..0b2fd3a05f39 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1021,12 +1021,12 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; + int comp_ret = 0, alloc_ret = 0; unsigned int dlen = PAGE_SIZE; unsigned long handle; struct zpool *zpool; char *buf; gfp_t gfp; - int ret; u8 *dst; acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1057,26 +1057,18 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (ret) { - zswap_reject_compress_fail++; + if (comp_ret) goto unlock; - } zpool = zswap_find_zpool(entry); gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto unlock; - } - if (ret) { - zswap_reject_alloc_fail++; + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (alloc_ret) goto unlock; - } buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); @@ -1086,8 +1078,15 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) entry->length = dlen; unlock: + if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) + zswap_reject_compress_poor++; + else if (comp_ret) + zswap_reject_compress_fail++; + else if (alloc_ret) + zswap_reject_alloc_fail++; + mutex_unlock(&acomp_ctx->mutex); - return ret == 0; + return comp_ret == 0 && alloc_ret == 0; } static void zswap_decompress(struct zswap_entry *entry, struct page *page) -- Gitee From 87aa7c2a36eb2f794dcc5bba1833180f5621e407 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 16 Feb 2024 08:55:04 +0000 Subject: [PATCH 193/484] mm/zswap: global lru and shrinker shared by all zswap_pools commit bf9b7df23cb3c03d8cdbcaa58817e60ce06105ac upstream Conflicts: none Backport-reason: ZSWAP mass udpate Patch series "mm/zswap: optimize for dynamic zswap_pools", v3. Dynamic pool creation has been supported for a long time, which maybe not used so much in practice. But with the per-memcg lru merged, the current structure of zswap_pool's lru and shrinker become less optimal. In the current structure, each zswap_pool has its own lru, shrinker and shrink_work, but only the latest zswap_pool will be the current used. 1. When memory has pressure, all shrinkers of zswap_pools will try to shrink its lru list, there is no order between them. 2. When zswap limit hit, only the last zswap_pool's shrink_work will try to shrink its own lru, which is inefficient. A more natural way is to have a global zswap lru shared between all zswap_pools, and so is the shrinker. The code becomes much simpler too. 
Another optimization is changing zswap_pool kref to percpu_ref, which will be taken reference by every zswap entry. So the scalability is better. Testing kernel build (32 threads) in tmpfs with memory.max=2GB. (zswap shrinker and writeback enabled with one 50GB swapfile, on a 128 CPUs x86-64 machine, below is the average of 5 runs) mm-unstable zswap-global-lru real 63.20 63.12 user 1061.75 1062.95 sys 268.74 264.44 This patch (of 3): Dynamic zswap_pool creation may create/reuse to have multiple zswap_pools in a list, only the first will be current used. Each zswap_pool has its own lru and shrinker, which is not necessary and has its problem: 1. When memory has pressure, all shrinker of zswap_pools will try to shrink its own lru, there is no order between them. 2. When zswap limit hit, only the last zswap_pool's shrink_work will try to shrink its lru list. The rationale here was to try and empty the old pool first so that we can completely drop it. However, since we only support exclusive loads now, the LRU ordering should be entirely decided by the order of stores, so the oldest entries on the LRU will naturally be from the oldest pool. Anyway, having a global lru and shrinker shared by all zswap_pools is better and efficient. Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-0-200495333595@bytedance.com Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-1-200495333595@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 171 +++++++++++++++++++++-------------------------------- 1 file changed, 66 insertions(+), 105 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0b2fd3a05f39..d839bb0ba71e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -176,14 +176,19 @@ struct zswap_pool { struct kref kref; struct list_head list; struct work_struct release_work; - struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; +}; + +static struct { struct list_lru list_lru; - struct mem_cgroup *next_shrink; - struct shrinker *shrinker; atomic_t nr_stored; -}; + struct shrinker *shrinker; + struct work_struct shrink_work; + struct mem_cgroup *next_shrink; + /* The lock protects next_shrink. 
*/ + spinlock_t shrink_lock; +} zswap; /* * struct zswap_entry @@ -301,9 +306,6 @@ static void zswap_update_total_size(void) * pool functions **********************************/ -static void zswap_alloc_shrinker(struct zswap_pool *pool); -static void shrink_worker(struct work_struct *w); - static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { int i; @@ -353,30 +355,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (ret) goto error; - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -434,15 +422,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); - shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); @@ -529,24 +510,6 @@ static struct zswap_pool *zswap_pool_current_get(void) return pool; } -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - /* type and compressor must be null-terminated */ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) { @@ -816,15 +779,11 @@ void zswap_folio_swapin(struct folio *folio) void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); + /* lock out zswap shrinker walking memcg tree */ + spin_lock(&zswap.shrink_lock); + if (zswap.next_shrink == memcg) + zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); + spin_unlock(&zswap.shrink_lock); } /********************************* @@ -923,9 +882,9 @@ static void zswap_entry_free(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - zswap_lru_del(&entry->pool->list_lru, entry); + zswap_lru_del(&zswap.list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&entry->pool->nr_stored); + atomic_dec(&zswap.nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -1294,7 +1253,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, { struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); unsigned long shrink_ret, nr_protected, 
lru_size; - struct zswap_pool *pool = shrinker->private_data; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1305,7 +1263,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&pool->list_lru, sc); + lru_size = list_lru_shrink_count(&zswap.list_lru, sc); /* * Abort if we are shrinking into the protected region. @@ -1322,7 +1280,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, + shrink_ret = list_lru_shrink_walk(&zswap.list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); if (encountered_page_in_swapcache) @@ -1334,7 +1292,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, static unsigned long zswap_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct zswap_pool *pool = shrinker->private_data; struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; @@ -1348,8 +1305,8 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); #else /* use pool stats instead of memcg stats */ - nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; - nr_stored = atomic_read(&pool->nr_stored); + nr_backing = zswap_pool_total_size >> PAGE_SHIFT; + nr_stored = atomic_read(&zswap.nr_stored); #endif if (!nr_stored) @@ -1357,7 +1314,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); + nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc); /* * Subtract the lru size by an estimate of the number of pages * that should be protected. @@ -1373,23 +1330,24 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, return mult_frac(nr_freeable, nr_backing, nr_stored); } -static void zswap_alloc_shrinker(struct zswap_pool *pool) +static struct shrinker *zswap_alloc_shrinker(void) { - pool->shrinker = + struct shrinker *shrinker; + + shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); - if (!pool->shrinker) - return; + if (!shrinker) + return NULL; - pool->shrinker->private_data = pool; - pool->shrinker->scan_objects = zswap_shrinker_scan; - pool->shrinker->count_objects = zswap_shrinker_count; - pool->shrinker->batch = 0; - pool->shrinker->seeks = DEFAULT_SEEKS; + shrinker->scan_objects = zswap_shrinker_scan; + shrinker->count_objects = zswap_shrinker_count; + shrinker->batch = 0; + shrinker->seeks = DEFAULT_SEEKS; + return shrinker; } static int shrink_memcg(struct mem_cgroup *memcg) { - struct zswap_pool *pool; int nid, shrunk = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) @@ -1402,32 +1360,25 @@ static int shrink_memcg(struct mem_cgroup *memcg) if (memcg && !mem_cgroup_online(memcg)) return -ENOENT; - pool = zswap_pool_current_get(); - if (!pool) - return -EINVAL; - for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; - shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, + shrunk += list_lru_walk_one(&zswap.list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); } - zswap_pool_put(pool); return shrunk ? 
0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { - struct zswap_pool *pool = container_of(w, typeof(*pool), - shrink_work); struct mem_cgroup *memcg; int ret, failures = 0; /* global reclaim will select cgroup in a round-robin fashion. */ do { - spin_lock(&zswap_pools_lock); - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - memcg = pool->next_shrink; + spin_lock(&zswap.shrink_lock); + zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); + memcg = zswap.next_shrink; /* * We need to retry if we have gone through a full round trip, or if we @@ -1441,7 +1392,7 @@ static void shrink_worker(struct work_struct *w) * memcg is not killed when we are reclaiming. */ if (!memcg) { - spin_unlock(&zswap_pools_lock); + spin_unlock(&zswap.shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; @@ -1451,15 +1402,15 @@ static void shrink_worker(struct work_struct *w) if (!mem_cgroup_tryget_online(memcg)) { /* drop the reference from mem_cgroup_iter() */ mem_cgroup_iter_break(NULL, memcg); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); + zswap.next_shrink = NULL; + spin_unlock(&zswap.shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } - spin_unlock(&zswap_pools_lock); + spin_unlock(&zswap.shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ @@ -1473,7 +1424,6 @@ static void shrink_worker(struct work_struct *w) resched: cond_resched(); } while (!zswap_can_accept()); - zswap_pool_put(pool); } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) @@ -1514,7 +1464,6 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *shrink_pool; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1582,7 +1531,7 @@ bool zswap_store(struct folio *folio) if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + if (memcg_list_lru_alloc(memcg, &zswap.list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } @@ -1613,8 +1562,8 @@ bool zswap_store(struct folio *folio) } if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&entry->pool->list_lru, entry); - atomic_inc(&entry->pool->nr_stored); + zswap_lru_add(&zswap.list_lru, entry); + atomic_inc(&zswap.nr_stored); } spin_unlock(&tree->lock); @@ -1646,9 +1595,7 @@ bool zswap_store(struct folio *folio) return false; shrink: - shrink_pool = zswap_pool_last_get(); - if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work)) - zswap_pool_put(shrink_pool); + queue_work(shrink_wq, &zswap.shrink_work); goto reject; } @@ -1810,6 +1757,22 @@ static int zswap_setup(void) if (ret) goto hp_fail; + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!shrink_wq) + goto shrink_wq_fail; + + zswap.shrinker = zswap_alloc_shrinker(); + if (!zswap.shrinker) + goto shrinker_fail; + if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker)) + goto lru_fail; + shrinker_register(zswap.shrinker); + + INIT_WORK(&zswap.shrink_work, shrink_worker); + atomic_set(&zswap.nr_stored, 0); + spin_lock_init(&zswap.shrink_lock); + pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, @@ -1821,19 +1784,17 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = alloc_workqueue("zswap-shrink", - WQ_UNBOUND|WQ_MEM_RECLAIM, 1); - if (!shrink_wq) - 
goto fallback_fail; - if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; -fallback_fail: - if (pool) - zswap_pool_destroy(pool); +lru_fail: + shrinker_free(zswap.shrinker); +shrinker_fail: + destroy_workqueue(shrink_wq); +shrink_wq_fail: + cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: -- Gitee From 9964a45c3154e5b0a7b92c2b803bdbc19a8c2b41 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 16 Feb 2024 08:55:05 +0000 Subject: [PATCH 194/484] mm/zswap: change zswap_pool kref to percpu_ref commit 94ace3fec8477f8a46d08fc57cd1dd5efbd0a32b upstream Conflicts: none Backport-reason: ZSWAP mass udpate All zswap entries will take a reference of zswap_pool when zswap_store(), and drop it when free. Change it to use the percpu_ref is better for scalability performance. Although percpu_ref use a bit more memory which should be ok for our use case, since we almost have only one zswap_pool to be using. The performance gain is for zswap_store/load hotpath. Testing kernel build (32 threads) in tmpfs with memory.max=2GB. (zswap shrinker and writeback enabled with one 50GB swapfile, on a 128 CPUs x86-64 machine, below is the average of 5 runs) mm-unstable zswap-global-lru real 63.20 63.12 user 1061.75 1062.95 sys 268.74 264.44 [chengming.zhou@linux.dev: fix zswap_pools_lock usages after changing to percpu_ref] Link: https://lkml.kernel.org/r/20240228154954.3028626-1-chengming.zhou@linux.dev Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-2-200495333595@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Yosry Ahmed Cc: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index d839bb0ba71e..4db554cbcff2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -173,7 +173,7 @@ struct crypto_acomp_ctx { struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; - struct kref kref; + struct percpu_ref ref; struct list_head list; struct work_struct release_work; struct hlist_node node; @@ -305,6 +305,7 @@ static void zswap_update_total_size(void) /********************************* * pool functions **********************************/ +static void __zswap_pool_empty(struct percpu_ref *ref); static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { @@ -358,13 +359,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ - kref_init(&pool->kref); + ret = percpu_ref_init(&pool->ref, __zswap_pool_empty, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) + goto ref_fail; INIT_LIST_HEAD(&pool->list); zswap_pool_debug("created", pool); return pool; +ref_fail: + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -437,8 +443,9 @@ static void __zswap_pool_release(struct work_struct *work) synchronize_rcu(); - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); + /* nobody should have been able to get a ref... */ + WARN_ON(!percpu_ref_is_zero(&pool->ref)); + percpu_ref_exit(&pool->ref); /* pool is now off zswap_pools list and has no references. 
*/ zswap_pool_destroy(pool); @@ -446,13 +453,13 @@ static void __zswap_pool_release(struct work_struct *work) static struct zswap_pool *zswap_pool_current(void); -static void __zswap_pool_empty(struct kref *kref) +static void __zswap_pool_empty(struct percpu_ref *ref) { struct zswap_pool *pool; - pool = container_of(kref, typeof(*pool), kref); + pool = container_of(ref, typeof(*pool), ref); - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); WARN_ON(pool == zswap_pool_current()); @@ -461,7 +468,7 @@ static void __zswap_pool_empty(struct kref *kref) INIT_WORK(&pool->release_work, __zswap_pool_release); schedule_work(&pool->release_work); - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); } static int __must_check zswap_pool_get(struct zswap_pool *pool) @@ -469,12 +476,12 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool) if (!pool) return 0; - return kref_get_unless_zero(&pool->kref); + return percpu_ref_tryget(&pool->ref); } static void zswap_pool_put(struct zswap_pool *pool) { - kref_put(&pool->kref, __zswap_pool_empty); + percpu_ref_put(&pool->ref); } static struct zswap_pool *__zswap_pool_current(void) @@ -591,7 +598,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, return -EINVAL; } - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); pool = zswap_pool_find_get(type, compressor); if (pool) { @@ -600,17 +607,28 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, list_del_rcu(&pool->list); } - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); if (!pool) pool = zswap_pool_create(type, compressor); + else { + /* + * Restore the initial ref dropped by percpu_ref_kill() + * when the pool was decommissioned and switch it again + * to percpu mode. + */ + percpu_ref_resurrect(&pool->ref); + + /* Drop the ref from zswap_pool_find_get(). */ + zswap_pool_put(pool); + } if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); if (!ret) { put_pool = zswap_pool_current(); @@ -625,7 +643,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, put_pool = pool; } - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); if (!zswap_has_pool && !pool) { /* if initial pool creation failed, and this pool creation also @@ -642,7 +660,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, * or the new pool we failed to add */ if (put_pool) - zswap_pool_put(put_pool); + percpu_ref_kill(&put_pool->ref); return ret; } -- Gitee From 42d0660a2032609eb3715039ad5c824d5191cbd5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:35 +0000 Subject: [PATCH 195/484] mm: make folios_put() the basis of release_pages() commit 99fbb6bfc16f202adc411ad5d353db214750d121 upstream Conflicts: minor, due to backported 66edc3a5894c Backport-reason: Memory: more folios part 1 Patch series "Rearrange batched folio freeing", v3. Other than the obvious "remove calls to compound_head" changes, the fundamental belief here is that iterating a linked list is much slower than iterating an array (5-15x slower in my testing). There's also an associated belief that since we iterate the batch of folios three times, we do better when the array is small (ie 15 entries) than we do with a batch that is hundreds of entries long, which only gives us the opportunity for the first pages to fall out of cache by the time we get to the end. 
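Concretely, the batched pattern that callers are converted to looks roughly like this (sketch only; 'folios' and 'nr' stand for a caller-owned array, and folio_batch_add() returns the number of slots still free, so a zero return means the batch just filled up):

    struct folio_batch fbatch;
    unsigned int i;

    folio_batch_init(&fbatch);
    for (i = 0; i < nr; i++) {
            if (!folio_batch_add(&fbatch, folios[i]))
                    folios_put(&fbatch);    /* batch full: put refs, batch reinitialised */
    }
    if (folio_batch_count(&fbatch))
            folios_put(&fbatch);            /* flush the remainder */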
It is possible we should increase the size of folio_batch. Hopefully the bots let us know if this introduces any performance regressions. This patch (of 3): By making release_pages() call folios_put(), we can get rid of the calls to compound_head() for the callers that already know they have folios. We can also get rid of the lock_batch tracking as we know the size of the batch is limited by folio_batch. This does reduce the maximum number of pages for which the lruvec lock is held, from SWAP_CLUSTER_MAX (32) to PAGEVEC_SIZE (15). I do not expect this to make a significant difference, but if it does, we can increase PAGEVEC_SIZE to 31. Link: https://lkml.kernel.org/r/20240227174254.710559-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240227174254.710559-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mm.h | 16 +++++--- mm/mlock.c | 3 +- mm/swap.c | 98 +++++++++++++++++++++++++++------------------- 3 files changed, 69 insertions(+), 48 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e97c091acf5a..58f4903057d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,6 +36,7 @@ struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; +struct folio_batch; extern int sysctl_page_lock_unfairness; @@ -1511,6 +1512,8 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); + /* * union release_pages_arg - an array of pages or folios * @@ -1533,18 +1536,19 @@ void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. - * @nr: How many folios there are. * - * Like folio_put(), but for an array of folios. This is more efficient - * than writing the loop yourself as it will optimise the locks which - * need to be taken if the folios are freed. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ -static inline void folios_put(struct folio **folios, unsigned int nr) +static inline void folios_put(struct folio_batch *folios) { - release_pages(folios, nr); + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) diff --git a/mm/mlock.c b/mm/mlock.c index 3044340af784..826a93c4af2c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -206,8 +206,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) if (lruvec) unlock_page_lruvec_irq(lruvec); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } void mlock_drain_local(void) diff --git a/mm/swap.c b/mm/swap.c index 529dcc7c60de..688b27c28566 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -167,7 +167,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * while the LRU lock is held. 
* * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear the mlocked flag after + * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { @@ -209,8 +209,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, @@ -976,47 +975,30 @@ void lru_cache_disable(void) } /** - * release_pages - batched put_page() - * @arg: array of pages to release - * @nr: number of pages + * folios_put_refs - Reduce the reference count on a batch of folios. + * @folios: The folios. + * @refs: The number of refs to subtract from each folio. * - * Decrement the reference count on all the pages in @arg. If it - * fell to zero, remove the page from the LRU and free it. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need + * to reinitialise it. If @refs is NULL, we subtract one from each + * folio refcount. * - * Note that the argument can be an array of pages, encoded pages, - * or folio pointers. We ignore any encoded bits, and turn any of - * them into just a folio that gets free'd. + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. */ -void release_pages(release_pages_arg arg, int nr) +void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i; - struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; - unsigned int lock_batch; - for (i = 0; i < nr; i++) { - unsigned int nr_refs = 1; - struct folio *folio; - - /* Turn any of the argument types into a folio */ - folio = page_folio(encoded_page_ptr(encoded[i])); - - /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ - if (unlikely(encoded_page_flags(encoded[i]) & - ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - nr_refs = encoded_nr_pages(encoded[++i]); - - /* - * Make sure the IRQ-safe lock-holding time does not get - * excessive with a continuous string of pages from the - * same lruvec. The lock is held only if lruvec != NULL. - */ - if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = NULL; - } + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned int nr_refs = refs ? 
refs[i] : 1; if (is_huge_zero_page(&folio->page)) continue; @@ -1046,13 +1028,8 @@ void release_pages(release_pages_arg arg, int nr) } if (folio_test_lru(folio)) { - struct lruvec *prev_lruvec = lruvec; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); - if (prev_lruvec != lruvec) - lock_batch = 0; - lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); } @@ -1064,6 +1041,47 @@ void release_pages(release_pages_arg arg, int nr) mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); + folio_batch_reinit(folios); +} +EXPORT_SYMBOL(folios_put_refs); + +/** + * release_pages - batched put_page() + * @arg: array of pages to release + * @nr: number of pages + * + * Decrement the reference count on all the pages in @arg. If it + * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. + */ +void release_pages(release_pages_arg arg, int nr) +{ + struct folio_batch fbatch; + int refs[PAGEVEC_SIZE]; + struct encoded_page **encoded = arg.encoded_pages; + int i; + + folio_batch_init(&fbatch); + for (i = 0; i < nr; i++) { + /* Turn any of the argument types into a folio */ + struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); + + /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ + refs[fbatch.nr] = 1; + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); + + if (folio_batch_add(&fbatch, folio) > 0) + continue; + folios_put_refs(&fbatch, refs); + } + + if (fbatch.nr) + folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); -- Gitee From 7ec9eb27b77e08f73f2cefcd94a011d893058055 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:36 +0000 Subject: [PATCH 196/484] mm: convert free_unref_page_list() to use folios commit 7c76d92253dbb7c53ba03a4cd6639113cd1f7d3a upstream Conflicts: none Backport-reason: Memory: more folios part 1 Most of its callees are not yet ready to accept a folio, but we know all of the pages passed in are actually folios because they're linked through ->lru. 
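The conversion is then mostly mechanical: iterate struct folio through the same ->lru linkage and switch to the folio accessors, roughly as in the diff below:

    struct folio *folio, *next;

    list_for_each_entry_safe(folio, next, list, lru) {
            unsigned long pfn = folio_pfn(folio);   /* was page_to_pfn(page) */
            struct zone *zone = folio_zone(folio);  /* was page_zone(page)  */
            /* ... */
    }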
Link: https://lkml.kernel.org/r/20240227174254.710559-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef844c113eb3..b6557a035e9c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2612,17 +2612,17 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { unsigned long __maybe_unused UP_flags; - struct page *page, *next; + struct folio *folio, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; int batch_count = 0; int migratetype; /* Prepare pages for freeing */ - list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) { - list_del(&page->lru); + list_for_each_entry_safe(folio, next, list, lru) { + unsigned long pfn = folio_pfn(folio); + if (!free_unref_page_prepare(&folio->page, pfn, 0)) { + list_del(&folio->lru); continue; } @@ -2630,24 +2630,25 @@ void free_unref_page_list(struct list_head *list) * Free isolated pages directly to the allocator, see * comment in free_unref_page. */ - migratetype = get_pcppage_migratetype(page); + migratetype = get_pcppage_migratetype(&folio->page); if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + list_del(&folio->lru); + free_one_page(folio_zone(folio), &folio->page, pfn, + 0, migratetype, FPI_NONE); continue; } } - list_for_each_entry_safe(page, next, list, lru) { - struct zone *zone = page_zone(page); + list_for_each_entry_safe(folio, next, list, lru) { + struct zone *zone = folio_zone(folio); - list_del(&page->lru); - migratetype = get_pcppage_migratetype(page); + list_del(&folio->lru); + migratetype = get_pcppage_migratetype(&folio->page); /* * Either different zone requiring a different pcp lock or * excessive lock hold times when freeing a large list of - * pages. + * folios. */ if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { if (pcp) { @@ -2658,15 +2659,16 @@ void free_unref_page_list(struct list_head *list) batch_count = 0; /* - * trylock is necessary as pages may be getting freed + * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. 
*/ pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, page, page_to_pfn(page), - 0, migratetype, FPI_NONE); + free_one_page(zone, &folio->page, + folio_pfn(folio), 0, + migratetype, FPI_NONE); locked_zone = NULL; continue; } @@ -2680,8 +2682,8 @@ void free_unref_page_list(struct list_head *list) if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; - trace_mm_page_free_batched(page); - free_unref_page_commit(zone, pcp, page, migratetype, 0); + trace_mm_page_free_batched(&folio->page); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); batch_count++; } -- Gitee From 95fbfbd16ea244560972eaff0d4a3adf13db810a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:37 +0000 Subject: [PATCH 197/484] mm: add free_unref_folios() commit 90491d87dd46a4c843dae775b9e72c91624c5a7b upstream Conflicts: none Backport-reason: Memory: more folios part 1 Iterate over a folio_batch rather than a linked list. This is easier for the CPU to prefetch and has a batch count naturally built in so we don't need to track it. Again, this lowers the maximum lock hold time from 32 folios to 15, but I do not expect this to have a significant effect. Link: https://lkml.kernel.org/r/20240227174254.710559-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/internal.h | 5 +++-- mm/page_alloc.c | 59 ++++++++++++++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index b1be18013b40..86c1a7efd2fe 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -681,8 +681,9 @@ extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; -extern void free_unref_page(struct page *page, unsigned int order); -extern void free_unref_page_list(struct list_head *list); +void free_unref_page(struct page *page, unsigned int order); +void free_unref_folios(struct folio_batch *fbatch); +void free_unref_page_list(struct list_head *list); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b6557a035e9c..d40a7ff0a02d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -2607,57 +2608,51 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a list of 0-order pages + * Free a batch of 0-order pages */ -void free_unref_page_list(struct list_head *list) +void free_unref_folios(struct folio_batch *folios) { unsigned long __maybe_unused UP_flags; - struct folio *folio, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int batch_count = 0; - int migratetype; + int i, j, migratetype; - /* Prepare pages for freeing */ - list_for_each_entry_safe(folio, next, list, lru) { + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; unsigned long pfn = folio_pfn(folio); - if (!free_unref_page_prepare(&folio->page, pfn, 0)) { - list_del(&folio->lru); + if (!free_unref_page_prepare(&folio->page, pfn, 0)) continue; - } /* - * Free isolated pages directly to the allocator, see + * Free isolated folios directly to the allocator, see * comment in free_unref_page. 
*/ migratetype = get_pcppage_migratetype(&folio->page); if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&folio->lru); free_one_page(folio_zone(folio), &folio->page, pfn, 0, migratetype, FPI_NONE); continue; } + if (j != i) + folios->folios[j] = folio; + j++; } + folios->nr = j; - list_for_each_entry_safe(folio, next, list, lru) { + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); - list_del(&folio->lru); migratetype = get_pcppage_migratetype(&folio->page); - /* - * Either different zone requiring a different pcp lock or - * excessive lock hold times when freeing a large list of - * folios. - */ - if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + /* Different zone requires a different pcp lock */ + if (zone != locked_zone) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } - batch_count = 0; - /* * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. @@ -2684,13 +2679,31 @@ void free_unref_page_list(struct list_head *list) trace_mm_page_free_batched(&folio->page); free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); - batch_count++; } if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } + folio_batch_reinit(folios); +} + +void free_unref_page_list(struct list_head *list) +{ + struct folio_batch fbatch; + + folio_batch_init(&fbatch); + while (!list_empty(list)) { + struct folio *folio = list_first_entry(list, struct folio, lru); + + list_del(&folio->lru); + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); + } + + if (fbatch.nr) + free_unref_folios(&fbatch); } /* -- Gitee From e4f6cdc8c18efe0e8ef66dfbe65982d9754b8074 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:38 +0000 Subject: [PATCH 198/484] mm: use folios_put() in __folio_batch_release() commit 6871cc5742f411bf8ebbcb78b4afeb992d888228 upstream Conflicts: none Backport-reason: Memory: more folios part 1 There's no need to indirect through release_pages() and iterate over this batch of folios an extra time; we can just use the batch that we have. Link: https://lkml.kernel.org/r/20240227174254.710559-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 688b27c28566..75cdbc60bddc 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1101,8 +1101,7 @@ void __folio_batch_release(struct folio_batch *fbatch) lru_add_drain(); fbatch->percpu_pvec_drained = true; } - release_pages(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); -- Gitee From 11668faf13fbd69a3c302df8dd311569783a03f2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Mar 2024 14:08:21 +0000 Subject: [PATCH 199/484] mm: increase folio batch size commit 9cecde80aae0fb0aa44425575d5aca71bc646d89 upstream Conflicts: none Backport-reason: folio fixes On a 104 thread, 2 socket Skylake system, Intel report a 4.7% performance reduction with will-it-scale page_fault2. This was due to reducing the size of the batch from 32 to 15. Increasing the folio batch size from 15 to 31 gives a performance increase of 12.5% relative to the original, or 17.2% relative to the reduced performance commit. 
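A quick size check (assuming the usual 64-bit layout, where the batch is a small header followed by an array of folio pointers) shows why the structure stays a power of two and where the extra stack cost comes from:

        /* On 64-bit the header pads to 8 bytes, each slot is a pointer:
         *   PAGEVEC_SIZE 15:  8 + 15 * sizeof(void *) = 128 bytes
         *   PAGEVEC_SIZE 31:  8 + 31 * sizeof(void *) = 256 bytes
         * so each on-stack batch grows by 128 bytes while remaining a
         * power of two. */
        BUILD_BUG_ON(sizeof(struct folio_batch) & (sizeof(struct folio_batch) - 1));
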
The penalty of this commit is an additional 128 bytes of stack usage. Six folio_batches are also allocated from percpu memory in cpu_fbatches so that will be an additional 768 bytes of percpu memory (per CPU). Tim Chen originally submitted a patch like this in 2020: https://lore.kernel.org/linux-mm/d1cc9f12a8ad6c2a52cb600d93b06b064f2bbc57.1593205965.git.tim.c.chen@linux.intel.com/ Link: https://lkml.kernel.org/r/20240315140823.2478146-1-willy@infradead.org Fixes: 99fbb6bfc16f ("mm: make folios_put() the basis of release_pages()") Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yujie Liu Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403151058.7048f6a8-oliver.sang@intel.com Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87cc678adc85..67f10b8810a8 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -11,8 +11,8 @@ #include -/* 15 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 15 +/* 31 pointers + header align the folio_batch structure to a power of two */ +#define PAGEVEC_SIZE 31 struct folio; -- Gitee From 36286e5cc7d27eb70a05fc3e98b11ca2df1532fc Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:26 -0700 Subject: [PATCH 200/484] memcontrol: add helpers for hugetlb memcg accounting commit 4b569387c0d566db288e7c3e1b484b43df797bdb upstream Conflicts: trivial Backport-reason: Memory Cgroup: new helpers & hugetlb accounting Patch series "hugetlb memcg accounting", v4. Currently, hugetlb memory usage is not acounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory. This has been observed in our production system. For instance, here is one of our usecases: suppose there are two 32G containers. The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic page, depending on the workload within it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness. But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, slab etcetera fair. What we have had to resort to is to constantly poll hugetlb usage and readjust memory.max. Similar procedure is done to other memory limits (memory.low for e.g). However, this is rather cumbersome and buggy. Furthermore, when there is a delay in memory limits correction, (for e.g when hugetlb usage changes within consecutive runs of the userspace agent), the system could be in an over/underprotected state. This patch series rectifies this issue by charging the memcg when the hugetlb folio is allocated, and uncharging when the folio is freed. In addition, a new selftest is added to demonstrate and verify this new behavior. This patch (of 4): This patch exposes charge committing and cancelling as parts of the memory controller interface. These functionalities are useful when the try_charge() and commit_charge() stages have to be separated by other actions in between (which can fail). One such example is the new hugetlb accounting behavior in the following patch. The patch also adds a helper function to obtain a reference to the current task's memcg. 
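As a minimal sketch of the intended calling pattern (not part of the diff below; the wrapper name and the folio_alloc() stand-in are illustrative, and the public try-charge entry point for hugetlb is only added by a later patch in this series), the split interface is meant to be used like this:

        static struct folio *example_alloc_charged_folio(gfp_t gfp, unsigned int order)
        {
                struct mem_cgroup *memcg = get_mem_cgroup_from_current();
                long nr_pages = 1 << order;
                struct folio *folio;

                if (mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages)) {
                        mem_cgroup_put(memcg);
                        return ERR_PTR(-ENOMEM);
                }

                /* the step that can fail between try and commit */
                folio = folio_alloc(gfp, order);
                if (!folio) {
                        /* undo a charge that was never committed */
                        mem_cgroup_cancel_charge(memcg, nr_pages);
                        mem_cgroup_put(memcg);
                        return ERR_PTR(-ENOMEM);
                }

                mem_cgroup_commit_charge(folio, memcg);
                mem_cgroup_put(memcg);
                return folio;
        }
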
Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 21 ++++++++++++++ mm/memcontrol.c | 59 ++++++++++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index de9e3f6a0034..ce5e656fc007 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -780,6 +780,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, page_counter_read(&memcg->memory); } +void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); + int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** @@ -841,6 +843,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); + void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -940,6 +944,8 @@ static inline bool mem_sell_check_memcg(struct mem_cgroup *memcg) } #endif +struct mem_cgroup *get_mem_cgroup_from_current(void); + struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, @@ -1440,6 +1446,11 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, return false; } +static inline void mem_cgroup_commit_charge(struct folio *folio, + struct mem_cgroup *memcg) +{ +} + static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { @@ -1470,6 +1481,11 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } +static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ +} + static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } @@ -1507,6 +1523,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + return NULL; +} + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f5bbadef31e..8f7c75d7114d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1159,6 +1159,27 @@ static __always_inline bool memcg_kmem_bypass(void) return false; } +/** + * get_mem_cgroup_from_current - Obtain a reference on current task's memcg. 
+ */ +struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; + +again: + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (!css_tryget(&memcg->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + return memcg; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -3396,7 +3417,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +/** + * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. + * @memcg: memcg previously charged. + * @nr_pages: number of pages previously charged. + */ +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) return; @@ -3421,6 +3447,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } +/** + * mem_cgroup_commit_charge - commit a previously successful try_charge(). + * @folio: folio to commit the charge to. + * @memcg: memcg previously charged. + */ +void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) +{ + css_get(&memcg->css); + commit_charge(folio, memcg); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio)); + memcg_check_events(memcg, folio_nid(folio)); + local_irq_enable(); +} + #ifdef CONFIG_MEMCG_KMEM /* * The allocated objcg pointers array is not accounted directly. @@ -8105,7 +8147,7 @@ static void __mem_cgroup_clear_mc(void) /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - cancel_charge(mc.to, mc.precharge); + mem_cgroup_cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@ -8113,7 +8155,7 @@ static void __mem_cgroup_clear_mc(void) * we must uncharge here. */ if (mc.moved_charge) { - cancel_charge(mc.from, mc.moved_charge); + mem_cgroup_cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ @@ -9219,20 +9261,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { - long nr_pages = folio_nr_pages(folio); int ret; - ret = try_charge(memcg, gfp, nr_pages); + ret = try_charge(memcg, gfp, folio_nr_pages(folio)); if (ret) goto out; - css_get(&memcg->css); - commit_charge(folio, memcg); - - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, nr_pages); - memcg_check_events(memcg, folio_nid(folio)); - local_irq_enable(); + mem_cgroup_commit_charge(folio, memcg); out: return ret; } -- Gitee From b41a8fe1d4011384ca1cbb410e019af98415a963 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:27 -0700 Subject: [PATCH 201/484] memcontrol: only transfer the memcg data for migration commit 85ce2c517ade0d51b7ad95f2e88be9bbe294379a upstream Conflicts: none Backport-reason: Memory Cgroup: new helpers & hugetlb accounting For most migration use cases, only transfer the memcg data from the old folio to the new folio, and clear the old folio's memcg data. No charging and uncharging will be done. This shaves off some work on the migration path, and avoids the temporary double charging of a folio during its migration. The only exception is replace_page_cache_folio(), which will use the old mem_cgroup_migrate() (now renamed to mem_cgroup_replace_folio). 
In that context, the isolation of the old page isn't quite as thorough as with migration, so we cannot use our new implementation directly. This patch is the result of the following discussion on the new hugetlb memcg accounting behavior: https://lore.kernel.org/lkml/20231003171329.GB314430@monkey/ Link: https://lkml.kernel.org/r/20231006184629.155543-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 7 +++++++ mm/filemap.c | 2 +- mm/memcontrol.c | 40 +++++++++++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ce5e656fc007..988427ac2e7d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -845,6 +845,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); +void mem_cgroup_replace_folio(struct folio *old, struct folio *new); + void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1486,6 +1488,11 @@ static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, { } +static inline void mem_cgroup_replace_folio(struct folio *old, + struct folio *new) +{ +} + static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } diff --git a/mm/filemap.c b/mm/filemap.c index 26fcdb0d05c8..0860266001b4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -834,7 +834,7 @@ void replace_page_cache_folio(struct folio *old, struct folio *new) new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(old, new); + mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8f7c75d7114d..2cfc4034e90d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -9506,16 +9506,17 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - Charge a folio's replacement. + * mem_cgroup_replace_folio - Charge a folio's replacement. * @old: Currently circulating folio. * @new: Replacement folio. * * Charge @new as a replacement folio for @old. @old will - * be uncharged upon free. + * be uncharged upon free. This is only used by the page cache + * (in replace_page_cache_folio()). * * Both folios must be locked, @new->mapping must be set up. */ -void mem_cgroup_migrate(struct folio *old, struct folio *new) +void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; long nr_pages = folio_nr_pages(new); @@ -9554,6 +9555,39 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) local_irq_restore(flags); } +/** + * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio. + * @old: Currently circulating folio. + * @new: Replacement folio. + * + * Transfer the memcg data from the old folio to the new folio for migration. + * The old folio's data info will be cleared. Note that the memory counters + * will remain unchanged throughout the process. + * + * Both folios must be locked, @new->mapping must be set up. 
+ */ +void mem_cgroup_migrate(struct folio *old, struct folio *new) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); + VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new); + + if (mem_cgroup_disabled()) + return; + + memcg = folio_memcg(old); + VM_WARN_ON_ONCE_FOLIO(!memcg, old); + if (!memcg) + return; + + /* Transfer the charge and the css ref */ + commit_charge(new, memcg); + old->memcg_data = 0; +} + DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); -- Gitee From f39400a36a3a349fdc16dba2dd07d9ab468d3cc2 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:28 -0700 Subject: [PATCH 202/484] hugetlb: memcg: account hugetlb-backed memory in memory controller commit 8cba9576df601c384abd334a503c3f6e1e29eefb upstream Conflicts: trivial, resolved Backport-reason: Memory Cgroup: new helpers & hugetlb accounting Currently, hugetlb memory usage is not acounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory. This has been observed in our production system. For instance, here is one of our usecases: suppose there are two 32G containers. The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic page, depending on the workload within it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness. But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, slab etc. fair. What we have had to resort to is to constantly poll hugetlb usage and readjust memory.max. Similar procedure is done to other memory limits (memory.low for e.g). However, this is rather cumbersome and buggy. Furthermore, when there is a delay in memory limits correction, (for e.g when hugetlb usage changes within consecutive runs of the userspace agent), the system could be in an over/underprotected state. This patch rectifies this issue by charging the memcg when the hugetlb folio is utilized, and uncharging when the folio is freed (analogous to the hugetlb controller). Note that we do not charge when the folio is allocated to the hugetlb pool, because at this point it is not owned by any memcg. Some caveats to consider: * This feature is only available on cgroup v2. * There is no hugetlb pool management involved in the memory controller. As stated above, hugetlb folios are only charged towards the memory controller when it is used. Host overcommit management has to consider it when configuring hard limits. * Failure to charge towards the memcg results in SIGBUS. This could happen even if the hugetlb pool still has pages (but the cgroup limit is hit and reclaim attempt fails). * When this feature is enabled, hugetlb pages contribute to memory reclaim protection. low, min limits tuning must take into account hugetlb memory. * Hugetlb pages utilized while this option is not selected will not be tracked by the memory controller (even if cgroup v2 is remounted later on). 
Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/cgroup-v2.rst | 29 +++++++++++++++++ include/linux/cgroup-defs.h | 5 +++ include/linux/memcontrol.h | 9 ++++++ kernel/cgroup/cgroup.c | 15 ++++++++- mm/hugetlb.c | 35 ++++++++++++++++----- mm/memcontrol.c | 42 ++++++++++++++++++++++++- mm/migrate.c | 3 +- 7 files changed, 127 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index f6ae7fa4d60f..a5beae80d3ca 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -210,6 +210,35 @@ cgroup v2 currently supports the following mount options. relying on the original semantics (e.g. specifying bogusly high 'bypass' protection values at higher tree levels). + memory_hugetlb_accounting + Count HugeTLB memory usage towards the cgroup's overall + memory usage for the memory controller (for the purpose of + statistics reporting and memory protetion). This is a new + behavior that could regress existing setups, so it must be + explicitly opted in with this mount option. + + A few caveats to keep in mind: + + * There is no HugeTLB pool management involved in the memory + controller. The pre-allocated pool does not belong to anyone. + Specifically, when a new HugeTLB folio is allocated to + the pool, it is not accounted for from the perspective of the + memory controller. It is only charged to a cgroup when it is + actually used (for e.g at page fault time). Host memory + overcommit management has to consider this when configuring + hard limits. In general, HugeTLB pool management should be + done via other mechanisms (such as the HugeTLB controller). + * Failure to charge a HugeTLB folio to the memory controller + results in SIGBUS. This could happen even if the HugeTLB pool + still has pages available (but the cgroup limit is hit and + reclaim attempt fails). + * Charging HugeTLB memory towards the memory controller affects + memory protection and reclaim dynamics. Any userspace tuning + (of low, min limits for e.g) needs to take this into account. + * HugeTLB pages utilized while this option is not selected + will not be tracked by the memory controller (even if cgroup + v2 is remounted later on). + Organizing Processes and Threads -------------------------------- diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1ec1d161b026..3f2dcd5010dc 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -117,6 +117,11 @@ enum { * Enable recursive subtree protection */ CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), + + /* + * Enable hugetlb accounting for the memory controller. 
+ */ + CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), }; /* cftype->flags */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 988427ac2e7d..f980bd9dfea2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -816,6 +816,9 @@ static inline int mem_cgroup_charge_file(struct folio *folio, struct mm_struct * return __mem_cgroup_charge_file(folio, mm, gfp); } +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1465,6 +1468,12 @@ static inline int mem_cgroup_charge_file(struct folio *folio, return 0; } +static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, + gfp_t gfp, long nr_pages) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 91d7c5eb576e..6f654ff5083b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1932,6 +1932,7 @@ enum cgroup2_param { Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, + Opt_memory_hugetlb_accounting, nr__cgroup2_params }; @@ -1940,6 +1941,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), + fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), {} }; @@ -1966,6 +1968,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; + case Opt_memory_hugetlb_accounting: + ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + return 0; } return -EINVAL; } @@ -1990,6 +1995,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; + + if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; } } @@ -2003,6 +2013,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + seq_puts(seq, ",memory_hugetlb_accounting"); return 0; } @@ -7786,7 +7798,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" - "memory_recursiveprot\n"); + "memory_recursiveprot\n" + "memory_hugetlb_accounting\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87d0f5c29c0c..0109ab23447d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1934,6 +1934,7 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; @@ -3075,11 +3076,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool 
*spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit; + long map_chg, map_commit, nr_pages = pages_per_huge_page(h); long gbl_chg; - int ret, idx; + int memcg_charge_ret, ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct mem_cgroup *memcg; bool deferred_reserve; + gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; + + memcg = get_mem_cgroup_from_current(); + memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); + if (memcg_charge_ret == -ENOMEM) { + mem_cgroup_put(memcg); + return ERR_PTR(-ENOMEM); + } idx = hstate_index(h); /* @@ -3088,8 +3098,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) + if (map_chg < 0) { + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOMEM); + } /* * Processes that did not create the mapping will have no @@ -3100,10 +3114,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg || avoid_reserve) { gbl_chg = hugepage_subpool_get_pages(spool, 1); - if (gbl_chg < 0) { - vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); - } + if (gbl_chg < 0) + goto out_end_reservation; /* * Even though there was no reservation in the region/reserve @@ -3188,6 +3200,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, spin_unlock_irq(&hugetlb_lock); } } + + if (!memcg_charge_ret) + mem_cgroup_commit_charge(folio, memcg); + mem_cgroup_put(memcg); + return folio; out_uncharge_cgroup: @@ -3199,7 +3216,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); +out_end_reservation: vma_end_reservation(h, vma, addr); + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2cfc4034e90d..8b2927c39a30 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -9307,6 +9307,41 @@ int __mem_cgroup_charge_file(struct folio *folio, struct mm_struct *mm, gfp_t gf return ret; } +/** + * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio + * @memcg: memcg to charge. + * @gfp: reclaim mode. + * @nr_pages: number of pages to charge. + * + * This function is called when allocating a huge page folio to determine if + * the memcg has the capacity for it. It does not commit the charge yet, + * as the hugetlb folio itself has not been obtained from the hugetlb pool. + * + * Once we have obtained the hugetlb folio, we can call + * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the + * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect + * of try_charge(). + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages) +{ + /* + * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, + * but do not attempt to commit charge later (or cancel on error) either. 
+ */ + if (mem_cgroup_disabled() || !memcg || + !cgroup_subsys_on_dfl(memory_cgrp_subsys) || + !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + return -EOPNOTSUPP; + + if (try_charge(memcg, gfp, nr_pages)) + return -ENOMEM; + + return 0; +} + /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. * @folio: folio to charge. @@ -9579,7 +9614,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) return; memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); + /* + * Note that it is normal to see !memcg for a hugetlb folio. + * For e.g, itt could have been allocated when memory_hugetlb_accounting + * was not selected. + */ + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); if (!memcg) return; diff --git a/mm/migrate.c b/mm/migrate.c index c3b7fa31dd3c..00633c8aa332 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -638,8 +638,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_copy_owner(newfolio, folio); - if (!folio_test_hugetlb(folio)) - mem_cgroup_migrate(folio, newfolio); + mem_cgroup_migrate(folio, newfolio); } EXPORT_SYMBOL(folio_migrate_flags); -- Gitee From 3b8929b0fc84cafae5466361d4451f1232d44dd3 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:29 -0700 Subject: [PATCH 203/484] selftests: add a selftest to verify hugetlb usage in memcg commit c0dddb7aa5f85e6150a1e3230e01d5ba286eded9 upstream Conflicts: none Backport-reason: Memory Cgroup: new helpers & hugetlb accounting This patch add a new kselftest to demonstrate and verify the new hugetlb memcg accounting behavior. Link: https://lkml.kernel.org/r/20231006184629.155543-5-nphamcs@gmail.com Signed-off-by: Nhat Pham Cc: Frank van der Linden Cc: Johannes Weiner Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- MAINTAINERS | 2 + tools/testing/selftests/cgroup/.gitignore | 1 + tools/testing/selftests/cgroup/Makefile | 2 + .../selftests/cgroup/test_hugetlb_memcg.c | 234 ++++++++++++++++++ 4 files changed, 239 insertions(+) create mode 100644 tools/testing/selftests/cgroup/test_hugetlb_memcg.c diff --git a/MAINTAINERS b/MAINTAINERS index 082aac073070..1809d6d70d1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5288,6 +5288,7 @@ S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c F: tools/testing/selftests/cgroup/memcg_protection.m +F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c F: tools/testing/selftests/cgroup/test_memcontrol.c @@ -9693,6 +9694,7 @@ F: include/linux/hugetlb.h F: mm/hugetlb.c F: mm/hugetlb_vmemmap.c F: mm/hugetlb_vmemmap.h +F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c HVA ST MEDIA DRIVER M: Jean-Christophe Trotin diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index af8c3f30b9c1..2732e0b29271 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -7,4 +7,5 @@ test_kill test_cpu test_cpuset test_zswap +test_hugetlb_memcg wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index c27f05f6ce9b..00b441928909 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -14,6 +14,7 @@ TEST_GEN_PROGS += test_kill TEST_GEN_PROGS += test_cpu TEST_GEN_PROGS += test_cpuset 
TEST_GEN_PROGS += test_zswap +TEST_GEN_PROGS += test_hugetlb_memcg LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h @@ -27,3 +28,4 @@ $(OUTPUT)/test_kill: cgroup_util.c $(OUTPUT)/test_cpu: cgroup_util.c $(OUTPUT)/test_cpuset: cgroup_util.c $(OUTPUT)/test_zswap: cgroup_util.c +$(OUTPUT)/test_hugetlb_memcg: cgroup_util.c diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c new file mode 100644 index 000000000000..f0fefeb4cc24 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" +#include "cgroup_util.h" + +#define ADDR ((void *)(0x0UL)) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +/* mapping 8 MBs == 4 hugepages */ +#define LENGTH (8UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +/* borrowed from mm/hmm-tests.c */ +static long get_hugepage_size(void) +{ + int fd; + char buf[2048]; + int len; + char *p, *q, *path = "/proc/meminfo", *tag = "Hugepagesize:"; + long val; + + fd = open(path, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + +static int set_file(const char *path, long value) +{ + FILE *file; + int ret; + + file = fopen(path, "w"); + if (!file) + return -1; + ret = fprintf(file, "%ld\n", value); + fclose(file); + return ret; +} + +static int set_nr_hugepages(long value) +{ + return set_file("/proc/sys/vm/nr_hugepages", value); +} + +static unsigned int check_first(char *addr) +{ + return *(unsigned int *)addr; +} + +static void write_data(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static int hugetlb_test_program(const char *cgroup, void *arg) +{ + char *test_group = (char *)arg; + void *addr; + long old_current, expected_current, current; + int ret = EXIT_FAILURE; + + old_current = cg_read_long(test_group, "memory.current"); + set_nr_hugepages(20); + current = cg_read_long(test_group, "memory.current"); + if (current - old_current >= MB(2)) { + ksft_print_msg( + "setting nr_hugepages should not increase hugepage usage.\n"); + ksft_print_msg("before: %ld, after: %ld\n", old_current, current); + return EXIT_FAILURE; + } + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); + if (addr == MAP_FAILED) { + ksft_print_msg("fail to mmap.\n"); + return EXIT_FAILURE; + } + current = cg_read_long(test_group, "memory.current"); + if (current - old_current >= MB(2)) { + ksft_print_msg("mmap should not increase hugepage usage.\n"); + ksft_print_msg("before: %ld, after: %ld\n", old_current, current); + goto out_failed_munmap; + } + old_current = current; + + /* read the first page */ + check_first(addr); + expected_current = old_current + MB(2); + current = cg_read_long(test_group, "memory.current"); + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should increase by around 
2MB.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + goto out_failed_munmap; + } + + /* write to the whole range */ + write_data(addr); + current = cg_read_long(test_group, "memory.current"); + expected_current = old_current + MB(8); + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should increase by around 8MB.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + goto out_failed_munmap; + } + + /* unmap the whole range */ + munmap(addr, LENGTH); + current = cg_read_long(test_group, "memory.current"); + expected_current = old_current; + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should go back down.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + return ret; + } + + ret = EXIT_SUCCESS; + return ret; + +out_failed_munmap: + munmap(addr, LENGTH); + return ret; +} + +static int test_hugetlb_memcg(char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + + test_group = cg_name(root, "hugetlb_memcg_test"); + if (!test_group || cg_create(test_group)) { + ksft_print_msg("fail to create cgroup.\n"); + goto out; + } + + if (cg_write(test_group, "memory.max", "100M")) { + ksft_print_msg("fail to set cgroup memory limit.\n"); + goto out; + } + + /* disable swap */ + if (cg_write(test_group, "memory.swap.max", "0")) { + ksft_print_msg("fail to disable swap.\n"); + goto out; + } + + if (!cg_run(test_group, hugetlb_test_program, (void *)test_group)) + ret = KSFT_PASS; +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +int main(int argc, char **argv) +{ + char root[PATH_MAX]; + int ret = EXIT_SUCCESS, has_memory_hugetlb_acc; + + has_memory_hugetlb_acc = proc_mount_contains("memory_hugetlb_accounting"); + if (has_memory_hugetlb_acc < 0) + ksft_exit_skip("Failed to query cgroup mount option\n"); + else if (!has_memory_hugetlb_acc) + ksft_exit_skip("memory hugetlb accounting is disabled\n"); + + /* Unit is kB! */ + if (get_hugepage_size() != 2048) { + ksft_print_msg("test_hugetlb_memcg requires 2MB hugepages\n"); + ksft_test_result_skip("test_hugetlb_memcg\n"); + return ret; + } + + if (cg_find_unified_root(root, sizeof(root))) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + switch (test_hugetlb_memcg(root)) { + case KSFT_PASS: + ksft_test_result_pass("test_hugetlb_memcg\n"); + break; + case KSFT_SKIP: + ksft_test_result_skip("test_hugetlb_memcg\n"); + break; + default: + ret = EXIT_FAILURE; + ksft_test_result_fail("test_hugetlb_memcg\n"); + break; + } + + return ret; +} -- Gitee From 41f3789454069f4a609a2932f2784dd512a986a8 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 20 Dec 2023 14:51:40 +0800 Subject: [PATCH 204/484] mm: memcg: fix split queue list crash when large folio migration commit 9bcef5973e31020e5aa8571eb994d67b77318356 upstream Conflicts: none Backport-reason: mTHP fixes When running autonuma with enabling multi-size THP, I encountered the following kernel crash issue: [ 134.290216] list_del corruption. prev->next should be fffff9ad42e1c490, but was dead000000000100. (prev=fffff9ad42399890) [ 134.290877] kernel BUG at lib/list_debug.c:62! [ 134.291052] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 134.291210] CPU: 56 PID: 8037 Comm: numa01 Kdump: loaded Tainted: G E 6.7.0-rc4+ #20 [ 134.291649] RIP: 0010:__list_del_entry_valid_or_report+0x97/0xb0 ...... [ 134.294252] Call Trace: [ 134.294362] [ 134.294440] ? 
die+0x33/0x90 [ 134.294561] ? do_trap+0xe0/0x110 ...... [ 134.295681] ? __list_del_entry_valid_or_report+0x97/0xb0 [ 134.295842] folio_undo_large_rmappable+0x99/0x100 [ 134.296003] destroy_large_folio+0x68/0x70 [ 134.296172] migrate_folio_move+0x12e/0x260 [ 134.296264] ? __pfx_remove_migration_pte+0x10/0x10 [ 134.296389] migrate_pages_batch+0x495/0x6b0 [ 134.296523] migrate_pages+0x1d0/0x500 [ 134.296646] ? __pfx_alloc_misplaced_dst_folio+0x10/0x10 [ 134.296799] migrate_misplaced_folio+0x12d/0x2b0 [ 134.296953] do_numa_page+0x1f4/0x570 [ 134.297121] __handle_mm_fault+0x2b0/0x6c0 [ 134.297254] handle_mm_fault+0x107/0x270 [ 134.300897] do_user_addr_fault+0x167/0x680 [ 134.304561] exc_page_fault+0x65/0x140 [ 134.307919] asm_exc_page_fault+0x22/0x30 The reason for the crash is that, the commit 85ce2c517ade ("memcontrol: only transfer the memcg data for migration") removed the charging and uncharging operations of the migration folios and cleared the memcg data of the old folio. During the subsequent release process of the old large folio in destroy_large_folio(), if the large folio needs to be removed from the split queue, an incorrect split queue can be obtained (which is pgdat->deferred_split_queue) because the old folio's memcg is NULL now. This can lead to list operations being performed under the wrong split queue lock protection, resulting in a list crash as above. After the migration, the old folio is going to be freed, so we can remove it from the split queue in mem_cgroup_migrate() a bit earlier before clearing the memcg data to avoid getting incorrect split queue. [akpm@linux-foundation.org: fix comment, per Zi Yan] Link: https://lkml.kernel.org/r/61273e5e9b490682388377c20f52d19de4a80460.1703054559.git.baolin.wang@linux.alibaba.com Fixes: 85ce2c517ade ("memcontrol: only transfer the memcg data for migration") Signed-off-by: Baolin Wang Reviewed-by: Nhat Pham Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memcontrol.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b2927c39a30..eaf04246f060 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -9625,6 +9625,17 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) /* Transfer the charge and the css ref */ commit_charge(new, memcg); + /* + * If the old folio is a large folio and is in the split queue, it needs + * to be removed from the split queue now, in case getting an incorrect + * split queue in destroy_large_folio() after the memcg of the old folio + * is cleared. + * + * In addition, the old folio is about to be freed after migration, so + * removing from the split queue a bit earlier seems reasonable. + */ + if (folio_test_large(old) && folio_test_large_rmappable(old)) + folio_undo_large_rmappable(old); old->memcg_data = 0; } -- Gitee From cdb590ea402187291f4759a717c14540ca549bf8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:39 +0000 Subject: [PATCH 205/484] memcg: add mem_cgroup_uncharge_folios() commit 4882c80975e2bf7241a5b043eb1dbe8df2726a29 upstream Conflicts: none Backport-reason: Memory: more folios part 1 Almost identical to mem_cgroup_uncharge_list(), except it takes a folio_batch instead of a list_head. 
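For orientation, the caller-side pattern that later patches in this series converge on looks roughly like this (an illustrative sketch, not part of the diff below; the function name is hypothetical):

        static void example_free_folio_batch(struct list_head *list)
        {
                struct folio_batch fbatch;
                struct folio *folio, *next;

                folio_batch_init(&fbatch);
                list_for_each_entry_safe(folio, next, list, lru) {
                        list_del(&folio->lru);
                        /* folio_batch_add() returns the space left; 0 == full */
                        if (folio_batch_add(&fbatch, folio) == 0) {
                                mem_cgroup_uncharge_folios(&fbatch);
                                free_unref_folios(&fbatch);     /* reinits fbatch */
                        }
                }
                if (folio_batch_count(&fbatch)) {
                        mem_cgroup_uncharge_folios(&fbatch);
                        free_unref_folios(&fbatch);
                }
        }
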
Link: https://lkml.kernel.org/r/20240227174254.710559-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 14 ++++++++++++-- mm/memcontrol.c | 13 +++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f980bd9dfea2..996923b66d00 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -846,10 +846,16 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); +void __mem_cgroup_uncharge_folios(struct folio_batch *folios); +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_folios(folios); +} +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); - void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1492,6 +1498,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ +} + static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eaf04246f060..0b0d329bc5ed 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -9540,6 +9541,18 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) uncharge_batch(&ug); } +void __mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + struct uncharge_gather ug; + unsigned int i; + + uncharge_gather_clear(&ug); + for (i = 0; i < folios->nr; i++) + uncharge_folio(folios->folios[i], &ug); + if (ug.memcg) + uncharge_batch(&ug); +} + /** * mem_cgroup_replace_folio - Charge a folio's replacement. * @old: Currently circulating folio. -- Gitee From 111d21bb47235b7fdb15ea0727337caea7b41c36 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:40 +0000 Subject: [PATCH 206/484] mm: remove use of folio list from folios_put() commit 7c33b8c4229af19797c78de48827ca70228c1f47 upstream Conflicts: minor, due to backported 66edc3a5894c Backport-reason: Memory: more folios part 1 Instead of putting the interesting folios on a list, delete the uninteresting one from the folio_batch. Link: https://lkml.kernel.org/r/20240227174254.710559-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 75cdbc60bddc..9a49ed156946 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -991,12 +991,11 @@ void lru_cache_disable(void) */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { - int i; - LIST_HEAD(pages_to_free); + int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; - for (i = 0; i < folios->nr; i++) { + for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? 
refs[i] : 1; @@ -1034,14 +1033,20 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) __folio_clear_lru_flags(folio); } - list_add(&folio->lru, &pages_to_free); + if (j != i) + folios->folios[j] = folio; + j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); + if (!j) { + folio_batch_reinit(folios); + return; + } - mem_cgroup_uncharge_list(&pages_to_free); - free_unref_page_list(&pages_to_free); - folio_batch_reinit(folios); + folios->nr = j; + mem_cgroup_uncharge_folios(folios); + free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); -- Gitee From b4ceb29354c5a13ca97dca1a671becf221555524 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:41 +0000 Subject: [PATCH 207/484] mm: use free_unref_folios() in put_pages_list() commit 24835f899c0129a4733e899e4da20e2e72f40bd9 upstream Conflicts: none Backport-reason: Memory: more folios part 1 Break up the list of folios into batches here so that the folios are more likely to be cache hot when doing the rest of the processing. Link: https://lkml.kernel.org/r/20240227174254.710559-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 9a49ed156946..3bcad0e39d4f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -130,22 +130,25 @@ EXPORT_SYMBOL(__folio_put); */ void put_pages_list(struct list_head *pages) { - struct folio *folio, *next; + struct folio_batch fbatch; + struct folio *folio; - list_for_each_entry_safe(folio, next, pages, lru) { - if (!folio_put_testzero(folio)) { - list_del(&folio->lru); + folio_batch_init(&fbatch); + list_for_each_entry(folio, pages, lru) { + if (!folio_put_testzero(folio)) continue; - } if (folio_test_large(folio)) { - list_del(&folio->lru); __folio_put_large(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); } - free_unref_page_list(pages); + if (fbatch.nr) + free_unref_folios(&fbatch); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); -- Gitee From 2d043027999d0be87d6752125f93a9d34041f9af Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:42 +0000 Subject: [PATCH 208/484] mm: use __page_cache_release() in folios_put() commit f1ee018baee9f4e724e08859c2559323be768be3 upstream Conflicts: minor, due to backported 33dfe9204f29 and 66edc3a5894c Backport-reason: Memory: more folios part 1 Pass a pointer to the lruvec so we can take advantage of the folio_lruvec_relock_irqsave(). Adjust the calling convention of folio_lruvec_relock_irqsave() to suit and add a page_cache_release() wrapper. 
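Concretely, the calling convention changes from returning the lruvec to updating it through a pointer (caller-side sketch only, matching the diff below):

        /* before */
        lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);

        /* after: the helper relocks and updates *lruvecp in place */
        folio_lruvec_relock_irqsave(folio, &lruvec, &flags);

This is what lets __page_cache_release() share a single lruvec/flags pair between folios_put_refs() and the new page_cache_release() wrapper.
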
Link: https://lkml.kernel.org/r/20240227174254.710559-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 16 ++++++++-------- mm/swap.c | 36 +++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 996923b66d00..53dba5c29d51 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1917,18 +1917,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, return folio_lruvec_lock_irq(folio); } -/* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, - struct lruvec *locked_lruvec, unsigned long *flags) +/* Don't lock again iff folio's lruvec locked */ +static inline void folio_lruvec_relock_irqsave(struct folio *folio, + struct lruvec **lruvecp, unsigned long *flags) { - if (locked_lruvec) { - if (folio_matches_lruvec(folio, locked_lruvec)) - return locked_lruvec; + if (*lruvecp) { + if (folio_matches_lruvec(folio, *lruvecp)) + return; - unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + unlock_page_lruvec_irqrestore(*lruvecp, *flags); } - return folio_lruvec_lock_irqsave(folio, flags); + *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/mm/swap.c b/mm/swap.c index 3bcad0e39d4f..c03dda646c5a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,26 +74,33 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; +static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, + unsigned long *flagsp) +{ + if (folio_test_lru(folio)) { + folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); + lruvec_del_folio(*lruvecp, folio); + __folio_clear_lru_flags(folio); + } +} + /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ -static void __page_cache_release(struct folio *folio) +static void page_cache_release(struct folio *folio) { - if (folio_test_lru(folio)) { - struct lruvec *lruvec; - unsigned long flags; + struct lruvec *lruvec = NULL; + unsigned long flags; - lruvec = folio_lruvec_lock_irqsave(folio, &flags); - lruvec_del_folio(lruvec, folio); - __folio_clear_lru_flags(folio); + __page_cache_release(folio, &lruvec, &flags); + if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - } } static void __folio_put_small(struct folio *folio) { - __page_cache_release(folio); + page_cache_release(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, 0); } @@ -107,7 +114,7 @@ static void __folio_put_large(struct folio *folio) * be called for hugetlb (it has a separate hugetlb_cgroup.) 
*/ if (!folio_test_hugetlb(folio)) - __page_cache_release(folio); + page_cache_release(folio); destroy_large_folio(folio); } @@ -204,7 +211,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); @@ -1029,12 +1036,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) continue; } - if (folio_test_lru(folio)) { - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, - &flags); - lruvec_del_folio(lruvec, folio); - __folio_clear_lru_flags(folio); - } + __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; -- Gitee From e7951099dfc5d48cb439a38bcfe1f4ba4f0b73ef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:43 +0000 Subject: [PATCH 209/484] mm: handle large folios in free_unref_folios() commit 31b2ff82aefb33ce92496a1becddd6ce51060db2 upstream Conflicts: none Backport-reason: Memory: more folios part 1 Call folio_undo_large_rmappable() if needed. free_unref_page_prepare() destroys the ability to call folio_order(), so stash the order in folio->private for the benefit of the second loop. Link: https://lkml.kernel.org/r/20240227174254.710559-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d40a7ff0a02d..14916b6b542b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2608,7 +2608,7 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a batch of 0-order pages + * Free a batch of folios */ void free_unref_folios(struct folio_batch *folios) { @@ -2621,19 +2621,25 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned long pfn = folio_pfn(folio); - if (!free_unref_page_prepare(&folio->page, pfn, 0)) + unsigned int order = folio_order(folio); + + if (order > 0 && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (!free_unref_page_prepare(&folio->page, pfn, order)) continue; /* - * Free isolated folios directly to the allocator, see - * comment in free_unref_page. + * Free isolated folios and orders not handled on the PCP + * directly to the allocator, see comment in free_unref_page. 
*/ migratetype = get_pcppage_migratetype(&folio->page); - if (unlikely(is_migrate_isolate(migratetype))) { + if (!pcp_allowed_order(order) || + is_migrate_isolate(migratetype)) { free_one_page(folio_zone(folio), &folio->page, pfn, - 0, migratetype, FPI_NONE); + order, migratetype, FPI_NONE); continue; } + folio->private = (void *)(unsigned long)order; if (j != i) folios->folios[j] = folio; j++; @@ -2643,7 +2649,9 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); + unsigned int order = (unsigned long)folio->private; + folio->private = NULL; migratetype = get_pcppage_migratetype(&folio->page); /* Different zone requires a different pcp lock */ @@ -2662,7 +2670,7 @@ void free_unref_folios(struct folio_batch *folios) if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, - folio_pfn(folio), 0, + folio_pfn(folio), order, migratetype, FPI_NONE); locked_zone = NULL; continue; @@ -2678,7 +2686,8 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, + order); } if (pcp) { -- Gitee From 177800eebd9786a8ba9cedcd3364b14bfddc0ed5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:44 +0000 Subject: [PATCH 210/484] mm: allow non-hugetlb large folios to be batch processed commit f77171d241e379ea93448a53d58104191e02135c upstream Conflicts: none Backport-reason: Memory: more folios part 1 Hugetlb folios still get special treatment, but normal large folios can now be freed by free_unref_folios(). This should have a reasonable performance impact, TBD. Link: https://lkml.kernel.org/r/20240227174254.710559-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index c03dda646c5a..f2c1f407fe1e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1027,12 +1027,13 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (!folio_ref_sub_and_test(folio, nr_refs)) continue; - if (folio_test_large(folio)) { + /* hugetlb has its own memcg */ + if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - __folio_put_large(folio); + free_huge_folio(folio); continue; } -- Gitee From 9349d36c6ccd6a67cd26160acac1353006c5836d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:45 +0000 Subject: [PATCH 211/484] mm: free folios in a batch in shrink_folio_list() commit bc2ff4cbc3294c01f29449405c42ee26ee0e1f59 upstream Conflicts: none Backport-reason: Memory: more folios part 1 Use free_unref_page_batch() to free the folios. This may increase the number of IPIs from calling try_to_unmap_flush() more often, but that's going to be very workload-dependent. It may even reduce the number of IPIs as we now batch-free large folios instead of freeing them one at a time. 
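[Backport editor's note, not part of the upstream commit message: the hunks in this extract are flattened onto single lines, so the batch-freeing pattern this patch adopts in shrink_folio_list() is re-indented below as a condensed sketch (not a verbatim excerpt); all names are the kernel APIs introduced earlier in this series.

	struct folio_batch free_folios;

	folio_batch_init(&free_folios);
	/* for each folio that has been fully reclaimed ... */
	if (folio_batch_add(&free_folios, folio) == 0) {
		/* batch is full: uncharge, flush deferred TLB IPIs, free */
		mem_cgroup_uncharge_folios(&free_folios);
		try_to_unmap_flush();
		free_unref_folios(&free_folios);
	}
	/* ... and flush the remainder once more after the loop */

free_unref_folios() reinitialises the batch after freeing, so the same folio_batch is reused across flushes. The following patches reuse the same pattern in move_folios_to_lru().]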
Link: https://lkml.kernel.org/r/20240227174254.710559-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: David Hildenbrand Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/vmscan.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a290c6bde5d3..3e056bc3faea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1130,14 +1130,15 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { + struct folio_batch free_folios; LIST_HEAD(ret_folios); - LIST_HEAD(free_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; + folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); @@ -1551,14 +1552,11 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, */ nr_reclaimed += nr_pages; - /* - * Is there need to periodically free_folio_list? It would - * appear not as the counts should be low - */ - if (unlikely(folio_test_large(folio))) - destroy_large_folio(folio); - else - list_add(&folio->lru, &free_folios); + if (folio_batch_add(&free_folios, folio) == 0) { + mem_cgroup_uncharge_folios(&free_folios); + try_to_unmap_flush(); + free_unref_folios(&free_folios); + } continue; activate_locked_split: @@ -1622,9 +1620,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_folios); + mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_folios); + free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); -- Gitee From ce690b73d504a96d1e806cbd081f98b8cd67ee3b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:46 +0000 Subject: [PATCH 212/484] mm: free folios directly in move_folios_to_lru() commit 29f3843026cf83414a8bc319c97c1d09a6c33f4e upstream Conflicts: none Backport-reason: Memory: more folios part 1 The few folios which can't be moved to the LRU list (because their refcount dropped to zero) used to be returned to the caller to dispose of. Make this simpler to call by freeing the folios directly through free_unref_folios(). Link: https://lkml.kernel.org/r/20240227174254.710559-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/vmscan.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3e056bc3faea..ad7ff578c7bd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1901,7 +1901,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. - * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. 
*/ @@ -1909,8 +1908,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; - LIST_HEAD(folios_to_free); + struct folio_batch free_folios; + folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); @@ -1939,12 +1939,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); - if (unlikely(folio_test_large(folio))) { + if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); - destroy_large_folio(folio); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); - } else - list_add(&folio->lru, &folios_to_free); + } continue; } @@ -1959,10 +1959,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, nr_moved += nr_pages; } - /* - * To save our caller's stack, now use input list for pages to free. - */ - list_splice(&folios_to_free, list); + if (free_folios.nr) { + spin_unlock_irq(&lruvec->lru_lock); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); + spin_lock_irq(&lruvec->lru_lock); + } return nr_moved; } @@ -2041,8 +2043,6 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); - mem_cgroup_uncharge_list(&folio_list); - free_unref_page_list(&folio_list); /* * If dirty folios are scanned that are not queued for IO, it @@ -2187,8 +2187,6 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_activate = move_folios_to_lru(lruvec, &l_active); nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); - /* Keep all free folios in l_active list */ - list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); @@ -2198,8 +2196,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (nr_rotated) lru_note_cost(lruvec, file, 0, nr_rotated); - mem_cgroup_uncharge_list(&l_active); - free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -4752,10 +4748,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_unlock_irq(&lruvec->lru_lock); - mem_cgroup_uncharge_list(&list); - free_unref_page_list(&list); - - INIT_LIST_HEAD(&list); list_splice_init(&clean, &list); if (!list_empty(&list)) { -- Gitee From 4b829c77309e4fbbb9b05d1ed3ed390eb102ef35 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:47 +0000 Subject: [PATCH 213/484] memcg: remove mem_cgroup_uncharge_list() commit be5a9e17a2ccbecfb7020aa1938e2c62d8a9189c upstream Conflicts: none Backport-reason: Memory: more folios part 1 All users have been converted to mem_cgroup_uncharge_folios() so we can remove this API. 
Link: https://lkml.kernel.org/r/20240227174254.710559-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 12 ------------ mm/memcontrol.c | 19 ------------------- 2 files changed, 31 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 53dba5c29d51..49de8d2790eb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -838,14 +838,6 @@ static inline void mem_cgroup_uncharge(struct folio *folio) __mem_cgroup_uncharge(folio); } -void __mem_cgroup_uncharge_list(struct list_head *page_list); -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) -{ - if (mem_cgroup_disabled()) - return; - __mem_cgroup_uncharge_list(page_list); -} - void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { @@ -1494,10 +1486,6 @@ static inline void mem_cgroup_uncharge(struct folio *folio) { } -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) -{ -} - static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b0d329bc5ed..7fb9c6050baa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -9522,25 +9522,6 @@ void __mem_cgroup_uncharge(struct folio *folio) uncharge_batch(&ug); } -/** - * __mem_cgroup_uncharge_list - uncharge a list of page - * @page_list: list of pages to uncharge - * - * Uncharge a list of pages previously charged with - * __mem_cgroup_charge(). - */ -void __mem_cgroup_uncharge_list(struct list_head *page_list) -{ - struct uncharge_gather ug; - struct folio *folio; - - uncharge_gather_clear(&ug); - list_for_each_entry(folio, page_list, lru) - uncharge_folio(folio, &ug); - if (ug.memcg) - uncharge_batch(&ug); -} - void __mem_cgroup_uncharge_folios(struct folio_batch *folios) { struct uncharge_gather ug; -- Gitee From 090c42d40a2de9bbef4d153a2d9dcf7453561c9e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:48 +0000 Subject: [PATCH 214/484] mm: remove free_unref_page_list() commit 8b7b0a5eee22e3cd0468944d0720120c36340a2b upstream Conflicts: none Backport-reason: Memory: more folios part 1 All callers now use free_unref_folios() so we can delete this function. 
Link: https://lkml.kernel.org/r/20240227174254.710559-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/internal.h | 1 - mm/page_alloc.c | 18 ------------------ 2 files changed, 19 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 86c1a7efd2fe..a5ed5dd675c5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -683,7 +683,6 @@ extern int user_min_free_kbytes; void free_unref_page(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); -void free_unref_page_list(struct list_head *list); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 14916b6b542b..6565ef713c25 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,24 +2697,6 @@ void free_unref_folios(struct folio_batch *folios) folio_batch_reinit(folios); } -void free_unref_page_list(struct list_head *list) -{ - struct folio_batch fbatch; - - folio_batch_init(&fbatch); - while (!list_empty(list)) { - struct folio *folio = list_first_entry(list, struct folio, lru); - - list_del(&folio->lru); - if (folio_batch_add(&fbatch, folio) > 0) - continue; - free_unref_folios(&fbatch); - } - - if (fbatch.nr) - free_unref_folios(&fbatch); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1< Date: Tue, 27 Feb 2024 17:42:49 +0000 Subject: [PATCH 215/484] mm: remove lru_to_page() commit f39ec4dcb9e9b03b2a280829b8c15e3ae607398c upstream Conflicts: none Backport-reason: Memory: more folios part 1 The last user was removed over a year ago; remove the definition. Link: https://lkml.kernel.org/r/20240227174254.710559-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 58f4903057d2..98f7d0f2aa32 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -231,7 +231,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); -- Gitee From b7689f5bec502420487cf5a12fe5430cbbe7e97e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:50 +0000 Subject: [PATCH 216/484] mm: convert free_pages_and_swap_cache() to use folios_put() commit 4907e80b76af004b6af42f0d4131e23ac73bc07c upstream Conflicts: none Backport-reason: Memory: more folios part 1 Process the pages in batch-sized quantities instead of all-at-once. 
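[Backport editor's note, not part of the upstream commit message: re-indented from the hunk below for readability, the conversion pairs the folio_batch with a refs[] array so that each encoded "nr_pages" entry drops the right number of references in a single folios_put_refs() call. Condensed sketch, declarations as in the patch:

	struct folio_batch folios;
	unsigned int refs[PAGEVEC_SIZE];

	folio_batch_init(&folios);
	for (int i = 0; i < nr; i++) {
		struct folio *folio = page_folio(encoded_page_ptr(pages[i]));

		free_swap_cache(&folio->page);
		refs[folios.nr] = 1;	/* default: drop one reference */
		if (unlikely(encoded_page_flags(pages[i]) &
			     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
			refs[folios.nr] = encoded_nr_pages(pages[++i]);
		if (folio_batch_add(&folios, folio) == 0)
			folios_put_refs(&folios, refs);
	}
	if (folios.nr)
		folios_put_refs(&folios, refs);
]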
Link: https://lkml.kernel.org/r/20240227174254.710559-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 09e8de4da384..ff75b96ece5b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -251,21 +252,25 @@ void free_page_and_swap_cache(struct page *page) */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + lru_add_drain(); + folio_batch_init(&folios); for (int i = 0; i < nr; i++) { - struct page *page = encoded_page_ptr(pages[i]); + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); - /* - * Skip over the "nr_pages" entry. It's sufficient to call - * free_swap_cache() only once per folio. - */ + free_swap_cache(&folio->page); + refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - i++; + refs[folios.nr] = encoded_nr_pages(pages[++i]); - free_swap_cache(page); + if (folio_batch_add(&folios, folio) == 0) + folios_put_refs(&folios, refs); } - release_pages(pages, nr); + if (folios.nr) + folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) -- Gitee From 4f0d77ad1c2a4989cf51e37ed68e668eb3921530 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:51 +0000 Subject: [PATCH 217/484] mm: use a folio in __collapse_huge_page_copy_succeeded() commit d4111eecdc3c2a5eabafcc467dbfce0e216fa485 upstream Conflicts: none Backport-reason: Memory: more folios part 1 These pages are all chained together through the lru list, so we know they're folios. Use the folio APIs to save three hidden calls to compound_head(). 
Link: https://lkml.kernel.org/r/20240227174254.710559-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/khugepaged.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 962eb2d3cdb9..e5b7dcef4460 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -727,9 +727,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { - struct folio *src_folio; - struct page *src_page; - struct page *tmp; + struct folio *src, *tmp; pte_t *_pte; pte_t pteval; @@ -748,10 +746,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, ksm_might_unmap_zero_page(vma->vm_mm, pteval); } } else { - src_page = pte_page(pteval); - src_folio = page_folio(src_page); - if (!folio_test_large(src_folio)) - release_pte_folio(src_folio); + struct page *src_page = pte_page(pteval); + + src = page_folio(src_page); + if (!folio_test_large(src)) + release_pte_folio(src); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -759,20 +758,19 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); - folio_remove_rmap_pte(src_folio, src_page, vma); + folio_remove_rmap_pte(src, src_page, vma); spin_unlock(ptl); free_page_and_swap_cache(src_page); } } - list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { - list_del(&src_page->lru); - mod_node_page_state(page_pgdat(src_page), - NR_ISOLATED_ANON + page_is_file_lru(src_page), - -compound_nr(src_page)); - unlock_page(src_page); - free_swap_cache(src_page); - putback_lru_page(src_page); + list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { + list_del(&src->lru); + node_stat_sub_folio(src, NR_ISOLATED_ANON + + folio_is_file_lru(src)); + folio_unlock(src); + free_swap_cache(&src->page); + folio_putback_lru(src); } } -- Gitee From 1ed30d34a31977d8f6a781ccafabae00f2efb328 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:52 +0000 Subject: [PATCH 218/484] mm: convert free_swap_cache() to take a folio commit 63b774993dd02b17127cb404b7362fc436632995 upstream Conflicts: none Backport-reason: Memory: more folios part 1 All but one caller already has a folio, so convert free_page_and_swap_cache() to have a folio and remove the call to page_folio(). 
Link: https://lkml.kernel.org/r/20240227174254.710559-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 8 ++++---- mm/khugepaged.c | 2 +- mm/memory.c | 2 +- mm/swap_state.c | 12 ++++++------ 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 40fb4e600064..82d2d684e70a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -489,9 +489,9 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } -extern void free_swap_cache(struct page *page); -extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct encoded_page **, int); +void free_swap_cache(struct folio *folio); +void free_page_and_swap_cache(struct page *); +void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -571,7 +571,7 @@ static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) { } -static inline void free_swap_cache(struct page *page) +static inline void free_swap_cache(struct folio *folio) { } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e5b7dcef4460..70de487c6400 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -769,7 +769,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, node_stat_sub_folio(src, NR_ISOLATED_ANON + folio_is_file_lru(src)); folio_unlock(src); - free_swap_cache(&src->page); + free_swap_cache(src); folio_putback_lru(src); } } diff --git a/mm/memory.c b/mm/memory.c index c2e3bddde8f7..ba364eba8057 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3393,7 +3393,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) folio_put(new_folio); if (old_folio) { if (page_copied) - free_swap_cache(&old_folio->page); + free_swap_cache(old_folio); folio_put(old_folio); } diff --git a/mm/swap_state.c b/mm/swap_state.c index ff75b96ece5b..776eea26d92c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -224,10 +224,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, * folio_free_swap() _with_ the lock. 
* - Marcelo */ -void free_swap_cache(struct page *page) +void free_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); @@ -241,9 +239,11 @@ void free_swap_cache(struct page *page) */ void free_page_and_swap_cache(struct page *page) { - free_swap_cache(page); + struct folio *folio = page_folio(page); + + free_swap_cache(folio); if (!is_huge_zero_page(page)) - put_page(page); + folio_put(folio); } /* @@ -260,7 +260,7 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); - free_swap_cache(&folio->page); + free_swap_cache(folio); refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) -- Gitee From ecd82645e20ce8a6c3bb6c3c33e9fc1a1ae62958 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Mar 2024 19:18:34 +0000 Subject: [PATCH 219/484] mm: remove folio from deferred split list before uncharging it commit 47932e7048df9156e96133ee90fb3e9df68dbd15 upstream Conflicts: none Backport-reason: THP: fixing deferred list When freeing a large folio, we must remove it from the deferred split list before we uncharge it as each memcg has its own deferred split list (with associated lock) and removing a folio from the deferred split list while holding the wrong lock will corrupt that list and cause various related problems. Link: https://lore.kernel.org/linux-mm/367a14f7-340e-4b29-90ae-bc3fcefdd5f4@arm.com/ Link: https://lkml.kernel.org/r/20240311191835.312162-1-willy@infradead.org Fixes: f77171d241e3 (mm: allow non-hugetlb large folios to be batch processed) Fixes: 29f3843026cf (mm: free folios directly in move_folios_to_lru()) Fixes: bc2ff4cbc329 (mm: free folios in a batch in shrink_folio_list()) Signed-off-by: Matthew Wilcox (Oracle) Debugged-by: Ryan Roberts Tested-by: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.c | 3 +++ mm/vmscan.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/mm/swap.c b/mm/swap.c index f2c1f407fe1e..e843c6f454e1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1036,6 +1036,9 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) free_huge_folio(folio); continue; } + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); __page_cache_release(folio, &lruvec, &flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index ad7ff578c7bd..f7e8d59ffcea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1552,6 +1552,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, */ nr_reclaimed += nr_pages; + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); @@ -1939,6 +1942,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); -- Gitee From c9d9b243da17de5da04018bb913f3e65e907a995 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 6 Mar 2024 21:27:30 +0000 Subject: [PATCH 220/484] mm: fix 
list corruption in put_pages_list commit b555895c313511830762dbb2f469587a822c1759 upstream Conflicts: none Backport-reason: THP: fixing deferred list My recent change to put_pages_list() dereferences folio->lru.next after returning the folio to the page allocator. Usually this is now on the pcp list with other free folios, so we try to free an already-free folio. This only happens with lists that have more than 15 entries, so it wasn't immediately discovered. Revert to using list_for_each_safe() so we dereference lru.next before disposing of the folio. Link: https://lkml.kernel.org/r/20240306212749.1823380-1-willy@infradead.org Fixes: 24835f899c01 ("mm: use free_unref_folios() in put_pages_list()") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: "Borah, Chaitanya Kumar" Closes: https://lore.kernel.org/intel-gfx/SJ1PR11MB61292145F3B79DA58ADDDA63B9232@SJ1PR11MB6129.namprd11.prod.outlook.com/ Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index e843c6f454e1..015e11bed4c9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -138,10 +138,10 @@ EXPORT_SYMBOL(__folio_put); void put_pages_list(struct list_head *pages) { struct folio_batch fbatch; - struct folio *folio; + struct folio *folio, *next; folio_batch_init(&fbatch); - list_for_each_entry(folio, pages, lru) { + list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) continue; if (folio_test_large(folio)) { -- Gitee From 5dfa36ba9432a1ed91c99cba0df0fc8f8df6f38e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Apr 2024 19:00:36 +0100 Subject: [PATCH 221/484] mm: convert pagecache_isize_extended to use a folio commit 2ebe90dab9808a15e5d1c973e7d3ddaee05ddbd3 upstream Conflicts: none Backport-reason: More folios Remove four hidden calls to compound_head(). Also exit early if the filesystem block size is >= PAGE_SIZE instead of just equal to PAGE_SIZE. Link: https://lkml.kernel.org/r/20240405180038.2618624-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/truncate.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index de98ebc49a3c..a39a08e6547b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -774,15 +774,15 @@ EXPORT_SYMBOL(truncate_setsize); * @from: original inode size * @to: new inode size * - * Handle extension of inode size either caused by extending truncate or by - * write starting after current i_size. We mark the page straddling current - * i_size RO so that page_mkwrite() is called on the nearest write access to - * the page. This way filesystem can be sure that page_mkwrite() is called on - * the page before user writes to the page via mmap after the i_size has been - * changed. + * Handle extension of inode size either caused by extending truncate or + * by write starting after current i_size. We mark the page straddling + * current i_size RO so that page_mkwrite() is called on the first + * write access to the page. The filesystem will update its per-block + * information before user writes to the page via mmap after the i_size + * has been changed. * * The function must be called after i_size is updated so that page fault - * coming after we unlock the page will already see the new i_size. + * coming after we unlock the folio will already see the new i_size. 
* The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. @@ -791,31 +791,29 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { int bsize = i_blocksize(inode); loff_t rounded_from; - struct page *page; - pgoff_t index; + struct folio *folio; WARN_ON(to > inode->i_size); - if (from >= to || bsize == PAGE_SIZE) + if (from >= to || bsize >= PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; - index = from >> PAGE_SHIFT; - page = find_lock_page(inode->i_mapping, index); - /* Page not cached? Nothing to do */ - if (!page) + folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE); + /* Folio not cached? Nothing to do */ + if (IS_ERR(folio)) return; /* - * See clear_page_dirty_for_io() for details why set_page_dirty() + * See folio_clear_dirty_for_io() for details why folio_mark_dirty() * is needed. */ - if (page_mkclean(page)) - set_page_dirty(page); - unlock_page(page); - put_page(page); + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } EXPORT_SYMBOL(pagecache_isize_extended); -- Gitee From 3a1cf659ba34e915d4b3254e0191482c4753d0bd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Apr 2024 16:32:23 +0100 Subject: [PATCH 222/484] mm: free non-hugetlb large folios in a batch commit 2f166704290eadec480209cf28060f154184afe9 upstream Conflicts: none Backport-reason: More folios Patch series "Clean up __folio_put()". With all the changes over the last few years, __folio_put_small and __folio_put_large have become almost identical to each other ... except you can't tell because they're spread over two files. Rearrange it all so that you can tell, and then inline them both into __folio_put(). This patch (of 5): free_unref_folios() can now handle non-hugetlb large folios, so keep normal large folios in the batch. hugetlb folios still need to be handled specially. 
[peterx@redhat.com: fix panic] Link: https://lkml.kernel.org/r/ZikjPB0Dt5HA8-uL@x1n Link: https://lkml.kernel.org/r/20240405153228.2563754-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240405153228.2563754-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Xu Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 015e11bed4c9..616760460f75 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -144,8 +144,8 @@ void put_pages_list(struct list_head *pages) list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) continue; - if (folio_test_large(folio)) { - __folio_put_large(folio); + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ -- Gitee From d5abf6c0f0daf571b9bccfc0705a03e7a6926197 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 21 Mar 2024 14:25:32 -0400 Subject: [PATCH 223/484] mm: zswap: fix writeback shinker GFP_NOIO/GFP_NOFS recursion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 30fb6a8d9e3378919f378f9bf561142b4a6d2637 upstream Conflicts: none Backport-reason: ZSWAP mass update Kent forwards this bug report of zswap re-entering the block layer from an IO request allocation and locking up: [10264.128242] sysrq: Show Blocked State [10264.128268] task:kworker/20:0H state:D stack:0 pid:143 tgid:143 ppid:2 flags:0x00004000 [10264.128271] Workqueue: bcachefs_io btree_write_submit [bcachefs] [10264.128295] Call Trace: [10264.128295] [10264.128297] __schedule+0x3e6/0x1520 [10264.128303] schedule+0x32/0xd0 [10264.128304] schedule_timeout+0x98/0x160 [10264.128308] io_schedule_timeout+0x50/0x80 [10264.128309] wait_for_completion_io_timeout+0x7f/0x180 [10264.128310] submit_bio_wait+0x78/0xb0 [10264.128313] swap_writepage_bdev_sync+0xf6/0x150 [10264.128317] zswap_writeback_entry+0xf2/0x180 [10264.128319] shrink_memcg_cb+0xe7/0x2f0 [10264.128322] __list_lru_walk_one+0xb9/0x1d0 [10264.128325] list_lru_walk_one+0x5d/0x90 [10264.128326] zswap_shrinker_scan+0xc4/0x130 [10264.128327] do_shrink_slab+0x13f/0x360 [10264.128328] shrink_slab+0x28e/0x3c0 [10264.128329] shrink_one+0x123/0x1b0 [10264.128331] shrink_node+0x97e/0xbc0 [10264.128332] do_try_to_free_pages+0xe7/0x5b0 [10264.128333] try_to_free_pages+0xe1/0x200 [10264.128334] __alloc_pages_slowpath.constprop.0+0x343/0xde0 [10264.128337] __alloc_pages+0x32d/0x350 [10264.128338] allocate_slab+0x400/0x460 [10264.128339] ___slab_alloc+0x40d/0xa40 [10264.128345] kmem_cache_alloc+0x2e7/0x330 [10264.128348] mempool_alloc+0x86/0x1b0 [10264.128349] bio_alloc_bioset+0x200/0x4f0 [10264.128352] bio_alloc_clone+0x23/0x60 [10264.128354] alloc_io+0x26/0xf0 [dm_mod 7e9e6b44df4927f93fb3e4b5c782767396f58382] [10264.128361] dm_submit_bio+0xb8/0x580 [dm_mod 7e9e6b44df4927f93fb3e4b5c782767396f58382] [10264.128366] __submit_bio+0xb0/0x170 [10264.128367] submit_bio_noacct_nocheck+0x159/0x370 [10264.128368] bch2_submit_wbio_replicas+0x21c/0x3a0 [bcachefs 85f1b9a7a824f272eff794653a06dde1a94439f2] [10264.128391] btree_write_submit+0x1cf/0x220 [bcachefs 85f1b9a7a824f272eff794653a06dde1a94439f2] [10264.128406] process_one_work+0x178/0x350 [10264.128408] worker_thread+0x30f/0x450 [10264.128409] kthread+0xe5/0x120 The zswap shrinker resumes the swap_writepage()s that were intercepted by the zswap store. 
This will enter the block layer, and may even enter the filesystem depending on the swap backing file. Make it respect GFP_NOIO and GFP_NOFS. Link: https://lore.kernel.org/linux-mm/rc4pk2r42oyvjo4dc62z6sovquyllq56i5cdgcaqbd7wy3hfzr@n4nbxido3fme/ Link: https://lkml.kernel.org/r/20240321182532.60000-1-hannes@cmpxchg.org Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Signed-off-by: Johannes Weiner Reported-by: Kent Overstreet Acked-by: Yosry Ahmed Reported-by: Jérôme Poulin Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Cc: stable@vger.kernel.org [v6.8] Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index 4db554cbcff2..5b91acd14439 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1317,6 +1317,14 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; + /* + * The shrinker resumes swap writeback, which will enter block + * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS + * rules (may_enter_fs()), which apply on a per-folio basis. + */ + if (!gfp_has_io_fs(sc->gfp_mask)) + return 0; + #ifdef CONFIG_MEMCG_KMEM mem_cgroup_flush_stats(memcg); nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; -- Gitee From efa54230a556c171ada20b95adf3861ea5979480 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 5 Mar 2024 07:53:45 +0000 Subject: [PATCH 224/484] mm/zswap: global lru and shrinker shared by all zswap_pools fix commit e35606e4167dd55d48aa6232c074137577818add upstream Conflicts: none Backport-reason: ZSWAP mass update Commit bf9b7df23cb3 ("mm/zswap: global lru and shrinker shared by all zswap_pools") introduced a new lock to protect zswap_next_shrink, instead of reusing zswap_pools_lock. But the problem is that it's initialized only when zswap enabled, which causes bug if zswap_memcg_offline_cleanup() called without zswap enabled. Fix it by using DEFINE_SPINLOCK() to statically initialize them and define them as multiple static variables to keep in consistent with the existing global variables in zswap. Link: https://lkml.kernel.org/r/20240305075345.1493214-1-chengming.zhou@linux.dev Fixes: bf9b7df23cb3 ("mm/zswap: global lru and shrinker shared by all zswap_pools") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403051008.a8cf8a94-lkp@intel.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 77 +++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5b91acd14439..78e97e237da2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -180,15 +180,16 @@ struct zswap_pool { char tfm_name[CRYPTO_MAX_ALG_NAME]; }; -static struct { - struct list_lru list_lru; - atomic_t nr_stored; - struct shrinker *shrinker; - struct work_struct shrink_work; - struct mem_cgroup *next_shrink; - /* The lock protects next_shrink. */ - spinlock_t shrink_lock; -} zswap; +/* Global LRU lists shared by all zswap pools. */ +static struct list_lru zswap_list_lru; +/* counter of pages stored in all zswap pools. */ +static atomic_t zswap_nr_stored = ATOMIC_INIT(0); + +/* The lock protects zswap_next_shrink updates. 
*/ +static DEFINE_SPINLOCK(zswap_shrink_lock); +static struct mem_cgroup *zswap_next_shrink; +static struct work_struct zswap_shrink_work; +static struct shrinker *zswap_shrinker; /* * struct zswap_entry @@ -798,10 +799,10 @@ void zswap_folio_swapin(struct folio *folio) void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { /* lock out zswap shrinker walking memcg tree */ - spin_lock(&zswap.shrink_lock); - if (zswap.next_shrink == memcg) - zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); - spin_unlock(&zswap.shrink_lock); + spin_lock(&zswap_shrink_lock); + if (zswap_next_shrink == memcg) + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + spin_unlock(&zswap_shrink_lock); } /********************************* @@ -900,9 +901,9 @@ static void zswap_entry_free(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - zswap_lru_del(&zswap.list_lru, entry); + zswap_lru_del(&zswap_list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&zswap.nr_stored); + atomic_dec(&zswap_nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -1281,7 +1282,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&zswap.list_lru, sc); + lru_size = list_lru_shrink_count(&zswap_list_lru, sc); /* * Abort if we are shrinking into the protected region. @@ -1298,7 +1299,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - shrink_ret = list_lru_shrink_walk(&zswap.list_lru, sc, &shrink_memcg_cb, + shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); if (encountered_page_in_swapcache) @@ -1332,7 +1333,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, #else /* use pool stats instead of memcg stats */ nr_backing = zswap_pool_total_size >> PAGE_SHIFT; - nr_stored = atomic_read(&zswap.nr_stored); + nr_stored = atomic_read(&zswap_nr_stored); #endif if (!nr_stored) @@ -1340,7 +1341,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc); + nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); /* * Subtract the lru size by an estimate of the number of pages * that should be protected. @@ -1389,7 +1390,7 @@ static int shrink_memcg(struct mem_cgroup *memcg) for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; - shrunk += list_lru_walk_one(&zswap.list_lru, nid, memcg, + shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); } return shrunk ? 0 : -EAGAIN; @@ -1402,9 +1403,9 @@ static void shrink_worker(struct work_struct *w) /* global reclaim will select cgroup in a round-robin fashion. */ do { - spin_lock(&zswap.shrink_lock); - zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); - memcg = zswap.next_shrink; + spin_lock(&zswap_shrink_lock); + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + memcg = zswap_next_shrink; /* * We need to retry if we have gone through a full round trip, or if we @@ -1418,7 +1419,7 @@ static void shrink_worker(struct work_struct *w) * memcg is not killed when we are reclaiming. 
*/ if (!memcg) { - spin_unlock(&zswap.shrink_lock); + spin_unlock(&zswap_shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; @@ -1428,15 +1429,15 @@ static void shrink_worker(struct work_struct *w) if (!mem_cgroup_tryget_online(memcg)) { /* drop the reference from mem_cgroup_iter() */ mem_cgroup_iter_break(NULL, memcg); - zswap.next_shrink = NULL; - spin_unlock(&zswap.shrink_lock); + zswap_next_shrink = NULL; + spin_unlock(&zswap_shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } - spin_unlock(&zswap.shrink_lock); + spin_unlock(&zswap_shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ @@ -1557,7 +1558,7 @@ bool zswap_store(struct folio *folio) if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &zswap.list_lru, GFP_KERNEL)) { + if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } @@ -1588,8 +1589,8 @@ bool zswap_store(struct folio *folio) } if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&zswap.list_lru, entry); - atomic_inc(&zswap.nr_stored); + zswap_lru_add(&zswap_list_lru, entry); + atomic_inc(&zswap_nr_stored); } spin_unlock(&tree->lock); @@ -1621,7 +1622,7 @@ bool zswap_store(struct folio *folio) return false; shrink: - queue_work(shrink_wq, &zswap.shrink_work); + queue_work(shrink_wq, &zswap_shrink_work); goto reject; } @@ -1788,16 +1789,14 @@ static int zswap_setup(void) if (!shrink_wq) goto shrink_wq_fail; - zswap.shrinker = zswap_alloc_shrinker(); - if (!zswap.shrinker) + zswap_shrinker = zswap_alloc_shrinker(); + if (!zswap_shrinker) goto shrinker_fail; - if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker)) + if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker)) goto lru_fail; - shrinker_register(zswap.shrinker); + shrinker_register(zswap_shrinker); - INIT_WORK(&zswap.shrink_work, shrink_worker); - atomic_set(&zswap.nr_stored, 0); - spin_lock_init(&zswap.shrink_lock); + INIT_WORK(&zswap_shrink_work, shrink_worker); pool = __zswap_pool_create_fallback(); if (pool) { @@ -1816,7 +1815,7 @@ static int zswap_setup(void) return 0; lru_fail: - shrinker_free(zswap.shrinker); + shrinker_free(zswap_shrinker); shrinker_fail: destroy_workqueue(shrink_wq); shrink_wq_fail: -- Gitee From be72f8f3f285914d57d1ea39f971f0bf07a8b2dc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 18 Apr 2024 08:26:28 -0400 Subject: [PATCH 225/484] mm: zswap: fix shrinker NULL crash with cgroup_disable=memory commit 682886ec69d22363819a83ddddd5d66cb5c791e1 upstream Conflicts: none Backport-reason: ZSWAP mass update Christian reports a NULL deref in zswap that he bisected down to the zswap shrinker. The issue also cropped up in the bug trackers of libguestfs [1] and the Red Hat bugzilla [2]. The problem is that when memcg is disabled with the boot time flag, the zswap shrinker might get called with sc->memcg == NULL. This is okay in many places, like the lruvec operations. But it crashes in memcg_page_state() - which is only used due to the non-node accounting of cgroup's the zswap memory to begin with. Nhat spotted that the memcg can be NULL in the memcg-disabled case, and I was then able to reproduce the crash locally as well. 
[1] https://github.com/libguestfs/libguestfs/issues/139 [2] https://bugzilla.redhat.com/show_bug.cgi?id=2275252 Link: https://lkml.kernel.org/r/20240418124043.GC1055428@cmpxchg.org Link: https://lkml.kernel.org/r/20240417143324.GA1055428@cmpxchg.org Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Signed-off-by: Johannes Weiner Reported-by: Christian Heusel Debugged-by: Nhat Pham Suggested-by: Nhat Pham Tested-by: Christian Heusel Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Dan Streetman Cc: Richard W.M. Jones Cc: Seth Jennings Cc: Vitaly Wool Cc: [v6.8] Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 78e97e237da2..0f02765d6ad1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1326,15 +1326,22 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!gfp_has_io_fs(sc->gfp_mask)) return 0; -#ifdef CONFIG_MEMCG_KMEM - mem_cgroup_flush_stats(memcg); - nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; - nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); -#else - /* use pool stats instead of memcg stats */ - nr_backing = zswap_pool_total_size >> PAGE_SHIFT; - nr_stored = atomic_read(&zswap_nr_stored); -#endif + /* + * For memcg, use the cgroup-wide ZSWAP stats since we don't + * have them per-node and thus per-lruvec. Careful if memcg is + * runtime-disabled: we can get sc->memcg == NULL, which is ok + * for the lruvec, but not for memcg_page_state(). + * + * Without memcg, use the zswap pool-wide metrics. + */ + if (!mem_cgroup_disabled()) { + mem_cgroup_flush_stats(memcg); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); + } else { + nr_backing = zswap_pool_total_size >> PAGE_SHIFT; + nr_stored = atomic_read(&zswap_nr_stored); + } if (!nr_stored) return 0; -- Gitee From 2849d399cbaba603c0ff90c18b417f01335d7185 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 16 Mar 2024 01:58:03 +0000 Subject: [PATCH 226/484] mm: memcg: add NULL check to obj_cgroup_put() commit 91b71e78b8e478e1a9af36b15ff1d38810572866 upstream Conflicts: trivial Backport-reason: Backport this clean up first to make the code more robust. 9 out of 16 callers perform a NULL check before calling obj_cgroup_put(). Move the NULL check in the function, similar to mem_cgroup_put(). The unlikely() NULL check in current_objcg_update() was left alone to avoid dropping the unlikey() annotation as this a fast path. 
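[Backport editor's note, illustrative only and not part of the upstream commit message: with the NULL check folded into obj_cgroup_put(), call sites simplify from

	if (objcg)
		obj_cgroup_put(objcg);

to a bare

	obj_cgroup_put(objcg);

matching the existing behaviour of mem_cgroup_put().]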
Link: https://lkml.kernel.org/r/20240316015803.2777252-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 3 ++- kernel/bpf/memalloc.c | 6 ++---- mm/memcontrol.c | 9 +++------ mm/zswap.c | 3 +-- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 49de8d2790eb..94b94dcdfc4b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -986,7 +986,8 @@ static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, static inline void obj_cgroup_put(struct obj_cgroup *objcg) { - percpu_ref_put(&objcg->refcnt); + if (objcg) + percpu_ref_put(&objcg->refcnt); } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 85f9501ff6e6..71c168763948 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -691,8 +691,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } /* objcg is the same across cpus */ - if (c->objcg) - obj_cgroup_put(c->objcg); + obj_cgroup_put(c->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -708,8 +707,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (c->objcg) - obj_cgroup_put(c->objcg); + obj_cgroup_put(c->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7fb9c6050baa..e7fc3b86e468 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2622,8 +2622,7 @@ static void drain_local_stock(struct work_struct *dummy) clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); } /* @@ -3828,8 +3827,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, mod_objcg_mlstate(objcg, pgdat, idx, nr); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3956,8 +3954,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - if (old) - obj_cgroup_put(old); + obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); diff --git a/mm/zswap.c b/mm/zswap.c index 0f02765d6ad1..c39fa52bed44 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1613,8 +1613,7 @@ bool zswap_store(struct folio *folio) freepage: zswap_entry_cache_free(entry); reject: - if (objcg) - obj_cgroup_put(objcg); + obj_cgroup_put(objcg); check_old: /* * If the zswap store fails or zswap is disabled, we must invalidate the -- Gitee From e4fdf9a68236b6e41c9097f770c0dd3c34cde920 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 12 Mar 2024 11:34:11 -0400 Subject: [PATCH 227/484] mm: zswap: optimize zswap pool size tracking commit 91cdcd8d624bfdf05e8db9d572759516f3c786b8 upstream Conflicts: none Backport-reason: ZSWAP mass update Profiling the munmap() of a zswapped memory region shows 60% of the total cycles currently going into updating the zswap_pool_total_size. 
There are three consumers of this counter: - store, to enforce the globally configured pool limit - meminfo & debugfs, to report the size to the user - shrink, to determine the batch size for each cycle Instead of aggregating everytime an entry enters or exits the zswap pool, aggregate the value from the zpools on-demand: - Stores aggregate the counter anyway upon success. Aggregating to check the limit instead is the same amount of work. - Meminfo & debugfs might benefit somewhat from a pre-aggregated counter, but aren't exactly hotpaths. - Shrinking can aggregate once for every cycle instead of doing it for every freed entry. As the shrinker might work on tens or hundreds of objects per scan cycle, this is a large reduction in aggregations. The paths that benefit dramatically are swapin, swapoff, and unmaps. There could be millions of pages being processed until somebody asks for the pool size again. This eliminates the pool size updates from those paths entirely. Top profile entries for a 24G range munmap(), before: 38.54% zswap-unmap [kernel.kallsyms] [k] zs_zpool_total_size 12.51% zswap-unmap [kernel.kallsyms] [k] zpool_get_total_size 9.10% zswap-unmap [kernel.kallsyms] [k] zswap_update_total_size 2.95% zswap-unmap [kernel.kallsyms] [k] obj_cgroup_uncharge_zswap 2.88% zswap-unmap [kernel.kallsyms] [k] __slab_free 2.86% zswap-unmap [kernel.kallsyms] [k] xas_store and after: 7.70% zswap-unmap [kernel.kallsyms] [k] __slab_free 7.16% zswap-unmap [kernel.kallsyms] [k] obj_cgroup_uncharge_zswap 6.74% zswap-unmap [kernel.kallsyms] [k] xas_store It was also briefly considered to move to a single atomic in zswap that is updated by the backends, since zswap only cares about the sum of all pools anyway. However, zram directly needs per-pool information out of zsmalloc. To keep the backend from having to update two atomics every time, I opted for the lazy aggregation instead for now. 
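[Backport editor's note, not part of the upstream commit message: the on-demand aggregation introduced here walks the zpools under RCU; re-indented from the hunk below as a sketch, with declarations omitted:

	rcu_read_lock();
	list_for_each_entry_rcu(pool, &zswap_pools, list)
		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
			total += zpool_get_total_size(pool->zpools[i]);
	rcu_read_unlock();

	return total >> PAGE_SHIFT;

The store path aggregates once per store to check the global limit, meminfo/debugfs aggregate on read, and the shrinker aggregates once per shrink cycle, replacing the per-entry updates of the old zswap_pool_total_size counter.]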
Link: https://lkml.kernel.org/r/20240312153901.3441-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/proc/meminfo.c | 3 +- include/linux/zswap.h | 2 +- mm/zswap.c | 101 +++++++++++++++++++++--------------------- 3 files changed, 52 insertions(+), 54 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index f691080d979e..7bad4ce3a1d0 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -91,8 +91,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); #ifdef CONFIG_ZSWAP - seq_printf(m, "Zswap: %8lu kB\n", - (unsigned long)(zswap_pool_total_size >> 10)); + show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", (unsigned long)atomic_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 341aea490070..2a85b941db97 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -7,7 +7,6 @@ struct lruvec; -extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP @@ -27,6 +26,7 @@ struct zswap_lruvec_state { atomic_long_t nr_zswap_protected; }; +unsigned long zswap_total_pages(void); bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(swp_entry_t swp); diff --git a/mm/zswap.c b/mm/zswap.c index c39fa52bed44..4d8bd62c4ff0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,8 +43,6 @@ /********************************* * statistics **********************************/ -/* Total bytes used by the compressed storage */ -u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* The number of same-value filled pages currently stored in zswap */ @@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static bool zswap_is_full(void) -{ - return totalram_pages() * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static bool zswap_can_accept(void) -{ - return totalram_pages() * zswap_accept_thr_percent / 100 * - zswap_max_pool_percent / 100 > - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); -} - -static u64 get_zswap_pool_size(struct zswap_pool *pool) -{ - u64 pool_size = 0; - int i; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - pool_size += zpool_get_total_size(pool->zpools[i]); - - return pool_size; -} - -static void zswap_update_total_size(void) -{ - struct zswap_pool *pool; - u64 total = 0; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - total += get_zswap_pool_size(pool); - - rcu_read_unlock(); - - zswap_pool_total_size = total; -} - /********************************* * pool functions **********************************/ @@ -540,6 +499,33 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +static unsigned long zswap_max_pages(void) +{ + return totalram_pages() * zswap_max_pool_percent / 100; +} + +static unsigned long zswap_accept_thr_pages(void) +{ + return zswap_max_pages() * zswap_accept_thr_percent / 100; +} + +unsigned long zswap_total_pages(void) +{ + struct zswap_pool *pool; + u64 total = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &zswap_pools, list) { + int i; + + for (i = 0; 
i < ZSWAP_NR_ZPOOLS; i++) + total += zpool_get_total_size(pool->zpools[i]); + } + rcu_read_unlock(); + + return total >> PAGE_SHIFT; +} + /********************************* * param callbacks **********************************/ @@ -912,7 +898,6 @@ static void zswap_entry_free(struct zswap_entry *entry) } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_update_total_size(); } /* @@ -1339,7 +1324,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); } else { - nr_backing = zswap_pool_total_size >> PAGE_SHIFT; + nr_backing = zswap_total_pages(); nr_stored = atomic_read(&zswap_nr_stored); } @@ -1407,6 +1392,10 @@ static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; int ret, failures = 0; + unsigned long thr; + + /* Reclaim down to the accept threshold */ + thr = zswap_accept_thr_pages(); /* global reclaim will select cgroup in a round-robin fashion. */ do { @@ -1454,10 +1443,9 @@ static void shrink_worker(struct work_struct *w) break; if (ret && ++failures == MAX_RECLAIM_RETRIES) break; - resched: cond_resched(); - } while (!zswap_can_accept()); + } while (zswap_total_pages() > thr); } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) @@ -1498,6 +1486,7 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; + unsigned long max_pages, cur_pages; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1509,6 +1498,7 @@ bool zswap_store(struct folio *folio) if (!zswap_enabled) goto check_old; + /* Check cgroup limits */ objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { memcg = get_mem_cgroup_from_objcg(objcg); @@ -1519,15 +1509,18 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* reclaim space if needed */ - if (zswap_is_full()) { + /* Check global limits */ + cur_pages = zswap_total_pages(); + max_pages = zswap_max_pages(); + + if (cur_pages >= max_pages) { zswap_pool_limit_hit++; zswap_pool_reached_full = true; goto shrink; } if (zswap_pool_reached_full) { - if (!zswap_can_accept()) + if (cur_pages > zswap_accept_thr_pages()) goto shrink; else zswap_pool_reached_full = false; @@ -1603,7 +1596,6 @@ bool zswap_store(struct folio *folio) /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_update_total_size(); count_vm_event(ZSWPOUT); return true; @@ -1732,6 +1724,13 @@ void zswap_swapoff(int type) static struct dentry *zswap_debugfs_root; +static int debugfs_get_total_size(void *data, u64 *val) +{ + *val = zswap_total_pages() * PAGE_SIZE; + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1753,8 +1752,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("pool_total_size", 0444, - zswap_debugfs_root, &zswap_pool_total_size); + debugfs_create_file("pool_total_size", 0444, + zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); debugfs_create_atomic_t("same_filled_pages", 0444, -- Gitee From 289b3cdb691a91c0f9be9507168da788a8d99924 Mon Sep 17 00:00:00 2001 
From: Johannes Weiner Date: Tue, 12 Mar 2024 11:34:12 -0400 Subject: [PATCH 228/484] mm: zpool: return pool size in pages commit 4196b48ddd382419e51658bf94a25af195ba9450 upstream Conflicts: none Backport-reason: ZSWAP mass update All zswap backends track their pool sizes in pages. Currently they multiply by PAGE_SIZE for zswap, only for zswap to divide again in order to do limit math. Report pages directly. Link: https://lkml.kernel.org/r/20240312153901.3441-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zpool.h | 4 ++-- mm/z3fold.c | 10 +++++----- mm/zbud.c | 10 +++++----- mm/zpool.c | 10 +++++----- mm/zsmalloc.c | 6 +++--- mm/zswap.c | 6 +++--- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 3296438eec06..a67d62b79698 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -53,7 +53,7 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle, void zpool_unmap_handle(struct zpool *pool, unsigned long handle); -u64 zpool_get_total_size(struct zpool *pool); +u64 zpool_get_total_pages(struct zpool *pool); /** @@ -91,7 +91,7 @@ struct zpool_driver { enum zpool_mapmode mm); void (*unmap)(void *pool, unsigned long handle); - u64 (*total_size)(void *pool); + u64 (*total_pages)(void *pool); }; void zpool_register_driver(struct zpool_driver *driver); diff --git a/mm/z3fold.c b/mm/z3fold.c index 7c76b396b74c..785a2394e96b 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1236,12 +1236,12 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) } /** - * z3fold_get_pool_size() - gets the z3fold pool size in pages + * z3fold_get_pool_pages() - gets the z3fold pool size in pages * @pool: pool whose size is being queried * * Returns: size in pages of the given pool. */ -static u64 z3fold_get_pool_size(struct z3fold_pool *pool) +static u64 z3fold_get_pool_pages(struct z3fold_pool *pool) { return atomic64_read(&pool->pages_nr); } @@ -1401,9 +1401,9 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle) z3fold_unmap(pool, handle); } -static u64 z3fold_zpool_total_size(void *pool) +static u64 z3fold_zpool_total_pages(void *pool) { - return z3fold_get_pool_size(pool) * PAGE_SIZE; + return z3fold_get_pool_pages(pool); } static struct zpool_driver z3fold_zpool_driver = { @@ -1416,7 +1416,7 @@ static struct zpool_driver z3fold_zpool_driver = { .free = z3fold_zpool_free, .map = z3fold_zpool_map, .unmap = z3fold_zpool_unmap, - .total_size = z3fold_zpool_total_size, + .total_pages = z3fold_zpool_total_pages, }; MODULE_ALIAS("zpool-z3fold"); diff --git a/mm/zbud.c b/mm/zbud.c index 2190cc1f37b3..e9836fff9438 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -365,13 +365,13 @@ static void zbud_unmap(struct zbud_pool *pool, unsigned long handle) } /** - * zbud_get_pool_size() - gets the zbud pool size in pages + * zbud_get_pool_pages() - gets the zbud pool size in pages * @pool: pool whose size is being queried * * Returns: size in pages of the given pool. The pool lock need not be * taken to access pages_nr. 
*/ -static u64 zbud_get_pool_size(struct zbud_pool *pool) +static u64 zbud_get_pool_pages(struct zbud_pool *pool) { return pool->pages_nr; } @@ -410,9 +410,9 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle) zbud_unmap(pool, handle); } -static u64 zbud_zpool_total_size(void *pool) +static u64 zbud_zpool_total_pages(void *pool) { - return zbud_get_pool_size(pool) * PAGE_SIZE; + return zbud_get_pool_pages(pool); } static struct zpool_driver zbud_zpool_driver = { @@ -425,7 +425,7 @@ static struct zpool_driver zbud_zpool_driver = { .free = zbud_zpool_free, .map = zbud_zpool_map, .unmap = zbud_zpool_unmap, - .total_size = zbud_zpool_total_size, + .total_pages = zbud_zpool_total_pages, }; MODULE_ALIAS("zpool-zbud"); diff --git a/mm/zpool.c b/mm/zpool.c index 846410479c2f..b9fda1fa857d 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -321,16 +321,16 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) } /** - * zpool_get_total_size() - The total size of the pool + * zpool_get_total_pages() - The total size of the pool * @zpool: The zpool to check * - * This returns the total size in bytes of the pool. + * This returns the total size in pages of the pool. * - * Returns: Total size of the zpool in bytes. + * Returns: Total size of the zpool in pages. */ -u64 zpool_get_total_size(struct zpool *zpool) +u64 zpool_get_total_pages(struct zpool *zpool) { - return zpool->driver->total_size(zpool->pool); + return zpool->driver->total_pages(zpool->pool); } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c743ce7a5f49..9aeeea4e14b9 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -405,9 +405,9 @@ static void zs_zpool_unmap(void *pool, unsigned long handle) zs_unmap_object(pool, handle); } -static u64 zs_zpool_total_size(void *pool) +static u64 zs_zpool_total_pages(void *pool) { - return zs_get_total_pages(pool) << PAGE_SHIFT; + return zs_get_total_pages(pool); } static struct zpool_driver zs_zpool_driver = { @@ -420,7 +420,7 @@ static struct zpool_driver zs_zpool_driver = { .free = zs_zpool_free, .map = zs_zpool_map, .unmap = zs_zpool_unmap, - .total_size = zs_zpool_total_size, + .total_pages = zs_zpool_total_pages, }; MODULE_ALIAS("zpool-zsmalloc"); diff --git a/mm/zswap.c b/mm/zswap.c index 4d8bd62c4ff0..319055a7bb0a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -512,18 +512,18 @@ static unsigned long zswap_accept_thr_pages(void) unsigned long zswap_total_pages(void) { struct zswap_pool *pool; - u64 total = 0; + unsigned long total = 0; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) { int i; for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - total += zpool_get_total_size(pool->zpools[i]); + total += zpool_get_total_pages(pool->zpools[i]); } rcu_read_unlock(); - return total >> PAGE_SHIFT; + return total; } /********************************* -- Gitee From 34934f3c0b25693d78a7aa642f86d1969d706e82 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Mon, 11 Mar 2024 23:52:10 +0000 Subject: [PATCH 229/484] mm: zswap: remove unnecessary check in zswap_find_zpool() commit fea68a75651cfc3458318416a1a2b45161cd22c2 upstream Conflicts: none Backport-reason: ZSWAP mass update zswap_find_zpool() checks if ZSWAP_NR_ZPOOLS > 1, which is always true. This is a remnant from a patch version that had ZSWAP_NR_ZPOOLS as a config option and never made it upstream. Remove the unnecessary check. 
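As a rough illustration of the selection that remains after this cleanup (a userspace sketch only, not the kernel code): the entry pointer is hashed into one of a power-of-two number of pools. NR_POOLS and pick_pool() below are invented names, and the multiplicative hash is merely a stand-in for the kernel's hash_ptr().

  #include <stdio.h>
  #include <stdint.h>

  #define NR_POOLS 32	/* stand-in for ZSWAP_NR_ZPOOLS; always > 1 */

  /* Toy multiplicative pointer hash, illustration only (not hash_ptr()). */
  static unsigned int pick_pool(const void *entry)
  {
  	uintptr_t v = (uintptr_t)entry;

  	/* keep the top ilog2(NR_POOLS) bits of the 64-bit product */
  	return (unsigned int)((v * 0x9E3779B97F4A7C15ULL) >> 59) & (NR_POOLS - 1);
  }

  int main(void)
  {
  	int dummy[4];

  	for (int i = 0; i < 4; i++)
  		printf("entry %p -> pool %u\n",
  		       (void *)&dummy[i], pick_pool(&dummy[i]));
  	return 0;
  }
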
Link: https://lkml.kernel.org/r/20240311235210.2937484-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 319055a7bb0a..cb5382ac1ea3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -870,12 +870,7 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { - int i = 0; - - if (ZSWAP_NR_ZPOOLS > 1) - i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); - - return entry->pool->zpools[i]; + return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))]; } /* -- Gitee From 9acc9f74aca4baadadf82d2e73679f68632c3718 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 22 Mar 2024 00:10:01 +0000 Subject: [PATCH 230/484] mm: zswap: remove nr_zswap_stored atomic commit cc9bc36ebef70f3437caf712574478d5b95b25d4 upstream Conflicts: none Backport-reason: ZSWAP mass update nr_stored was introduced by commit b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") as a per zswap_pool counter of the number of stored pages that are not same-filled pages. It is used in zswap_shrinker_count() to scale the number of freeable compressed pages by the compression ratio. That is, to reduce the amount of writeback from zswap with higher compression ratios as the ROI from IO diminishes. Later on, commit bf9b7df23cb3 ("mm/zswap: global lru and shrinker shared by all zswap_pools") made the shrinker global (not per zswap_pool), and replaced nr_stored with nr_zswap_stored (initially introduced as zswap.nr_stored), which is now a global counter. The counter is now awfully close to zswap_stored_pages. The only difference is that the latter also includes same-filled pages. Also, when memcgs are enabled, we use memcg_page_state(memcg, MEMCG_ZSWAPPED), which includes same-filled pages anyway (i.e. equivalent to zswap_stored_pages). Use zswap_stored_pages instead in zswap_shrinker_count() to keep things consistent whether memcgs are enabled or not, and add a comment about the number of freeable pages possibly being scaled down more than it should if we have lots of same-filled pages (i.e. inflated compression ratio). Remove nr_zswap_stored and one atomic operation in the store and free paths. Link: https://lkml.kernel.org/r/20240322001001.1562517-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index cb5382ac1ea3..5bc80334cd41 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -180,8 +180,6 @@ struct zswap_pool { /* Global LRU lists shared by all zswap pools. */ static struct list_lru zswap_list_lru; -/* counter of pages stored in all zswap pools. */ -static atomic_t zswap_nr_stored = ATOMIC_INIT(0); /* The lock protects zswap_next_shrink updates. 
*/ static DEFINE_SPINLOCK(zswap_shrink_lock); @@ -884,7 +882,6 @@ static void zswap_entry_free(struct zswap_entry *entry) else { zswap_lru_del(&zswap_list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&zswap_nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -1320,7 +1317,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); } else { nr_backing = zswap_total_pages(); - nr_stored = atomic_read(&zswap_nr_stored); + nr_stored = atomic_read(&zswap_stored_pages); } if (!nr_stored) @@ -1340,6 +1337,11 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). + * + * The memory saving factor calculated here takes same-filled pages into + * account, but those are not freeable since they almost occupy no + * space. Hence, we may scale nr_freeable down a little bit more than we + * should if we have a lot of same-filled pages. */ return mult_frac(nr_freeable, nr_backing, nr_stored); } @@ -1585,7 +1587,6 @@ bool zswap_store(struct folio *folio) if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&zswap_list_lru, entry); - atomic_inc(&zswap_nr_stored); } spin_unlock(&tree->lock); -- Gitee From 873e8b8c9955b782f6b6e10573768ee03fae665d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 30 Oct 2023 09:11:47 +0800 Subject: [PATCH 231/484] mm: huge_memory: batch tlb flush when splitting a pte-mapped THP commit 3027c6f8eb9d3857aef08f401aeb7de715410525 upstream Conflicts: none Backport-reason: mTHP batch TLB flush I can observe an obvious tlb flush hotspot when splitting a pte-mapped THP on my ARM64 server, and the distribution of this hotspot is as follows: - 16.85% split_huge_page_to_list + 7.80% down_write - 7.49% try_to_migrate - 7.48% rmap_walk_anon 7.23% ptep_clear_flush + 1.52% __split_huge_page The reason is that the split_huge_page_to_list() will build migration entries for each subpage of a pte-mapped Anon THP by try_to_migrate(), or unmap for file THP, and it will clear and tlb flush for each subpage's pte. Moreover, the split_huge_page_to_list() will set TTU_SPLIT_HUGE_PMD flag to ensure the THP is already a pte-mapped THP before splitting it to some normal pages. Actually, there is no need to flush tlb for each subpage immediately, instead we can batch tlb flush for the pte-mapped THP to improve the performance. After this patch, we can see the batch tlb flush can improve the latency obviously when running thpscale. 
k6.5-base patched Amean fault-both-1 1071.17 ( 0.00%) 901.83 * 15.81%* Amean fault-both-3 2386.08 ( 0.00%) 1865.32 * 21.82%* Amean fault-both-5 2851.10 ( 0.00%) 2273.84 * 20.25%* Amean fault-both-7 3679.91 ( 0.00%) 2881.66 * 21.69%* Amean fault-both-12 5916.66 ( 0.00%) 4369.55 * 26.15%* Amean fault-both-18 7981.36 ( 0.00%) 6303.57 * 21.02%* Amean fault-both-24 10950.79 ( 0.00%) 8752.56 * 20.07%* Amean fault-both-30 14077.35 ( 0.00%) 10170.01 * 27.76%* Amean fault-both-32 13061.57 ( 0.00%) 11630.08 * 10.96%* Link: https://lkml.kernel.org/r/431d9fb6823036369dcb1d3b2f63732f01df21a7.1698488264.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index af946fe73095..47e3b9c6f8e0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2759,7 +2759,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_folio(struct folio *folio) { enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | - TTU_SYNC; + TTU_SYNC | TTU_BATCH_FLUSH; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); @@ -2772,6 +2772,8 @@ static void unmap_folio(struct folio *folio) try_to_migrate(folio, ttu_flags); else try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); + + try_to_unmap_flush(); } static void remap_page(struct folio *folio, unsigned long nr) -- Gitee From e504a641a3b37fa31d70f534be38677be32c5f17 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:27 -0500 Subject: [PATCH 232/484] mm/huge_memory: only split PMD mapping when necessary in unmap_folio() commit 319a624ec2b79db7a0b0a2a2a61e3aa5c96eabfc upstream Conflicts: none Backport-reason: mTHP splitting Patch series "Split a folio to any lower order folios", v5. File folio supports any order and multi-size THP is upstreamed[1], so both file and anonymous folios can be >0 order. Currently, split_huge_page() only splits a huge page to order-0 pages, but splitting to orders higher than 0 might better utilize large folios, if done properly. In addition, Large Block Sizes in XFS support would benefit from it during truncate[2]. This patchset adds support for splitting a large folio to any lower order folios. In addition to this implementation of split_huge_page_to_list_to_order(), a possible optimization could be splitting a large folio to arbitrary smaller folios instead of a single order. As both Hugh and Ryan pointed out [3,5] that split to a single order might not be optimal, an order-9 folio might be better split into 1 order-8, 1 order-7, ..., 1 order-1, and 2 order-0 folios, depending on subsequent folio operations. Leave this as future work. [1] https://lore.kernel.org/all/20231207161211.2374093-1-ryan.roberts@arm.com/ [2] https://lore.kernel.org/linux-mm/20240226094936.2677493-1-kernel@pankajraghav.com/ [3] https://lore.kernel.org/linux-mm/9dd96da-efa2-5123-20d4-4992136ef3ad@google.com/ [4] https://lore.kernel.org/linux-mm/cbb1d6a0-66dd-47d0-8733-f836fe050374@arm.com/ [5] https://lore.kernel.org/linux-mm/20240213215520.1048625-1-zi.yan@sent.com/ This patch (of 8): As multi-size THP support is added, not all THPs are PMD-mapped, thus during a huge page split, there is no need to always split PMD mapping in unmap_folio(). Make it conditional. 
Link: https://lkml.kernel.org/r/20240226205534.1603748-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20240226205534.1603748-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 47e3b9c6f8e0..805821b5ffba 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2758,11 +2758,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_folio(struct folio *folio) { - enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | - TTU_SYNC | TTU_BATCH_FLUSH; + enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | + TTU_BATCH_FLUSH; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (folio_test_pmd_mappable(folio)) + ttu_flags |= TTU_SPLIT_HUGE_PMD; + /* * Anon pages need migration entries to preserve them, but file * pages can simply be left unmapped, then faulted back on demand. -- Gitee From 6c64c4755f2c6025709a55f39ea01803509ecdd7 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:29 -0500 Subject: [PATCH 233/484] mm/memcg: use order instead of nr in split_page_memcg() commit 502003bb76b83649cd4ff7f701987ac5cf43bc4b upstream Conflicts: none Backport-reason: mTHP splitting We do not have non power of two pages, using nr is error prone if nr is not power-of-two. Use page order instead. Link: https://lkml.kernel.org/r/20240226205534.1603748-4-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 4 ++-- mm/huge_memory.c | 5 +++-- mm/memcontrol.c | 3 ++- mm/page_alloc.c | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 94b94dcdfc4b..e086c21e137d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1349,7 +1349,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, unsigned int nr); +void split_page_memcg(struct page *head, int order); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1832,7 +1832,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, unsigned int nr) +static inline void split_page_memcg(struct page *head, int order) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 805821b5ffba..c43e2891e8b4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2918,11 +2918,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; - unsigned int nr = thp_nr_pages(head); int i, nr_dropped = 0; + int order = folio_order(folio); + unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ - split_page_memcg(head, nr); + split_page_memcg(head, order); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swap_cache_index(folio->swap); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e7fc3b86e468..9341bd622e90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4016,11 +4016,12 @@ EXPORT_SYMBOL_GPL(obj_cgroup_uncharge); /* * Because page_memcg(head) is not set on tails, set it now. */ -void split_page_memcg(struct page *head, unsigned int nr) +void split_page_memcg(struct page *head, int order) { struct folio *folio = page_folio(head); struct mem_cgroup *memcg = folio_memcg(folio); int i; + unsigned int nr = 1 << order; if (mem_cgroup_disabled() || !memcg) return; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6565ef713c25..de64df9a8d94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2715,7 +2715,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_memcg(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -4956,7 +4956,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_memcg(page, order); while (page < --last) set_page_refcounted(last); -- Gitee From 172d1c13558b17b8745e971512c94f77e09587a5 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:30 -0500 Subject: [PATCH 234/484] mm/page_owner: use order instead of nr in split_page_owner() commit 9a581c12cddb06696fe4811239934fcde57ceb91 upstream Conflicts: none Backport-reason: mTHP splitting We do not have non power of two pages, using nr is error prone if nr is not power-of-two. Use page order instead. 
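As a quick aside on why the order is the safer parameter (a minimal userspace sketch with made-up helper names, not kernel code): the callee derives the page count as 1 << order, so a non-power-of-two count simply cannot be expressed, whereas recovering an order from an arbitrary nr rounds down silently.

  #include <stdio.h>

  /* hypothetical helpers, for illustration only */
  static unsigned int order_to_nr(unsigned int order)
  {
  	return 1u << order;			/* always a power of two */
  }

  static unsigned int nr_to_order(unsigned int nr)
  {
  	unsigned int order = 0;

  	while ((1u << (order + 1)) <= nr)	/* ilog2-style: rounds down */
  		order++;
  	return order;
  }

  int main(void)
  {
  	printf("order_to_nr(3) = %u\n", order_to_nr(3));  /* 8 */
  	printf("nr_to_order(6) = %u\n", nr_to_order(6));  /* 2, i.e. 4 pages, not 6 */
  	return 0;
  }
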
Link: https://lkml.kernel.org/r/20240226205534.1603748-5-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/page_owner.h | 9 ++++----- mm/huge_memory.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_owner.c | 3 ++- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 119a0c9d2a8b..2b39c8e19d98 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int nr); +extern void __split_page_owner(struct page *page, int order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); @@ -31,10 +31,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int nr) +static inline void split_page_owner(struct page *page, int order) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, nr); + __split_page_owner(page, order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { @@ -59,8 +59,7 @@ static inline void set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { } -static inline void split_page_owner(struct page *page, - unsigned short order) +static inline void split_page_owner(struct page *page, int order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c43e2891e8b4..92735b0e0c80 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2962,7 +2962,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, nr); + split_page_owner(head, order); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de64df9a8d94..a3390c6c70ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2714,7 +2714,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, 1 << order); + split_page_owner(page, order); split_page_memcg(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -4955,7 +4955,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, 1 << order); + split_page_owner(page, order); split_page_memcg(page, order); while (page < --last) set_page_refcounted(last); diff --git a/mm/page_owner.c b/mm/page_owner.c index 89045a6b1628..b1ed46409425 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -211,11 +211,12 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_ext_put(page_ext); } -void __split_page_owner(struct page *page, unsigned 
int nr) +void __split_page_owner(struct page *page, int order) { int i; struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; + unsigned int nr = 1 << order; if (unlikely(!page_ext)) return; -- Gitee From 690ba9e5d52c6c4cb4954b8bd7e092280d4cfb14 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:31 -0500 Subject: [PATCH 235/484] mm: memcg: make memcg huge page split support any order split commit b8791381d7edae3706edde207f52d9e483ed400c upstream Conflicts: none Backport-reason: mTHP splitting It sets memcg information for the pages after the split. A new parameter new_order is added to tell the order of subpages in the new page, always 0 for now. It prepares for upcoming changes to support split huge page to any lower order. Link: https://lkml.kernel.org/r/20240226205534.1603748-6-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 4 ++-- mm/huge_memory.c | 2 +- mm/memcontrol.c | 11 ++++++----- mm/page_alloc.c | 4 ++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e086c21e137d..21871192b84b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1349,7 +1349,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, int order); +void split_page_memcg(struct page *head, int old_order, int new_order); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1832,7 +1832,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, int order) +static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 92735b0e0c80..2d62916455c8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2923,7 +2923,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ - split_page_memcg(head, order); + split_page_memcg(head, order, 0); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swap_cache_index(folio->swap); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9341bd622e90..e9ed61561b68 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4016,23 +4016,24 @@ EXPORT_SYMBOL_GPL(obj_cgroup_uncharge); /* * Because page_memcg(head) is not set on tails, set it now. 
*/ -void split_page_memcg(struct page *head, int order) +void split_page_memcg(struct page *head, int old_order, int new_order) { struct folio *folio = page_folio(head); struct mem_cgroup *memcg = folio_memcg(folio); int i; - unsigned int nr = 1 << order; + unsigned int old_nr = 1 << old_order; + unsigned int new_nr = 1 << new_order; if (mem_cgroup_disabled() || !memcg) return; - for (i = 1; i < nr; i++) + for (i = new_nr; i < old_nr; i += new_nr) folio_page(folio, i)->memcg_data = folio->memcg_data; if (folio_memcg_kmem(folio)) - obj_cgroup_get_many(__folio_objcg(folio), nr - 1); + obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); else - css_get_many(&memcg->css, nr - 1); + css_get_many(&memcg->css, old_nr / new_nr - 1); } #ifdef CONFIG_SWAP diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3390c6c70ef..1de2047276ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2715,7 +2715,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order); - split_page_memcg(page, order); + split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -4956,7 +4956,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, order); - split_page_memcg(page, order); + split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); -- Gitee From 2e4fdad47cceff75f74a2f8bb553ea42b84c911f Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:32 -0500 Subject: [PATCH 236/484] mm: page_owner: add support for splitting to any order in split page_owner commit 46d44d09d24c5b451d6ab8f0fca5a40f651e3837 upstream Conflicts: none Backport-reason: mTHP splitting It adds a new_order parameter to set new page order in page owner. It prepares for upcoming changes to support split huge page to any lower order. Link: https://lkml.kernel.org/r/20240226205534.1603748-7-zi.yan@sent.com Signed-off-by: Zi Yan Cc: David Hildenbrand Acked-by: David Hildenbrand Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/page_owner.h | 13 ++++++++----- mm/huge_memory.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_owner.c | 7 +++---- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 2b39c8e19d98..debdc25f08b9 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,8 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, int order); +extern void __split_page_owner(struct page *page, int old_order, + int new_order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); @@ -31,10 +32,11 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, int order) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, order); + __split_page_owner(page, old_order, new_order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { @@ -56,10 +58,11 @@ static inline void reset_page_owner(struct page *page, unsigned short order) { } static inline void set_page_owner(struct page *page, - unsigned int order, gfp_t gfp_mask) + unsigned short order, gfp_t gfp_mask) { } -static inline void split_page_owner(struct page *page, int order) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2d62916455c8..c6fd249c41e1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2962,7 +2962,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, order); + split_page_owner(head, order, 0); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1de2047276ee..4703c1d6d1e1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2714,7 +2714,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, order); + split_page_owner(page, order, 0); split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -4955,7 +4955,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, order); + split_page_owner(page, order, 0); split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); diff --git a/mm/page_owner.c b/mm/page_owner.c index b1ed46409425..28972e2d585d 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -211,19 +211,18 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_ext_put(page_ext); } -void 
__split_page_owner(struct page *page, int order) +void __split_page_owner(struct page *page, int old_order, int new_order) { int i; struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; - unsigned int nr = 1 << order; if (unlikely(!page_ext)) return; - for (i = 0; i < nr; i++) { + for (i = 0; i < (1 << old_order); i++) { page_owner = get_page_owner(page_ext); - page_owner->order = 0; + page_owner->order = new_order; page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); -- Gitee From a2e1a9b826d13968d41835efab7a177923b1b325 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 10 Nov 2023 11:33:21 +0800 Subject: [PATCH 237/484] mm: huge_memory: use more folio api in __split_huge_page_tail() commit b75427691f4a1fd3625d1f4c016832b8c75c2326 upstream Conflicts: none Backport-reason: Memory: more folios Use more folio APIs to save six compound_head() calls in __split_huge_page_tail(). Link: https://lkml.kernel.org/r/20231110033324.2455523-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c6fd249c41e1..f71704dc2be6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2892,13 +2892,13 @@ static void __split_huge_page_tail(struct folio *folio, int tail, clear_compound_head(page_tail); /* Finally unfreeze refcount. Additional reference from page cache. */ - page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || - PageSwapCache(head))); + page_ref_unfreeze(page_tail, 1 + (!folio_test_anon(folio) || + folio_test_swapcache(folio))); - if (page_is_young(head)) - set_page_young(page_tail); - if (page_is_idle(head)) - set_page_idle(page_tail); + if (folio_test_young(folio)) + folio_set_young(new_folio); + if (folio_test_idle(folio)) + folio_set_idle(new_folio); folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); -- Gitee From dd70e012ed4f9d497d20b049fa026cc797232c45 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:33 -0500 Subject: [PATCH 238/484] mm: thp: split huge page to any lower order pages commit c010d47f107f609b9f4d6a103b6dfc53889049e9 upstream Conflicts: minor, downstream have already dropped split_swap_cluster and already partially backported the list_del_init in this commit. Backport-reason: mTHP splitting To split a THP to any lower order pages, we need to reform THPs on subpages at given order and add page refcount based on the new page order. Also we need to reinitialize page_deferred_list after removing the page from the split_queue, otherwise a subsequent split will see list corruption when checking the page_deferred_list again. Note: Anonymous order-1 folio is not supported because _deferred_list, which is used by partially mapped folios, is stored in subpage 2 and an order-1 folio only has subpage 0 and 1. File-backed order-1 folios are fine, since they do not use _deferred_list. [ziy@nvidia.com: fixup per discussion with Ryan] Link: https://lkml.kernel.org/r/494F48CD-1F0F-4CAD-884E-6D48F40AF990@nvidia.com Link: https://lkml.kernel.org/r/20240226205534.1603748-8-zi.yan@sent.com Signed-off-by: Zi Yan Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 21 +++++--- mm/huge_memory.c | 105 +++++++++++++++++++++++++++++++--------- 2 files changed, 95 insertions(+), 31 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0ba1d8959ab3..8fc964270496 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -352,10 +352,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); -int split_huge_page_to_list(struct page *page, struct list_head *list); +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order); static inline int split_huge_page(struct page *page) { - return split_huge_page_to_list(page, NULL); + return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio); @@ -559,7 +560,8 @@ can_split_folio(struct folio *folio, int *pextra_pins) return false; } static inline int -split_huge_page_to_list(struct page *page, struct list_head *list) +split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { return 0; } @@ -656,17 +658,20 @@ static inline bool thp_migration_supported(void) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int split_folio_to_list(struct folio *folio, - struct list_head *list) +static inline int split_folio_to_list_to_order(struct folio *folio, + struct list_head *list, int new_order) { - return split_huge_page_to_list(&folio->page, list); + return split_huge_page_to_list_to_order(&folio->page, list, new_order); } -static inline int split_folio(struct folio *folio) +static inline int split_folio_to_order(struct folio *folio, int new_order) { - return split_folio_to_list(folio, NULL); + return split_folio_to_list_to_order(folio, NULL, new_order); } +#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) +#define split_folio(f) split_folio_to_order(f, 0) + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f71704dc2be6..dd583ae9f53c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2799,7 +2799,6 @@ static void lru_add_page_tail(struct page *head, struct page *tail, struct lruvec *lruvec, struct list_head *list) { VM_BUG_ON_PAGE(!PageHead(head), head); - VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); lockdep_assert_held(&lruvec->lru_lock); @@ -2820,7 +2819,8 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } static void __split_huge_page_tail(struct folio *folio, int tail, - struct lruvec *lruvec, struct list_head *list) + struct lruvec *lruvec, struct list_head *list, + unsigned int new_order) { struct page *head = &folio->page; struct page *page_tail = head + tail; @@ -2890,10 +2890,15 @@ static void __split_huge_page_tail(struct folio *folio, int tail, * which needs correct compound_head(). */ clear_compound_head(page_tail); + if (new_order) { + prep_compound_page(page_tail, new_order); + folio_prep_large_rmappable(new_folio); + } /* Finally unfreeze refcount. Additional reference from page cache. 
*/ - page_ref_unfreeze(page_tail, 1 + (!folio_test_anon(folio) || - folio_test_swapcache(folio))); + page_ref_unfreeze(page_tail, + 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? + folio_nr_pages(new_folio) : 0)); if (folio_test_young(folio)) folio_set_young(new_folio); @@ -2911,7 +2916,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, } static void __split_huge_page(struct page *page, struct list_head *list, - pgoff_t end) + pgoff_t end, unsigned int new_order) { struct folio *folio = page_folio(page); struct page *head = &folio->page; @@ -2919,11 +2924,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct address_space *swap_cache = NULL; unsigned long offset = 0; int i, nr_dropped = 0; + unsigned int new_nr = 1 << new_order; int order = folio_order(folio); unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ - split_page_memcg(head, order, 0); + split_page_memcg(head, order, new_order); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swap_cache_index(folio->swap); @@ -2936,8 +2942,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); - for (i = nr - 1; i >= 1; i--) { - __split_huge_page_tail(folio, i, lruvec, list); + for (i = nr - new_nr; i >= new_nr; i -= new_nr) { + __split_huge_page_tail(folio, i, lruvec, list, new_order); /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { struct folio *tail = page_folio(head + i); @@ -2958,24 +2964,30 @@ static void __split_huge_page(struct page *page, struct list_head *list, } } - ClearPageCompound(head); + if (!new_order) + ClearPageCompound(head); + else { + struct folio *new_folio = (struct folio *)head; + + folio_set_order(new_folio, new_order); + } unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, order, 0); + split_page_owner(head, order, new_order); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ if (PageSwapCache(head)) { - page_ref_add(head, 2); + page_ref_add(head, 1 + new_nr); xa_unlock(&swap_cache->i_pages); } else { page_ref_inc(head); } } else { /* Additional pin to page cache */ - page_ref_add(head, 2); + page_ref_add(head, 1 + new_nr); xa_unlock(&head->mapping->i_pages); } local_irq_enable(); @@ -2984,7 +2996,15 @@ static void __split_huge_page(struct page *page, struct list_head *list, shmem_uncharge(head->mapping->host, nr_dropped); remap_page(folio, nr); - for (i = 0; i < nr; i++) { + /* + * set page to its compound_head when split to non order-0 pages, so + * we can skip unlocking it below, since PG_locked is transferred to + * the compound_head of the page and the caller will unlock it. + */ + if (new_order) + page = compound_head(page); + + for (i = 0; i < nr; i += new_nr) { struct page *subpage = head + i; if (subpage == page) continue; @@ -3018,29 +3038,36 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) } /* - * This function splits huge page into normal pages. @page can point to any - * subpage of huge page to split. Split doesn't change the position of @page. + * This function splits huge page into pages in @new_order. @page can point to + * any subpage of huge page to split. Split doesn't change the position of + * @page. 
+ * + * NOTE: order-1 anonymous folio is not supported because _deferred_list, + * which is used by partially mapped folios, is stored in subpage 2 and an + * order-1 folio only has subpage 0 and 1. File-backed order-1 folios are OK, + * since they do not use _deferred_list. * * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. * The huge page must be locked. * * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * - * Both head page and tail pages will inherit mapping, flags, and so on from - * the hugepage. + * Pages in new_order will inherit mapping, flags, and so on from the hugepage. * - * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if - * they are not mapped. + * GUP pin and PG_locked transferred to @page or the compound page @page belongs + * to. Rest subpages can be freed if they are not mapped. * * Returns 0 if the hugepage is split successfully. * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under * us. */ -int split_huge_page_to_list(struct page *page, struct list_head *list) +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); - XA_STATE(xas, &folio->mapping->i_pages, folio->index); + /* reset xarray order to new order after split */ + XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; bool is_thp = folio_test_pmd_mappable(folio); @@ -3051,6 +3078,31 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + /* Cannot split anonymous THP to order-1 */ + if (new_order == 1 && folio_test_anon(folio)) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); + return -EINVAL; + } + + if (new_order) { + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio)) + return -EINVAL; + /* Split shmem folio to non-zero order not supported */ + if (shmem_mapping(folio->mapping)) { + VM_WARN_ONCE(1, + "Cannot split shmem folio to non-0 order"); + return -EINVAL; + } + /* No split if the file system does not support large folio */ + if (!mapping_large_folio_support(folio->mapping)) { + VM_WARN_ONCE(1, + "Cannot split file folio to non-0 order"); + return -EINVAL; + } + } + + is_hzp = is_huge_zero_page(&folio->page); if (is_hzp) { pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); @@ -3147,6 +3199,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. 
+ */ list_del_init(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); @@ -3154,7 +3212,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); - if (folio_test_pmd_mappable(folio)) { + if (folio_test_pmd_mappable(folio) && + new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); @@ -3166,7 +3225,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } } - __split_huge_page(page, list, end); + __split_huge_page(page, list, end, new_order); ret = 0; } else { spin_unlock(&ds_queue->split_queue_lock); -- Gitee From 807e81c362b6b36e7ee8bcff95ae9e88daac31a3 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:34 -0500 Subject: [PATCH 239/484] mm: huge_memory: enable debugfs to split huge pages to any order commit fc4d182316bd5309b4066fd9ef21529ea397a7d4 upstream Conflicts: trivial, selftest header is slightly different. Backport-reason: mTHP splitting It is used to test split_huge_page_to_list_to_order for pagecache THPs. Also add test cases for split_huge_page_to_list_to_order via both debugfs. [ziy@nvidia.com: fix issue discovered with NFS] Link: https://lkml.kernel.org/r/262E4DAA-4A78-4328-B745-1355AE356A07@nvidia.com Link: https://lkml.kernel.org/r/20240226205534.1603748-9-zi.yan@sent.com Signed-off-by: Zi Yan Tested-by: Aishwarya TCV Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Cc: Aishwarya TCV Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 34 ++-- tools/testing/selftests/mm/run_vmtests.sh | 22 ++- .../selftests/mm/split_huge_page_test.c | 164 +++++++++++++++++- 3 files changed, 202 insertions(+), 18 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dd583ae9f53c..e1b8943a3fc8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3448,7 +3448,7 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, - unsigned long vaddr_end) + unsigned long vaddr_end, unsigned int new_order) { int ret = 0; struct task_struct *task; @@ -3512,13 +3512,19 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, goto next; total++; - if (!can_split_folio(folio, NULL)) + /* + * For folios with private, split_huge_page_to_list_to_order() + * will try to drop it before split and then check if the folio + * can be split or not. So skip the check here. 
+ */ + if (!folio_test_private(folio) && + !can_split_folio(folio, NULL)) goto next; if (!folio_trylock(folio)) goto next; - if (!split_folio(folio)) + if (!split_folio_to_order(folio, new_order)) split++; folio_unlock(folio); @@ -3536,7 +3542,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, } static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, - pgoff_t off_end) + pgoff_t off_end, unsigned int new_order) { struct filename *file; struct file *candidate; @@ -3575,7 +3581,7 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, if (!folio_trylock(folio)) goto next; - if (!split_folio(folio)) + if (!split_folio_to_order(folio, new_order)) split++; folio_unlock(folio); @@ -3600,10 +3606,14 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; - /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + /* + * hold pid, start_vaddr, end_vaddr, new_order or + * file_path, off_start, off_end, new_order + */ char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; + unsigned int new_order = 0; ret = mutex_lock_interruptible(&split_debug_mutex); if (ret) @@ -3632,29 +3642,29 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, goto out; } - ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); - if (ret != 2) { + ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); + if (ret != 2 && ret != 3) { ret = -EINVAL; goto out; } - ret = split_huge_pages_in_file(file_path, off_start, off_end); + ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); if (!ret) ret = input_len; goto out; } - ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); if (ret == 1 && pid == 1) { split_huge_pages_all(); ret = strlen(input_buf); goto out; - } else if (ret != 3) { + } else if (ret != 3 && ret != 4) { ret = -EINVAL; goto out; } - ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); if (!ret) ret = strlen(input_buf); out: diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 8ec99d704d06..d2493a8dbe7b 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -381,7 +381,27 @@ CATEGORY="thp" run_test ./khugepaged -s 2 CATEGORY="thp" run_test ./transhuge-stress -d 20 -CATEGORY="thp" run_test ./split_huge_page_test +# Try to create XFS if not provided +if [ -z "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then + if test_selected "thp"; then + if grep xfs /proc/filesystems &>/dev/null; then + XFS_IMG=$(mktemp /tmp/xfs_img_XXXXXX) + SPLIT_HUGE_PAGE_TEST_XFS_PATH=$(mktemp -d /tmp/xfs_dir_XXXXXX) + truncate -s 314572800 ${XFS_IMG} + mkfs.xfs -q ${XFS_IMG} + mount -o loop ${XFS_IMG} ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + MOUNTED_XFS=1 + fi + fi +fi + +CATEGORY="thp" run_test ./split_huge_page_test ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + +if [ -n "${MOUNTED_XFS}" ]; then + umount ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + rmdir ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + rm -f ${XFS_IMG} +fi CATEGORY="migration" run_test ./migration diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index dff3be23488b..061d26462d33 100644 --- 
a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "vm_util.h" uint64_t pagesize; @@ -23,10 +24,11 @@ unsigned int pageshift; uint64_t pmd_pagesize; #define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define SMAP_PATH "/proc/self/smaps" #define INPUT_MAX 80 -#define PID_FMT "%d,0x%lx,0x%lx" -#define PATH_FMT "%s,0x%lx,0x%lx" +#define PID_FMT "%d,0x%lx,0x%lx,%d" +#define PATH_FMT "%s,0x%lx,0x%lx,%d" #define PFN_MASK ((1UL<<55)-1) #define KPF_THP (1UL<<22) @@ -113,7 +115,7 @@ void split_pmd_thp(void) /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len); + (uint64_t)one_page + len, 0); for (i = 0; i < len; i++) if (one_page[i] != (char)i) { @@ -203,7 +205,7 @@ void split_pte_mapped_thp(void) /* split all remapped THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, - (uint64_t)pte_mapped + pagesize * 4); + (uint64_t)pte_mapped + pagesize * 4, 0); /* smap does not show THPs after mremap, use kpageflags instead */ thp_size = 0; @@ -269,7 +271,7 @@ void split_file_backed_thp(void) } /* split the file-backed THP */ - write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, 0); status = unlink(testfile); if (status) @@ -290,13 +292,157 @@ void split_file_backed_thp(void) printf("file-backed THP split test done, please check dmesg for more information\n"); } +bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template, + const char **thp_fs_loc) +{ + if (xfs_path) { + *thp_fs_loc = xfs_path; + return false; + } + + *thp_fs_loc = mkdtemp(thp_fs_template); + + if (!*thp_fs_loc) + ksft_exit_fail_msg("cannot create temp folder\n"); + + return true; +} + +void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp) +{ + int status; + + if (!created_tmp) + return; + + status = rmdir(thp_fs_loc); + if (status) + ksft_exit_fail_msg("cannot remove tmp dir: %s\n", + strerror(errno)); +} + +int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, + char **addr) +{ + size_t i; + int dummy; + + srand(time(NULL)); + + *fd = open(testfile, O_CREAT | O_RDWR, 0664); + if (*fd == -1) + ksft_exit_fail_msg("Failed to create a file at %s\n", testfile); + + for (i = 0; i < fd_size; i++) { + unsigned char byte = (unsigned char)i; + + write(*fd, &byte, sizeof(byte)); + } + close(*fd); + sync(); + *fd = open("/proc/sys/vm/drop_caches", O_WRONLY); + if (*fd == -1) { + ksft_perror("open drop_caches"); + goto err_out_unlink; + } + if (write(*fd, "3", 1) != 1) { + ksft_perror("write to drop_caches"); + goto err_out_unlink; + } + close(*fd); + + *fd = open(testfile, O_RDWR); + if (*fd == -1) { + ksft_perror("Failed to open testfile\n"); + goto err_out_unlink; + } + + *addr = mmap(NULL, fd_size, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); + if (*addr == (char *)-1) { + ksft_perror("cannot mmap"); + goto err_out_close; + } + madvise(*addr, fd_size, MADV_HUGEPAGE); + + for (size_t i = 0; i < fd_size; i++) + dummy += *(*addr + i); + + if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { + ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); + munmap(*addr, fd_size); + close(*fd); + unlink(testfile); + ksft_test_result_skip("Pagecache folio split skipped\n"); + return -2; + } + return 0; +err_out_close: + close(*fd); +err_out_unlink: + unlink(testfile); + ksft_exit_fail_msg("Failed to create 
large pagecache folios\n"); + return -1; +} + +void split_thp_in_pagecache_to_order(size_t fd_size, int order, const char *fs_loc) +{ + int fd; + char *addr; + size_t i; + char testfile[INPUT_MAX]; + int err = 0; + + err = snprintf(testfile, INPUT_MAX, "%s/test", fs_loc); + + if (err < 0) + ksft_exit_fail_msg("cannot generate right test file name\n"); + + err = create_pagecache_thp_and_fd(testfile, fd_size, &fd, &addr); + if (err) + return; + err = 0; + + write_debugfs(PID_FMT, getpid(), (uint64_t)addr, (uint64_t)addr + fd_size, order); + + for (i = 0; i < fd_size; i++) + if (*(addr + i) != (char)i) { + ksft_print_msg("%lu byte corrupted in the file\n", i); + err = EXIT_FAILURE; + goto out; + } + + if (!check_huge_file(addr, 0, pmd_pagesize)) { + ksft_print_msg("Still FilePmdMapped not split\n"); + err = EXIT_FAILURE; + goto out; + } + +out: + munmap(addr, fd_size); + close(fd); + unlink(testfile); + if (err) + ksft_exit_fail_msg("Split PMD-mapped pagecache folio to order %d failed\n", order); + ksft_test_result_pass("Split PMD-mapped pagecache folio to order %d passed\n", order); +} + int main(int argc, char **argv) { + int i; + size_t fd_size; + char *optional_xfs_path = NULL; + char fs_loc_template[] = "/tmp/thp_fs_XXXXXX"; + const char *fs_loc; + bool created_tmp; + if (geteuid() != 0) { printf("Please run the benchmark as root\n"); exit(EXIT_FAILURE); } + if (argc > 1) + optional_xfs_path = argv[1]; + pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); @@ -305,9 +451,17 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } + fd_size = 2 * pmd_pagesize; + split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); + created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template, + &fs_loc); + for (i = 8; i >= 0; i--) + split_thp_in_pagecache_to_order(fd_size, i, fs_loc); + cleanup_thp_fs(fs_loc, created_tmp); + return 0; } -- Gitee From fe6c35c68c7afb278b07b503986b4064ae22c81b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 23 Mar 2024 00:41:36 +1300 Subject: [PATCH 240/484] arm64: mm: swap: support THP_SWAP on hardware with MTE commit f238b8c33c6738f146bbfbb09b78870ea157c2b7 upstream Conflicts: trivial, we don't have swap_slots.c Backport-reason: SWAP mTHP support Commit d0637c505f8a1 ("arm64: enable THP_SWAP for arm64") brings up THP_SWAP on ARM64, but it doesn't enable THP_SWP on hardware with MTE as the MTE code works with the assumption tags save/restore is always handling a folio with only one page. The limitation should be removed as more and more ARM64 SoCs have this feature. Co-existence of MTE and THP_SWAP becomes more and more important. This patch makes MTE tags saving support large folios, then we don't need to split large folios into base pages for swapping out on ARM64 SoCs with MTE any more. arch_prepare_to_swap() should take folio rather than page as parameter because we support THP swap-out as a whole. It saves tags for all pages in a large folio. As now we are restoring tags based-on folio, in arch_swap_restore(), we may increase some extra loops and early-exitings while refaulting a large folio which is still in swapcache in do_swap_page(). In case a large folio has nr pages, do_swap_page() will only set the PTE of the particular page which is causing the page fault. Thus do_swap_page() runs nr times, and each time, arch_swap_restore() will loop nr times for those subpages in the folio. So right now the algorithmic complexity becomes O(nr^2). 
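To put a rough number on that quadratic bound (back-of-the-envelope only, assuming 4K base pages and a 2M PMD-sized folio; other page sizes scale accordingly):

  #include <stdio.h>

  int main(void)
  {
  	unsigned long page_size = 4096;		/* assumed base page size */
  	unsigned long folio_size = 2UL << 20;	/* assumed PMD-sized folio: 2M */
  	unsigned long nr = folio_size / page_size;

  	/* one refault per subpage, each restore pass walking all nr entries */
  	printf("nr = %lu, worst-case restore iterations = %lu\n",
  	       nr, nr * nr);			/* 512 and 262144 */
  	return 0;
  }
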
Once we support mapping large folios in do_swap_page(), extra loops and early-exitings will decrease while not being completely removed as a large folio might get partially tagged in corner cases such as, 1. a large folio in swapcache can be partially unmapped, thus, MTE tags for the unmapped pages will be invalidated; 2. users might use mprotect() to set MTEs on a part of a large folio. arch_thp_swp_supported() is dropped since ARM64 MTE was the only one who needed it. Link: https://lkml.kernel.org/r/20240322114136.61386-2-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Steven Price Acked-by: Chris Li Reviewed-by: Ryan Roberts Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: David Hildenbrand Cc: Kemeng Shi Cc: "Matthew Wilcox (Oracle)" Cc: Anshuman Khandual Cc: Peter Collingbourne Cc: Yosry Ahmed Cc: Peter Xu Cc: Lorenzo Stoakes Cc: "Mike Rapoport (IBM)" Cc: Hugh Dickins Cc: "Aneesh Kumar K.V" Cc: Rick Edgecombe Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- arch/arm64/include/asm/pgtable.h | 19 ++------------ arch/arm64/mm/mteswap.c | 45 ++++++++++++++++++++++++++++++++ include/linux/huge_mm.h | 12 --------- include/linux/pgtable.h | 2 +- mm/internal.h | 14 ++++++++++ mm/memory.c | 2 +- mm/page_io.c | 2 +- mm/shmem.c | 2 +- mm/swapfile.c | 2 +- 9 files changed, 66 insertions(+), 34 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9b52aa7d6645..059b6e14cb90 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -45,12 +45,6 @@ __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool arch_thp_swp_supported(void) -{ - return !system_supports_mte(); -} -#define arch_thp_swp_supported arch_thp_swp_supported - /* * Outside of a few very special situations (e.g. 
hibernation), we always * use broadcast TLB invalidation instructions, therefore a spurious page @@ -1110,12 +1104,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #ifdef CONFIG_ARM64_MTE #define __HAVE_ARCH_PREPARE_TO_SWAP -static inline int arch_prepare_to_swap(struct page *page) -{ - if (system_supports_mte()) - return mte_save_tags(page); - return 0; -} +extern int arch_prepare_to_swap(struct folio *folio); #define __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) @@ -1131,11 +1120,7 @@ static inline void arch_swap_invalidate_area(int type) } #define __HAVE_ARCH_SWAP_RESTORE -static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) -{ - if (system_supports_mte()) - mte_restore_tags(entry, &folio->page); -} +extern void arch_swap_restore(swp_entry_t entry, struct folio *folio); #endif /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index a31833e3ddc5..63e8d72f202a 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -68,6 +68,13 @@ void mte_invalidate_tags(int type, pgoff_t offset) mte_free_tag_storage(tags); } +static inline void __mte_invalidate_tags(struct page *page) +{ + swp_entry_t entry = page_swap_entry(page); + + mte_invalidate_tags(swp_type(entry), swp_offset(entry)); +} + void mte_invalidate_tags_area(int type) { swp_entry_t entry = swp_entry(type, 0); @@ -83,3 +90,41 @@ void mte_invalidate_tags_area(int type) } xa_unlock(&mte_pages); } + +int arch_prepare_to_swap(struct folio *folio) +{ + long i, nr; + int err; + + if (!system_supports_mte()) + return 0; + + nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + err = mte_save_tags(folio_page(folio, i)); + if (err) + goto out; + } + return 0; + +out: + while (i--) + __mte_invalidate_tags(folio_page(folio, i)); + return err; +} + +void arch_swap_restore(swp_entry_t entry, struct folio *folio) +{ + long i, nr; + + if (!system_supports_mte()) + return; + + nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + mte_restore_tags(entry, folio_page(folio, i)); + entry.val++; + } +} diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8fc964270496..c56d42e28fd9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -672,16 +672,4 @@ static inline int split_folio_to_order(struct folio *folio, int new_order) #define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) #define split_folio(f) split_folio_to_order(f, 0) -/* - * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to - * limitations in the implementation like arm64 MTE can override this to - * false - */ -#ifndef arch_thp_swp_supported -static inline bool arch_thp_swp_supported(void) -{ - return true; -} -#endif - #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ff43f1b1c406..bfe4182e7940 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1120,7 +1120,7 @@ static inline int arch_unmap_one(struct mm_struct *mm, * prototypes must be defined in the arch-specific asm/pgtable.h file. 
*/ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP -static inline int arch_prepare_to_swap(struct page *page) +static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } diff --git a/mm/internal.h b/mm/internal.h index a5ed5dd675c5..4cb81b87c639 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -78,6 +78,20 @@ static inline int folio_nr_pages_mapped(struct folio *folio) return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } +/* + * Retrieve the first entry of a folio based on a provided entry within the + * folio. We cannot rely on folio->swap as there is no guarantee that it has + * been initialized. Used for calling arch_swap_restore() + */ +static inline swp_entry_t folio_swap(swp_entry_t entry, struct folio *folio) +{ + swp_entry_t swap = { + .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), + }; + + return swap; +} + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; diff --git a/mm/memory.c b/mm/memory.c index ba364eba8057..1cdeaa1a2957 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4214,7 +4214,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(entry, folio); + arch_swap_restore(folio_swap(entry, folio), folio); /* * Remove the swap entry and conditionally try to free up the swapcache. diff --git a/mm/page_io.c b/mm/page_io.c index 8a3e9e6d59c3..1ea085329e1c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -188,7 +188,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. */ - ret = arch_prepare_to_swap(&folio->page); + ret = arch_prepare_to_swap(folio); if (ret) { folio_mark_dirty(folio); folio_unlock(folio); diff --git a/mm/shmem.c b/mm/shmem.c index 53e6a417f69f..c8369e760bce 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1916,7 +1916,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * Some architectures may have to restore extra metadata to the * folio after reading from swap. */ - arch_swap_restore(swap, folio); + arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index); diff --git a/mm/swapfile.c b/mm/swapfile.c index a76bb64789f9..879469ab3b5e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1977,7 +1977,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ - arch_swap_restore(entry, folio); + arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); -- Gitee From c98b179b77a5d378952d9aad32438394e5674c8d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sun, 24 Mar 2024 17:04:47 -0400 Subject: [PATCH 241/484] mm: zswap: fix data loss on SWP_SYNCHRONOUS_IO devices commit 25cd241408a2adc1ed0ebc90ae0793576c111880 upstream Conflicts: none Backport-reason: ZSWAP mass update Zhongkun He reports data corruption when combining zswap with zram. The issue is the exclusive loads we're doing in zswap. They assume that all reads are going into the swapcache, which can assume authoritative ownership of the data and so the zswap copy can go. However, zram files are marked SWP_SYNCHRONOUS_IO, and faults will try to bypass the swapcache. 
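As a rough, simplified sketch of that bypass policy (toy types and names, not the literal do_swap_page() code): the synchronous-I/O swapin path skips the swapcache only when the swap entry has a single user, which is exactly the case where a failed fault throws the freshly read page away.

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in type; the real swap info structure is much richer. */
struct toy_swap_info {
	unsigned long flags;
};

#define TOY_SWP_SYNCHRONOUS_IO	(1UL << 0)	/* flag bit chosen arbitrarily for the sketch */

static bool swapin_bypasses_swapcache(const struct toy_swap_info *si, int swap_count)
{
	return (si->flags & TOY_SWP_SYNCHRONOUS_IO) && swap_count == 1;
}

int main(void)
{
	struct toy_swap_info zram = { .flags = TOY_SWP_SYNCHRONOUS_IO };

	printf("single user:  bypass=%d\n", swapin_bypasses_swapcache(&zram, 1));
	printf("shared entry: bypass=%d\n", swapin_bypasses_swapcache(&zram, 2));
	return 0;
}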
This results in an optimistic read of the swap data into a page that will be dismissed if the fault fails due to races. In this case, zswap mustn't drop its authoritative copy. Link: https://lore.kernel.org/all/CACSyD1N+dUvsu8=zV9P691B9bVq33erwOXNTmEaUbi9DrDeJzw@mail.gmail.com/ Fixes: b9c91c43412f ("mm: zswap: support exclusive loads") Link: https://lkml.kernel.org/r/20240324210447.956973-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Zhongkun He Tested-by: Zhongkun He Acked-by: Yosry Ahmed Acked-by: Barry Song Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: [6.5+] Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5bc80334cd41..293102049596 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1625,6 +1625,7 @@ bool zswap_load(struct folio *folio) swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; + bool swapcache = folio_test_swapcache(folio); struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; @@ -1637,7 +1638,20 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); return false; } - zswap_rb_erase(&tree->rbroot, entry); + /* + * When reading into the swapcache, invalidate our entry. The + * swapcache can be the authoritative owner of the page and + * its mappings, and the pressure that results from having two + * in-memory copies outweighs any benefits of caching the + * compression work. + * + * (Most swapins go through the swapcache. The notable + * exception is the singleton fault on SWP_SYNCHRONOUS_IO + * files, which reads into a private page and may free it if + * the fault fails. We remain the primary owner of the entry.) + */ + if (swapcache) + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); if (entry->length) @@ -1652,9 +1666,10 @@ bool zswap_load(struct folio *folio) if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); - zswap_entry_free(entry); - - folio_mark_dirty(folio); + if (swapcache) { + zswap_entry_free(entry); + folio_mark_dirty(folio); + } return true; } -- Gitee From dd4b04f526cb5941234ee900a2231838dbabb111 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Tue, 26 Mar 2024 11:35:43 -0700 Subject: [PATCH 242/484] zswap: replace RB tree with xarray commit 796c2c23e14e754b1b72bed450f03d6f69f29358 upstream Conflicts: none Backport-reason: ZSWAP mass update Very deep RB tree requires rebalance at times. That contributes to the zswap fault latencies. Xarray does not need to perform tree rebalance. Replacing RB tree to xarray can have some small performance gain. One small difference is that xarray insert might fail with ENOMEM, while RB tree insert does not allocate additional memory. The zswap_entry size will reduce a bit due to removing the RB node, which has two pointers and a color field. Xarray store the pointer in the xarray tree rather than the zswap_entry. Every entry has one pointer from the xarray tree. Overall, switching to xarray should save some memory, if the swap entries are densely packed. Notice the zswap_rb_search and zswap_rb_insert often followed by zswap_rb_erase. Use xa_erase and xa_store directly. That saves one tree lookup as well. Remove zswap_invalidate_entry due to no need to call zswap_rb_erase any more. Use zswap_free_entry instead. The "struct zswap_tree" has been replaced by "struct xarray". The tree spin lock has transferred to the xarray lock. 
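For illustration of the API shape change only (a toy fixed-size map stands in for the tree/xarray; this is not kernel code): the old pattern takes an external lock, searches, then erases, while the new pattern is a single call that removes and returns the old entry, as xa_erase() does.

#include <pthread.h>
#include <stddef.h>

#define SLOTS 1024
static void *slots[SLOTS];
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* Old shape: lock, search, erase -- two walks of the structure per hit. */
static void *lookup_then_erase(unsigned long index)
{
	void *entry;

	pthread_mutex_lock(&tree_lock);
	entry = slots[index % SLOTS];		/* models zswap_rb_search() */
	if (entry)
		slots[index % SLOTS] = NULL;	/* models zswap_rb_erase() */
	pthread_mutex_unlock(&tree_lock);
	return entry;
}

/* New shape: one call removes and returns the old entry, like xa_erase(). */
static void *erase(unsigned long index)
{
	void *entry;

	pthread_mutex_lock(&tree_lock);		/* the xarray takes its own lock internally */
	entry = slots[index % SLOTS];
	slots[index % SLOTS] = NULL;
	pthread_mutex_unlock(&tree_lock);
	return entry;
}

int main(void)
{
	static int dummy;

	slots[42] = &dummy;
	lookup_then_erase(42);
	slots[42] = &dummy;
	erase(42);
	return 0;
}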
Run the kernel build testing 5 times for each version, averages: (memory.max=2GB, zswap shrinker and writeback enabled, one 50GB swapfile, 24 HT core, 32 jobs) mm-unstable-4aaccadb5c04 xarray v9 user 3548.902 3534.375 sys 522.232 520.976 real 202.796 200.864 [chrisl@kernel.org: restore original comment "erase" to "invalidate"] Link: https://lkml.kernel.org/r/20240326-zswap-xarray-v10-1-bf698417c968@kernel.org Link: https://lkml.kernel.org/r/20240326-zswap-xarray-v9-1-d2891a65dfc7@kernel.org Signed-off-by: Chris Li Acked-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Barry Song Cc: Chengming Zhou Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 183 +++++++++++++++++------------------------------------ 1 file changed, 57 insertions(+), 126 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 293102049596..6e536a70ab78 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -193,7 +192,6 @@ static struct shrinker *zswap_shrinker; * This structure contains the metadata for tracking a single compressed * page within zswap. * - * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both @@ -205,7 +203,6 @@ static struct shrinker *zswap_shrinker; * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { - struct rb_node rbnode; swp_entry_t swpentry; unsigned int length; struct zswap_pool *pool; @@ -217,12 +214,7 @@ struct zswap_entry { struct list_head lru; }; -struct zswap_tree { - struct rb_root rbroot; - spinlock_t lock; -}; - -static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static struct xarray *zswap_trees[MAX_SWAPFILES]; static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ @@ -250,7 +242,7 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ -static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +static inline struct xarray *swap_zswap_tree(swp_entry_t swp) { return &zswap_trees[swp_type(swp)][swp_offset(swp) >> SWAP_ADDRESS_SPACE_SHIFT]; @@ -789,63 +781,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) spin_unlock(&zswap_shrink_lock); } -/********************************* -* rbtree functions -**********************************/ -static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) -{ - struct rb_node *node = root->rb_node; - struct zswap_entry *entry; - pgoff_t entry_offset; - - while (node) { - entry = rb_entry(node, struct zswap_entry, rbnode); - entry_offset = swp_offset(entry->swpentry); - if (entry_offset > offset) - node = node->rb_left; - else if (entry_offset < offset) - node = node->rb_right; - else - return entry; - } - return NULL; -} - -/* - * In the case that a entry with the same offset is found, a pointer to - * the existing entry is stored in dupentry and the function returns -EEXIST - */ -static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, - struct zswap_entry **dupentry) -{ - struct rb_node **link = &root->rb_node, *parent = NULL; - struct zswap_entry *myentry; - pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry); - - while (*link) { - parent = *link; - myentry = rb_entry(parent, struct zswap_entry, 
rbnode); - myentry_offset = swp_offset(myentry->swpentry); - if (myentry_offset > entry_offset) - link = &(*link)->rb_left; - else if (myentry_offset < entry_offset) - link = &(*link)->rb_right; - else { - *dupentry = myentry; - return -EEXIST; - } - } - rb_link_node(&entry->rbnode, parent, link); - rb_insert_color(&entry->rbnode, root); - return 0; -} - -static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) -{ - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); -} - /********************************* * zswap entry functions **********************************/ @@ -857,7 +792,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; - RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -892,17 +826,6 @@ static void zswap_entry_free(struct zswap_entry *entry) atomic_dec(&zswap_stored_pages); } -/* - * The caller hold the tree lock and search the entry from the tree, - * so it must be on the tree, remove it from the tree and free it. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - zswap_rb_erase(&tree->rbroot, entry); - zswap_entry_free(entry); -} - /********************************* * compressed storage functions **********************************/ @@ -1091,7 +1014,8 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry) { - struct zswap_tree *tree; + struct xarray *tree; + pgoff_t offset = swp_offset(swpentry); struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1135,19 +1059,13 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * be dereferenced. */ tree = swap_zswap_tree(swpentry); - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { - spin_unlock(&tree->lock); + if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) { delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } - /* Safe to deref entry after the entry is verified above. */ - zswap_rb_erase(&tree->rbroot, entry); - spin_unlock(&tree->lock); - zswap_decompress(entry, &folio->page); count_vm_event(ZSWPWB); @@ -1479,8 +1397,8 @@ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct zswap_tree *tree = swap_zswap_tree(swp); - struct zswap_entry *entry, *dupentry; + struct xarray *tree = swap_zswap_tree(swp); + struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; unsigned long max_pages, cur_pages; @@ -1568,27 +1486,43 @@ bool zswap_store(struct folio *folio) insert_entry: entry->swpentry = swp; entry->objcg = objcg; + + old = xa_store(tree, offset, entry, GFP_KERNEL); + if (xa_is_err(old)) { + int err = xa_err(old); + + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err); + zswap_reject_alloc_fail++; + goto store_failed; + } + + /* + * We may have had an existing entry that became stale when + * the folio was redirtied and now the new version is being + * swapped out. Get rid of the old. 
+ */ + if (old) + zswap_entry_free(old); + if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); - /* Account before objcg ref is moved to tree */ count_objcg_event(objcg, ZSWPOUT); } - /* map */ - spin_lock(&tree->lock); /* - * The folio may have been dirtied again, invalidate the - * possibly stale entry before inserting the new entry. + * We finish initializing the entry while it's already in xarray. + * This is safe because: + * + * 1. Concurrent stores and invalidations are excluded by folio lock. + * + * 2. Writeback is excluded by the entry not being on the LRU yet. + * The publishing order matters to prevent writeback from seeing + * an incoherent entry. */ - if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - zswap_invalidate_entry(tree, dupentry); - WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); - } if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&zswap_list_lru, entry); } - spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); @@ -1596,8 +1530,14 @@ bool zswap_store(struct folio *folio) return true; +store_failed: + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(zswap_find_zpool(entry), entry->handle); put_pool: - zswap_pool_put(entry->pool); + zswap_pool_put(entry->pool); + } freepage: zswap_entry_cache_free(entry); reject: @@ -1608,11 +1548,9 @@ bool zswap_store(struct folio *folio) * possibly stale entry which was previously stored at this offset. * Otherwise, writeback could overwrite the new data in the swapfile. */ - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = xa_erase(tree, offset); if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); return false; shrink: @@ -1626,18 +1564,12 @@ bool zswap_load(struct folio *folio) pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; bool swapcache = folio_test_swapcache(folio); - struct zswap_tree *tree = swap_zswap_tree(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - spin_unlock(&tree->lock); - return false; - } /* * When reading into the swapcache, invalidate our entry. The * swapcache can be the authoritative owner of the page and @@ -1651,8 +1583,12 @@ bool zswap_load(struct folio *folio) * the fault fails. We remain the primary owner of the entry.) 
*/ if (swapcache) - zswap_rb_erase(&tree->rbroot, entry); - spin_unlock(&tree->lock); + entry = xa_erase(tree, offset); + else + entry = xa_load(tree, offset); + + if (!entry) + return false; if (entry->length) zswap_decompress(entry, page); @@ -1677,19 +1613,17 @@ bool zswap_load(struct folio *folio) void zswap_invalidate(swp_entry_t swp) { pgoff_t offset = swp_offset(swp); - struct zswap_tree *tree = swap_zswap_tree(swp); + struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = xa_erase(tree, offset); if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); } int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *trees, *tree; + struct xarray *trees, *tree; unsigned int nr, i; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); @@ -1699,11 +1633,8 @@ int zswap_swapon(int type, unsigned long nr_pages) return -ENOMEM; } - for (i = 0; i < nr; i++) { - tree = trees + i; - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - } + for (i = 0; i < nr; i++) + xa_init(trees + i); nr_zswap_trees[type] = nr; zswap_trees[type] = trees; @@ -1712,7 +1643,7 @@ int zswap_swapon(int type, unsigned long nr_pages) void zswap_swapoff(int type) { - struct zswap_tree *trees = zswap_trees[type]; + struct xarray *trees = zswap_trees[type]; unsigned int i; if (!trees) @@ -1720,7 +1651,7 @@ void zswap_swapoff(int type) /* try_to_unuse() invalidated all the entries already */ for (i = 0; i < nr_zswap_trees[type]; i++) - WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); + WARN_ON_ONCE(!xa_empty(trees + i)); kvfree(trees); nr_zswap_trees[type] = 0; -- Gitee From b5718aec921ebbeacb875d30670950fbf2129a3b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 13 Apr 2024 02:24:04 +0000 Subject: [PATCH 243/484] mm: zswap: always shrink in zswap_store() if zswap_pool_reached_full commit 4ea3fa9dd2e9c58920d73b1b2cd6bcc6f14ae0e0 upstream Conflicts: none Backport-reason: ZSWAP mass update Patch series "zswap same-filled and limit checking cleanups", v3. Miscellaneous cleanups for limit checking and same-filled handling in the store path. This series was broken out of the "zswap: store zero-filled pages more efficiently" series [1]. It contains the cleanups and drops the main functional changes. [1]https://lore.kernel.org/lkml/20240325235018.2028408-1-yosryahmed@google.com/ This patch (of 4): The cleanup code in zswap_store() is not pretty, particularly the 'shrink' label at the bottom that ends up jumping between cleanup labels. Instead of having a dedicated label to shrink the pool, just use zswap_pool_reached_full directly to figure out if the pool needs shrinking. zswap_pool_reached_full should be true if and only if the pool needs shrinking. The only caveat is that the value of zswap_pool_reached_full may be changed by concurrent zswap_store() calls between checking the limit and testing zswap_pool_reached_full in the cleanup code. This is fine because: - If zswap_pool_reached_full was true during limit checking then became false during the cleanup code, then someone else already took care of shrinking the pool and there is no need to queue the worker. That would be a good change. - If zswap_pool_reached_full was false during limit checking then became true during the cleanup code, then someone else hit the limit meanwhile. In this case, both threads will try to queue the worker, but it never gets queued more than once anyway. 
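A toy model of why the double queueing is harmless (illustration only, not the workqueue implementation): queueing work that is already pending is a no-op, matching queue_work() returning false when the work item is already queued.

#include <stdbool.h>
#include <stdio.h>

static bool shrink_work_pending;	/* models the single pending work item */

static bool toy_queue_work(void)
{
	if (shrink_work_pending)
		return false;		/* already queued: nothing happens */
	shrink_work_pending = true;
	return true;
}

int main(void)
{
	/* Two racing stores both see the pool over the limit and both queue. */
	printf("first caller queued work:  %d\n", toy_queue_work());
	printf("second caller queued work: %d\n", toy_queue_work());
	return 0;
}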
Also, calling queue_work() multiple times when the limit is hit could already happen today, so this isn't a significant change in any way. Link: https://lkml.kernel.org/r/20240413022407.785696-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20240413022407.785696-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: "Maciej S. Szmigiero" Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 6e536a70ab78..b9e169401282 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1431,12 +1431,12 @@ bool zswap_store(struct folio *folio) if (cur_pages >= max_pages) { zswap_pool_limit_hit++; zswap_pool_reached_full = true; - goto shrink; + goto reject; } if (zswap_pool_reached_full) { if (cur_pages > zswap_accept_thr_pages()) - goto shrink; + goto reject; else zswap_pool_reached_full = false; } @@ -1542,6 +1542,8 @@ bool zswap_store(struct folio *folio) zswap_entry_cache_free(entry); reject: obj_cgroup_put(objcg); + if (zswap_pool_reached_full) + queue_work(shrink_wq, &zswap_shrink_work); check_old: /* * If the zswap store fails or zswap is disabled, we must invalidate the @@ -1552,10 +1554,6 @@ bool zswap_store(struct folio *folio) if (entry) zswap_entry_free(entry); return false; - -shrink: - queue_work(shrink_wq, &zswap_shrink_work); - goto reject; } bool zswap_load(struct folio *folio) -- Gitee From 788d552a451531d30c2f4008909f0919c5a59171 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 13 Apr 2024 02:24:05 +0000 Subject: [PATCH 244/484] mm: zswap: refactor limit checking from zswap_store() commit 82e0f8e47b406bb5948c2300a02ac6ede21532c1 upstream Conflicts: none Backport-reason: ZSWAP mass update Refactor limit and acceptance threshold checking outside of zswap_store(). This code will be moved around in a following patch, so it would be cleaner to move a function call around. Link: https://lkml.kernel.org/r/20240413022407.785696-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Johannes Weiner Cc: "Maciej S. 
Szmigiero" Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index b9e169401282..823e7cb079ea 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -516,6 +516,21 @@ unsigned long zswap_total_pages(void) return total; } +static bool zswap_check_limits(void) +{ + unsigned long cur_pages = zswap_total_pages(); + unsigned long max_pages = zswap_max_pages(); + + if (cur_pages >= max_pages) { + zswap_pool_limit_hit++; + zswap_pool_reached_full = true; + } else if (zswap_pool_reached_full && + cur_pages <= zswap_accept_thr_pages()) { + zswap_pool_reached_full = false; + } + return zswap_pool_reached_full; +} + /********************************* * param callbacks **********************************/ @@ -1401,7 +1416,6 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - unsigned long max_pages, cur_pages; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1424,22 +1438,8 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* Check global limits */ - cur_pages = zswap_total_pages(); - max_pages = zswap_max_pages(); - - if (cur_pages >= max_pages) { - zswap_pool_limit_hit++; - zswap_pool_reached_full = true; + if (zswap_check_limits()) goto reject; - } - - if (zswap_pool_reached_full) { - if (cur_pages > zswap_accept_thr_pages()) - goto reject; - else - zswap_pool_reached_full = false; - } /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); -- Gitee From ea8b40adb504322aea52f6221c9eed2b9f75af62 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 13 Apr 2024 02:24:06 +0000 Subject: [PATCH 245/484] mm: zswap: move more same-filled pages checks outside of zswap_store() commit e87b881489085b4832ad417da653739cbace49e7 upstream Conflicts: none Backport-reason: ZSWAP mass update Currently, zswap_store() checks zswap_same_filled_pages_enabled, kmaps the folio, then calls zswap_is_page_same_filled() to check the folio contents. Move this logic into zswap_is_page_same_filled() as well (and rename it to use 'folio' while we are at it). This makes zswap_store() cleaner, and makes following changes to that logic contained within the helper. While we are at it: - Rename the insert_entry label to store_entry to match xa_store(). - Add comment headers for same-filled functions and the main API functions (load, store, invalidate, swapon, swapoff). No functional change intended. Link: https://lkml.kernel.org/r/20240413022407.785696-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: "Maciej S. 
Szmigiero" Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 823e7cb079ea..ff6aeca280b0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1378,26 +1378,35 @@ static void shrink_worker(struct work_struct *w) } while (zswap_total_pages() > thr); } -static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +/********************************* +* same-filled functions +**********************************/ +static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) { unsigned long *page; unsigned long val; unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; + bool ret = false; - page = (unsigned long *)ptr; + if (!zswap_same_filled_pages_enabled) + return false; + + page = kmap_local_folio(folio, 0); val = page[0]; if (val != page[last_pos]) - return 0; + goto out; for (pos = 1; pos < last_pos; pos++) { if (val != page[pos]) - return 0; + goto out; } *value = val; - - return 1; + ret = true; +out: + kunmap_local(page); + return ret; } static void zswap_fill_page(void *ptr, unsigned long value) @@ -1408,6 +1417,9 @@ static void zswap_fill_page(void *ptr, unsigned long value) memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } +/********************************* +* main API +**********************************/ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; @@ -1416,6 +1428,7 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; + unsigned long value; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1448,19 +1461,11 @@ bool zswap_store(struct folio *folio) goto reject; } - if (zswap_same_filled_pages_enabled) { - unsigned long value; - u8 *src; - - src = kmap_local_folio(folio, 0); - if (zswap_is_page_same_filled(src, &value)) { - kunmap_local(src); - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto insert_entry; - } - kunmap_local(src); + if (zswap_is_folio_same_filled(folio, &value)) { + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto store_entry; } if (!zswap_non_same_filled_pages_enabled) @@ -1483,7 +1488,7 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; -insert_entry: +store_entry: entry->swpentry = swp; entry->objcg = objcg; -- Gitee From 5b19b9ad6a9dc93294f4a845002c79f02a0d2d36 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 13 Apr 2024 02:24:07 +0000 Subject: [PATCH 246/484] mm: zswap: remove same_filled module params commit c074e1467f8546c8f8c9ea2128fb8bf8c4579418 upstream Conflicts: none Backport-reason: ZSWAP mass update These knobs offer more fine-grained control to userspace than needed and directly expose/influence kernel implementation; remove them. For disabling same_filled handling, there is no logical reason to refuse storing same-filled pages more efficiently and opt for compression. Scanning pages for patterns may be an argument, but the page contents will be read into the CPU cache anyway during compression. Also, removing the same_filled handling code does not move the needle significantly in terms of performance anyway [1]. 
For disabling non_same_filled handling, it was added when the compressed pages in zswap were not being properly charged to memcgs, as workloads could escape the accounting with compression [2]. This is no longer the case after commit f4840ccfca25 ("zswap: memcg accounting"), and using zswap without compression does not make much sense. [1]https://lore.kernel.org/lkml/CAJD7tkaySFP2hBQw4pnZHJJwe3bMdjJ1t9VC2VJd=khn1_TXvA@mail.gmail.com/ [2]https://lore.kernel.org/lkml/19d5cdee-2868-41bd-83d5-6da75d72e940@maciej.szmigiero.name/ [yosryahmed@google.com: remove same_filled_pages from docs] Link: https://lkml.kernel.org/r/ZhxFVggdyvCo79jc@google.com Link: https://lkml.kernel.org/r/20240413022407.785696-5-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Cc: "Maciej S. Szmigiero" Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/zswap.rst | 29 ------------------- .../driver-api/crypto/iaa/iaa-crypto.rst | 2 -- mm/zswap.c | 19 ------------ 3 files changed, 50 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index b42132969e31..59783134afbe 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -111,35 +111,6 @@ checked if it is a same-value filled page before compressing it. If true, the compressed length of the page is set to zero and the pattern or same-filled value is stored. -Same-value filled pages identification feature is enabled by default and can be -disabled at boot time by setting the ``same_filled_pages_enabled`` attribute -to 0, e.g. ``zswap.same_filled_pages_enabled=0``. It can also be enabled and -disabled at runtime using the sysfs ``same_filled_pages_enabled`` -attribute, e.g.:: - - echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled - -When zswap same-filled page identification is disabled at runtime, it will stop -checking for the same-value filled pages during store operation. -In other words, every page will be then considered non-same-value filled. -However, the existing pages which are marked as same-value filled pages remain -stored unchanged in zswap until they are either loaded or invalidated. - -In some circumstances it might be advantageous to make use of just the zswap -ability to efficiently store same-filled pages without enabling the whole -compressed page storage. -In this case the handling of non-same-value pages by zswap (enabled by default) -can be disabled by setting the ``non_same_filled_pages_enabled`` attribute -to 0, e.g. ``zswap.non_same_filled_pages_enabled=0``. -It can also be enabled and disabled at runtime using the sysfs -``non_same_filled_pages_enabled`` attribute, e.g.:: - - echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled - -Disabling both ``zswap.same_filled_pages_enabled`` and -``zswap.non_same_filled_pages_enabled`` effectively disables accepting any new -pages by zswap. 
- To prevent zswap from shrinking pool when zswap is full and there's a high pressure on swap (this will result in flipping pages in and out zswap pool without any real benefit but with a performance drop for the system), a diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst index f4fba897d931..bba40158dd5c 100644 --- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst +++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst @@ -471,7 +471,6 @@ Use the following commands to enable zswap:: # echo deflate-iaa > /sys/module/zswap/parameters/compressor # echo zsmalloc > /sys/module/zswap/parameters/zpool # echo 1 > /sys/module/zswap/parameters/enabled - # echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled # echo 100 > /proc/sys/vm/swappiness # echo never > /sys/kernel/mm/transparent_hugepage/enabled # echo 1 > /proc/sys/vm/overcommit_memory @@ -621,7 +620,6 @@ the 'fixed' compression mode:: echo deflate-iaa > /sys/module/zswap/parameters/compressor echo zsmalloc > /sys/module/zswap/parameters/zpool echo 1 > /sys/module/zswap/parameters/enabled - echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled echo 100 > /proc/sys/vm/swappiness echo never > /sys/kernel/mm/transparent_hugepage/enabled diff --git a/mm/zswap.c b/mm/zswap.c index ff6aeca280b0..cc2764755325 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -123,19 +123,6 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* - * Enable/disable handling same-value filled pages (enabled by default). - * If disabled every page is considered non-same-value filled. - */ -static bool zswap_same_filled_pages_enabled = true; -module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, - bool, 0644); - -/* Enable/disable handling non-same-value filled pages (enabled by default) */ -static bool zswap_non_same_filled_pages_enabled = true; -module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, - bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -1388,9 +1375,6 @@ static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; bool ret = false; - if (!zswap_same_filled_pages_enabled) - return false; - page = kmap_local_folio(folio, 0); val = page[0]; @@ -1468,9 +1452,6 @@ bool zswap_store(struct folio *folio) goto store_entry; } - if (!zswap_non_same_filled_pages_enabled) - goto freepage; - /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) -- Gitee From c8fbde82172438255e03710a61ff3e910372513d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Mon, 10 Jun 2024 15:30:37 +0100 Subject: [PATCH 247/484] mm: do not start/end writeback for pages stored in zswap commit 9ba85f5529f1110aa6d787f8f12553a713f945fc upstream Conflicts: none Backport-reason: ZSWAP mass update Most of the work done in folio_start_writeback is reversed in folio_end_writeback. For e.g. NR_WRITEBACK and NR_ZONE_WRITE_PENDING are incremented in start_writeback and decremented in end_writeback. Calling end_writeback immediately after start_writeback (separated by folio_unlock) cancels the affect of most of the work done in start hence can be removed. 
There is some extra work done in folio_end_writeback, however it is incorrect/not applicable to zswap: - folio_end_writeback incorrectly increments NR_WRITTEN counter, eventhough the pages aren't written to disk, hence this change corrects this behaviour. - folio_end_writeback calls folio_rotate_reclaimable, but that only makes sense for async writeback pages, while for zswap pages are synchronously reclaimed. Link: https://lkml.kernel.org/r/20240612100109.1616626-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20240610143037.812955-1-usamaarif642@gmail.com Signed-off-by: Usama Arif Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Suggested-by: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 1ea085329e1c..7016aad14833 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -195,9 +195,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) return ret; } if (zswap_store(folio)) { - folio_start_writeback(folio); folio_unlock(folio); - folio_end_writeback(folio); return 0; } -- Gitee From 1f9a5ce1ebcb1c47f7588dd3c07de5d98f5b4b53 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 11 Jun 2024 02:45:14 +0000 Subject: [PATCH 248/484] mm: zswap: rename is_zswap_enabled() to zswap_is_enabled() commit 2b33a97c94bc44468fc1d54b745269c0cf0b7bb2 upstream Conflicts: none Backport-reason: ZSWAP mass update In preparation for introducing a similar function, rename is_zswap_enabled() to use zswap_* prefix like other zswap functions. Link: https://lkml.kernel.org/r/20240611024516.1375191-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Barry Song Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zswap.h | 4 ++-- mm/memcontrol.c | 2 +- mm/zswap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a85b941db97..ce5e7bfe8f1e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -35,7 +35,7 @@ void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); -bool is_zswap_enabled(void); +bool zswap_is_enabled(void); #else struct zswap_lruvec_state {}; @@ -60,7 +60,7 @@ static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} static inline void zswap_folio_swapin(struct folio *folio) {} -static inline bool is_zswap_enabled(void) +static inline bool zswap_is_enabled(void) { return false; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e9ed61561b68..f48ab74f3b05 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -10268,7 +10268,7 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ - return !is_zswap_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); + return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); } static u64 zswap_current_read(struct cgroup_subsys_state *css, diff --git a/mm/zswap.c b/mm/zswap.c index cc2764755325..df36581980ca 100644 --- 
a/mm/zswap.c +++ b/mm/zswap.c @@ -131,7 +131,7 @@ static bool zswap_shrinker_enabled = IS_ENABLED( CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); -bool is_zswap_enabled(void) +bool zswap_is_enabled(void) { return zswap_enabled; } -- Gitee From 983e7d3e6cd0eeb170d76a356c849a68953e1797 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 11 Jun 2024 02:45:15 +0000 Subject: [PATCH 249/484] mm: zswap: add zswap_never_enabled() commit 2d4d2b1cfb85cc07f6d5619acb882d8b11e55cf4 upstream Conflicts: none Backport-reason: ZSWAP mass update Add zswap_never_enabled() to skip the xarray lookup in zswap_load() if zswap was never enabled on the system. It is implemented using static branches for efficiency, as enabling zswap should be a rare event. This could shave some cycles off zswap_load() when CONFIG_ZSWAP is used but zswap is never enabled. However, the real motivation behind this patch is two-fold: - Incoming large folio swapin work will need to fallback to order-0 folios if zswap was ever enabled, because any part of the folio could be in zswap, until proper handling of large folios with zswap is added. - A warning and recovery attempt will be added in a following change in case the above was not done incorrectly. Zswap will fail the read if the folio is large and it was ever enabled. Expose zswap_never_enabled() in the header for the swapin work to use it later. [yosryahmed@google.com: expose zswap_never_enabled() in the header] Link: https://lkml.kernel.org/r/Zmjf0Dr8s9xSW41X@google.com Link: https://lkml.kernel.org/r/20240611024516.1375191-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Nhat Pham Cc: Barry Song Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zswap.h | 6 ++++++ mm/zswap.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index ce5e7bfe8f1e..bf83ae5e285d 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -36,6 +36,7 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); bool zswap_is_enabled(void); +bool zswap_never_enabled(void); #else struct zswap_lruvec_state {}; @@ -65,6 +66,11 @@ static inline bool zswap_is_enabled(void) return false; } +static inline bool zswap_never_enabled(void) +{ + return false; +} + #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/zswap.c b/mm/zswap.c index df36581980ca..32b585e2ce8f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -83,6 +83,7 @@ static bool zswap_pool_reached_full; static int zswap_setup(void); /* Enable/disable zswap */ +static DEFINE_STATIC_KEY_MAYBE(CONFIG_ZSWAP_DEFAULT_ON, zswap_ever_enabled); static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); @@ -136,6 +137,11 @@ bool zswap_is_enabled(void) return zswap_enabled; } +bool zswap_never_enabled(void) +{ + return !static_branch_maybe(CONFIG_ZSWAP_DEFAULT_ON, &zswap_ever_enabled); +} + /********************************* * data structures **********************************/ @@ -1554,6 +1560,9 @@ bool zswap_load(struct folio *folio) VM_WARN_ON_ONCE(!folio_test_locked(folio)); + if (zswap_never_enabled()) + return false; + /* * When reading into the swapcache, invalidate our entry. 
The * swapcache can be the authoritative owner of the page and @@ -1735,6 +1744,7 @@ static int zswap_setup(void) zpool_get_type(pool->zpools[0])); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; + static_branch_enable(&zswap_ever_enabled); } else { pr_err("pool creation failed\n"); zswap_enabled = false; -- Gitee From 6011c8db48bad7a05e0cfa60deb6fc1a6d33fb62 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 11 Jun 2024 02:45:16 +0000 Subject: [PATCH 250/484] mm: zswap: handle incorrect attempts to load large folios commit c63f210d4891f5b1b1057a0d7c91d2b0d15431d1 upstream Conflicts: none Backport-reason: ZSWAP mass update Zswap does not support storing or loading large folios. Until proper support is added, attempts to load large folios from zswap are a bug. For example, if a swapin fault observes that contiguous PTEs are pointing to contiguous swap entries and tries to swap them in as a large folio, swap_read_folio() will pass in a large folio to zswap_load(), but zswap_load() will only effectively load the first page in the folio. If the first page is not in zswap, the folio will be read from disk, even though other pages may be in zswap. In both cases, this will lead to silent data corruption. Proper support needs to be added before large folio swapins and zswap can work together. Looking at callers of swap_read_folio(), it seems like they are either allocated from __read_swap_cache_async() or do_swap_page() in the SWP_SYNCHRONOUS_IO path. Both of which allocate order-0 folios, so everything is fine for now. However, there is ongoing work to add to support large folio swapins [1]. To make sure new development does not break zswap (or get broken by zswap), add minimal handling of incorrect loads of large folios to zswap. First, move the call folio_mark_uptodate() inside zswap_load(). If a large folio load is attempted, and zswap was ever enabled on the system, return 'true' without calling folio_mark_uptodate(). This will prevent the folio from being read from disk, and will emit an IO error because the folio is not uptodate (e.g. do_swap_fault() will return VM_FAULT_SIGBUS). It may not be reliable recovery in all cases, but it is better than nothing. This was tested by hacking the allocation in __read_swap_cache_async() to use order 2 and __GFP_COMP. In the future, to handle this correctly, the swapin code should: (a) Fall back to order-0 swapins if zswap was ever used on the machine, because compressed pages remain in zswap after it is disabled. (b) Add proper support to swapin large folios from zswap (fully or partially). Probably start with (a) then followup with (b). 
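The recovery behaviour described above boils down to the following shape (a toy sketch, not the actual zswap_load() change below): claim the read for a large folio without marking it uptodate, so the fault path reports an I/O error instead of returning silently corrupted data.

#include <stdbool.h>

struct toy_folio {
	bool large;
	bool uptodate;
};

/* Returns true when zswap handled the read; illustration only. */
static bool toy_zswap_load(struct toy_folio *folio, bool zswap_ever_enabled)
{
	if (!zswap_ever_enabled)
		return false;		/* fall through to the normal swap read */

	if (folio->large)
		return true;		/* claim the read, leave !uptodate -> I/O error */

	/* ... a normal order-0 load would decompress and then ... */
	folio->uptodate = true;
	return true;
}

int main(void)
{
	struct toy_folio large = { .large = true, .uptodate = false };

	/* Handled but not uptodate: the caller sees an error (e.g. SIGBUS). */
	return toy_zswap_load(&large, true) && !large.uptodate ? 0 : 1;
}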
[1]https://lore.kernel.org/linux-mm/20240304081348.197341-6-21cnbao@gmail.com/ Link: https://lkml.kernel.org/r/20240611024516.1375191-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Barry Song Cc: Barry Song Cc: Chengming Zhou Cc: Chris Li Cc: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 1 - mm/zswap.c | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/page_io.c b/mm/page_io.c index 7016aad14833..7cb539cf2c5c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -528,7 +528,6 @@ void swap_read_folio(struct folio *folio, bool synchronous, delayacct_swapin_start(); if (zswap_load(folio)) { - folio_mark_uptodate(folio); folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_read_folio_fs(folio, plug); diff --git a/mm/zswap.c b/mm/zswap.c index 32b585e2ce8f..0c7e4b774602 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1563,6 +1563,17 @@ bool zswap_load(struct folio *folio) if (zswap_never_enabled()) return false; + /* + * Large folios should not be swapped in while zswap is being used, as + * they are not properly handled. Zswap does not properly load large + * folios, and a large folio may only be partially in zswap. + * + * Return true without marking the folio uptodate so that an IO error is + * emitted (e.g. do_swap_page() will sigbus). + */ + if (WARN_ON_ONCE(folio_test_large(folio))) + return true; + /* * When reading into the swapcache, invalidate our entry. The * swapcache can be the authoritative owner of the page and @@ -1600,6 +1611,7 @@ bool zswap_load(struct folio *folio) folio_mark_dirty(folio); } + folio_mark_uptodate(folio); return true; } -- Gitee From 8a25fa25c957ce017a28dd45b3afffbbee26d426 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jun 2024 20:57:41 +0800 Subject: [PATCH 251/484] mm/zswap: use only one pool in zswap commit 8edc9c4e72fe0cc9f7a258649827dafa9542c8ac upstream Conflicts: none Backport-reason: ZSWAP mass update Zswap uses 32 pools to workaround the locking scalability problem in zswap backends (mainly zsmalloc nowadays), which brings its own problems like memory waste and more memory fragmentation. Testing results show that we can have near performance with only one pool in zswap after changing zsmalloc to use per-size_class lock instead of pool spinlock. Testing kernel build (make bzImage -j32) on tmpfs with memory.max=1GB, and zswap shrinker enabled with 10GB swapfile on ext4. real user sys 6.10.0-rc3 138.18 1241.38 1452.73 6.10.0-rc3-onepool 149.45 1240.45 1844.69 6.10.0-rc3-onepool-perclass 138.23 1242.37 1469.71 And do the same testing using zbud, which shows a little worse performance as expected since we don't do any locking optimization for zbud. I think it's acceptable since zsmalloc became a lot more popular than other backends, and we may want to support only zsmalloc in the future. 
real user sys 6.10.0-rc3-zbud 138.23 1239.58 1430.09 6.10.0-rc3-onepool-zbud 139.64 1241.37 1516.59 [chengming.zhou@linux.dev: fix error handling in zswap_pool_create(), per Dan Carpenter] Link: https://lkml.kernel.org/r/20240621-zsmalloc-lock-mm-everything-v2-2-d30e9cd2b793@linux.dev [chengming.zhou@linux.dev: fix error handling again in zswap_pool_create(), per Yosry] Link: https://lkml.kernel.org/r/20240625-zsmalloc-lock-mm-everything-v3-2-ad941699cb61@linux.dev Link: https://lkml.kernel.org/r/20240617-zsmalloc-lock-mm-everything-v1-2-5e5081ea11b3@linux.dev Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Johannes Weiner Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 61 ++++++++++++++++++------------------------------------ 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0c7e4b774602..921b238a70c9 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -124,9 +124,6 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* Number of zpools in zswap_pool (empirically determined for scalability) */ -#define ZSWAP_NR_ZPOOLS 32 - /* Enable/disable memory pressure-based shrinker. */ static bool zswap_shrinker_enabled = IS_ENABLED( CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); @@ -161,7 +158,7 @@ struct crypto_acomp_ctx { * needs to be verified that it's still valid in the tree. */ struct zswap_pool { - struct zpool *zpools[ZSWAP_NR_ZPOOLS]; + struct zpool *zpool; struct crypto_acomp_ctx __percpu *acomp_ctx; struct percpu_ref ref; struct list_head list; @@ -243,7 +240,7 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp) #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ - zpool_get_type((p)->zpools[0])) + zpool_get_type((p)->zpool)) /********************************* * pool functions @@ -252,7 +249,6 @@ static void __zswap_pool_empty(struct percpu_ref *ref); static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { - int i; struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; @@ -273,18 +269,14 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (!pool) return NULL; - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", - atomic_inc_return(&zswap_pools_count)); - - pool->zpools[i] = zpool_create_pool(type, name, gfp); - if (!pool->zpools[i]) { - pr_err("%s zpool not available\n", type); - goto error; - } + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + pool->zpool = zpool_create_pool(type, name, gfp); + if (!pool->zpool) { + pr_err("%s zpool not available\n", type); + goto error; } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); + pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); @@ -317,8 +309,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); - while (i--) - zpool_destroy_pool(pool->zpools[i]); + if (pool->zpool) + zpool_destroy_pool(pool->zpool); kfree(pool); return NULL; } @@ -367,15 +359,12 @@ static struct zswap_pool 
*__zswap_pool_create_fallback(void) static void zswap_pool_destroy(struct zswap_pool *pool) { - int i; - zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - zpool_destroy_pool(pool->zpools[i]); + zpool_destroy_pool(pool->zpool); kfree(pool); } @@ -470,8 +459,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) + if (strcmp(zpool_get_type(pool->zpool), type)) continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_get(pool)) @@ -498,12 +486,8 @@ unsigned long zswap_total_pages(void) unsigned long total = 0; rcu_read_lock(); - list_for_each_entry_rcu(pool, &zswap_pools, list) { - int i; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - total += zpool_get_total_pages(pool->zpools[i]); - } + list_for_each_entry_rcu(pool, &zswap_pools, list) + total += zpool_get_total_pages(pool->zpool); rcu_read_unlock(); return total; @@ -808,11 +792,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } -static struct zpool *zswap_find_zpool(struct zswap_entry *entry) -{ - return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))]; -} - /* * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. @@ -823,7 +802,7 @@ static void zswap_entry_free(struct zswap_entry *entry) atomic_dec(&zswap_same_filled_pages); else { zswap_lru_del(&zswap_list_lru, entry); - zpool_free(zswap_find_zpool(entry), entry->handle); + zpool_free(entry->pool->zpool, entry->handle); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -948,7 +927,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) if (comp_ret) goto unlock; - zpool = zswap_find_zpool(entry); + zpool = entry->pool->zpool; gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; @@ -977,7 +956,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) static void zswap_decompress(struct zswap_entry *entry, struct page *page) { - struct zpool *zpool = zswap_find_zpool(entry); + struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; u8 *src; @@ -1526,7 +1505,7 @@ bool zswap_store(struct folio *folio) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - zpool_free(zswap_find_zpool(entry), entry->handle); + zpool_free(entry->pool->zpool, entry->handle); put_pool: zswap_pool_put(entry->pool); } @@ -1753,7 +1732,7 @@ static int zswap_setup(void) pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, - zpool_get_type(pool->zpools[0])); + zpool_get_type(pool->zpool)); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; static_branch_enable(&zswap_ever_enabled); -- Gitee From a8f560cecbf17763a734b2bc58e2c88d2a297a6e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 15 Jul 2024 15:23:05 -0500 Subject: [PATCH 252/484] mm/zswap: fix a white space issue commit b749cb0d61ca1ed120a555badf3a7b025b8a7fc2 upstream Conflicts: none Backport-reason: ZSWAP mass update We accidentally deleted a tab in commit f84152e9efc5 
("mm/zswap: use only one pool in zswap"). Add it back. Link: https://lkml.kernel.org/r/c15066a0-f061-42c9-b0f5-d60281d3d5d8@stanley.mountain Signed-off-by: Dan Carpenter Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 921b238a70c9..89f647c92e34 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -802,7 +802,7 @@ static void zswap_entry_free(struct zswap_entry *entry) atomic_dec(&zswap_same_filled_pages); else { zswap_lru_del(&zswap_list_lru, entry); - zpool_free(entry->pool->zpool, entry->handle); + zpool_free(entry->pool->zpool, entry->handle); zswap_pool_put(entry->pool); } if (entry->objcg) { -- Gitee From 302db3837c140e5b5a83c80373fe33f987dc3e49 Mon Sep 17 00:00:00 2001 From: Takero Funaki Date: Wed, 31 Jul 2024 00:49:09 +0000 Subject: [PATCH 253/484] mm: zswap: fix global shrinker memcg iteration commit c5519e0a9bfb6365445de0521a6fe7000b375ecf upstream Conflicts: none Backport-reason: ZSWAP mass update Patch series "mm: zswap: fixes for global shrinker", v5. This series addresses issues in the zswap global shrinker that could not shrink stored pages. With this series, the shrinker continues to shrink pages until it reaches the accept threshold more reliably, gives much higher writeback when the zswap pool limit is hit. This patch (of 2): This patch fixes an issue where the zswap global shrinker stopped iterating through the memcg tree. The problem was that shrink_worker() would restart iterating memcg tree from the tree root, considering an offline memcg as a failure, and abort shrinking after encountering the same offline memcg 16 times even if there is only one offline memcg. After this change, an offline memcg in the tree is no longer considered a failure. This allows the shrinker to continue shrinking the other online memcgs regardless of whether an offline memcg exists, gives higher zswap writeback activity. To avoid holding refcount of offline memcg encountered during the memcg tree walking, shrink_worker() must continue iterating to release the offline memcg to ensure the next memcg stored in the cursor is online. The offline memcg cleaner has also been changed to avoid the same issue. When the next memcg of the offlined memcg is also offline, the refcount stored in the iteration cursor was held until the next shrink_worker() run. The cleaner must release the offline memcg recursively. [yosryahmed@google.com: make critical section more obvious, unify comments] Link: https://lkml.kernel.org/r/CAJD7tkaScz+SbB90Q1d5mMD70UfM2a-J2zhXDT9sePR7Qap45Q@mail.gmail.com Link: https://lkml.kernel.org/r/20240731004918.33182-1-flintglass@gmail.com Link: https://lkml.kernel.org/r/20240731004918.33182-2-flintglass@gmail.com Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware") Signed-off-by: Takero Funaki Signed-off-by: Yosry Ahmed Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 76 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 89f647c92e34..b92d1457bd2f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -764,12 +764,25 @@ void zswap_folio_swapin(struct folio *folio) } } +/* + * This function should be called when a memcg is being offlined. 
+ * + * Since the global shrinker shrink_worker() may hold a reference + * of the memcg, we must check and release the reference in + * zswap_next_shrink. + * + * shrink_worker() must handle the case where this function releases + * the reference of memcg being shrunk. + */ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { /* lock out zswap shrinker walking memcg tree */ spin_lock(&zswap_shrink_lock); - if (zswap_next_shrink == memcg) - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + if (zswap_next_shrink == memcg) { + do { + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + } while (zswap_next_shrink && !mem_cgroup_online(zswap_next_shrink)); + } spin_unlock(&zswap_shrink_lock); } @@ -1299,43 +1312,48 @@ static void shrink_worker(struct work_struct *w) /* Reclaim down to the accept threshold */ thr = zswap_accept_thr_pages(); - /* global reclaim will select cgroup in a round-robin fashion. */ + /* + * Global reclaim will select cgroup in a round-robin fashion. + * + * We save iteration cursor memcg into zswap_next_shrink, + * which can be modified by the offline memcg cleaner + * zswap_memcg_offline_cleanup(). + * + * Since the offline cleaner is called only once, we cannot leave an + * offline memcg reference in zswap_next_shrink. + * We can rely on the cleaner only if we get online memcg under lock. + * + * If we get an offline memcg, we cannot determine if the cleaner has + * already been called or will be called later. We must put back the + * reference before returning from this function. Otherwise, the + * offline memcg left in zswap_next_shrink will hold the reference + * until the next run of shrink_worker(). + */ do { - spin_lock(&zswap_shrink_lock); - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); - memcg = zswap_next_shrink; - /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. + * Start shrinking from the next memcg after zswap_next_shrink. + * When the offline cleaner has already advanced the cursor, + * advancing the cursor here overlooks one memcg, but this + * should be negligibly rare. * - * Note that if we got an online memcg, we will keep the extra - * reference in case the original reference obtained by mem_cgroup_iter - * is dropped by the zswap memcg offlining callback, ensuring that the - * memcg is not killed when we are reclaiming. + * If we get an online memcg, keep the extra reference in case + * the original one obtained by mem_cgroup_iter() is dropped by + * zswap_memcg_offline_cleanup() while we are shrinking the + * memcg. 
*/ - if (!memcg) { - spin_unlock(&zswap_shrink_lock); - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - - if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - zswap_next_shrink = NULL; - spin_unlock(&zswap_shrink_lock); + spin_lock(&zswap_shrink_lock); + do { + memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + zswap_next_shrink = memcg; + } while (memcg && !mem_cgroup_tryget_online(memcg)); + spin_unlock(&zswap_shrink_lock); + if (!memcg) { if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } - spin_unlock(&zswap_shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ -- Gitee From 9f290e43f4344eb7a26c94eb8f452227a9342301 Mon Sep 17 00:00:00 2001 From: Takero Funaki Date: Wed, 31 Jul 2024 00:49:10 +0000 Subject: [PATCH 254/484] mm: zswap: fix global shrinker error handling logic commit 81920438a6dce78e6eac17ee34f29bf6c32074b1 upstream Conflicts: none Backport-reason: ZSWAP mass update This patch fixes the zswap global shrinker, which did not shrink the zpool as expected. The issue addressed is that shrink_worker() did not distinguish between unexpected errors and expected errors, such as failed writeback from an empty memcg. The shrinker would stop shrinking after iterating through the memcg tree 16 times, even if there was only one empty memcg. With this patch, the shrinker no longer considers encountering an empty memcg, encountering a memcg with writeback disabled, or reaching the end of a memcg tree walk as a failure, as long as there are memcgs that are candidates for writeback. Systems with one or more empty memcgs will now observe significantly higher zswap writeback activity after the zswap pool limit is hit. To avoid an infinite loop when there are no writeback candidates, this patch tracks writeback attempts during memcg tree walks and limits reties if no writeback candidates are found. To handle the empty memcg case, the helper function shrink_memcg() is modified to check if the memcg is empty and then return -ENOENT. Link: https://lkml.kernel.org/r/20240731004918.33182-3-flintglass@gmail.com Fixes: a65b0e7607cc ("zswap: make shrinking memcg-aware") Signed-off-by: Takero Funaki Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index b92d1457bd2f..164acdf50831 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1282,10 +1282,10 @@ static struct shrinker *zswap_alloc_shrinker(void) static int shrink_memcg(struct mem_cgroup *memcg) { - int nid, shrunk = 0; + int nid, shrunk = 0, scanned = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) - return -EINVAL; + return -ENOENT; /* * Skip zombies because their LRUs are reparented and we would be @@ -1299,21 +1299,34 @@ static int shrink_memcg(struct mem_cgroup *memcg) shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); + scanned += 1 - nr_to_walk; } + + if (!scanned) + return -ENOENT; + return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; - int ret, failures = 0; + int ret, failures = 0, attempts = 0; unsigned long thr; /* Reclaim down to the accept threshold */ thr = zswap_accept_thr_pages(); /* - * Global reclaim will select cgroup in a round-robin fashion. 
+ * Global reclaim will select cgroup in a round-robin fashion from all + * online memcgs, but memcgs that have no pages in zswap and + * writeback-disabled memcgs (memory.zswap.writeback=0) are not + * candidates for shrinking. + * + * Shrinking will be aborted if we encounter the following + * MAX_RECLAIM_RETRIES times: + * - No writeback-candidate memcgs found in a memcg tree walk. + * - Shrinking a writeback-candidate memcg failed. * * We save iteration cursor memcg into zswap_next_shrink, * which can be modified by the offline memcg cleaner @@ -1349,9 +1362,14 @@ static void shrink_worker(struct work_struct *w) spin_unlock(&zswap_shrink_lock); if (!memcg) { - if (++failures == MAX_RECLAIM_RETRIES) + /* + * Continue shrinking without incrementing failures if + * we found candidate memcgs in the last tree walk. + */ + if (!attempts && ++failures == MAX_RECLAIM_RETRIES) break; + attempts = 0; goto resched; } @@ -1359,8 +1377,16 @@ static void shrink_worker(struct work_struct *w) /* drop the extra reference */ mem_cgroup_put(memcg); - if (ret == -EINVAL) - break; + /* + * There are no writeback-candidate pages in the memcg. + * This is not an issue as long as we can find another memcg + * with pages in zswap. Skip this without incrementing attempts + * and failures. + */ + if (ret == -ENOENT) + continue; + ++attempts; + if (ret && ++failures == MAX_RECLAIM_RETRIES) break; resched: -- Gitee From 4e3af4227eba1ae4407a067dd1e8c3ca09fb86f0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 May 2024 16:12:28 +0800 Subject: [PATCH 255/484] mm,swap: fix a theoretical underflow in readahead window calculation commit 653ea80e666b5c4b157740f600ca13ad7fe22033 upstream Conflicts: none Backport-reason: SWAP cleanup Patch series "mm,swap: cleanup VMA based swap readahead window calculation". When VMA based swap readahead is introduced in commit ec560175c0b6 ("mm, swap: VMA based swap readahead"), "struct vma_swap_readahead" is defined to describe the readahead window. Because we wanted to save the PTE entries in the struct at that time. But after commit 4f8fcf4ced0b ("mm/swap: swap_vma_readahead() do the pte_offset_map()"), we no longer save PTE entries in the struct. The size of the struct becomes so small, that it's better to use the fields of the struct directly. This can simplify the code to improve the code readability. The line number of source code reduces too. A theoretical underflow issue and some related code cleanup is done in the series too. This patch (of 3): In swap readahead window calculation, if the fault PFN is smaller than the readahead window size, underflow may occurs. This is only possible in theory, because the start of the virtual address space will not be used for anonymous pages in practice. Even if underflow occurs, there will be no functional bugs. In the worst cases, some swap entries may be swapped in incorrectly and some pages may be allocate on the wrong nodes. Anyway, we still needs to fix the issue via some underflow checking. 
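The wraparound being guarded against is easy to reproduce outside the kernel. Below is a minimal user-space sketch, not kernel code: the PFN and window values are invented, but the arithmetic mirrors the lpfn = fpfn - left calculation and the (long) cast check this patch adds.

        #include <stdio.h>

        int main(void)
        {
                unsigned long fpfn = 2;                 /* hypothetical fault PFN close to address zero */
                unsigned long win = 8;                  /* hypothetical readahead window, in pages */
                unsigned long left = (win - 1) / 2;     /* pages placed before the fault PFN */
                unsigned long lpfn = fpfn - left;       /* 2 - 3 wraps around on unsigned long */

                printf("without the check: lpfn = %#lx\n", lpfn);

                /* the fix: a wrapped value looks negative when reinterpreted as signed */
                if ((long)lpfn < 0)
                        lpfn = 0;
                printf("with the check:    lpfn = %#lx\n", lpfn);
                return 0;
        }

On a 64-bit host the first line prints 0xffffffffffffffff and the second prints 0, which is the clamped window start the patched code falls back to.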
Link: https://lkml.kernel.org/r/20240531081230.310128-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20240531081230.310128-2-ying.huang@intel.com Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead") Signed-off-by: "Huang, Ying" Cc: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Mel Gorman Cc: Miaohe Lin Cc: Minchan Kim Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Kairui Song Cc: Barry Song Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/swap_state.c b/mm/swap_state.c index 776eea26d92c..e315bb065716 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -735,6 +735,8 @@ static void swap_ra_info(struct vm_fault *vmf, lpfn = fpfn - left; rpfn = fpfn + win - left; } + if ((long)lpfn < 0) + lpfn = 0; start = max3(lpfn, PFN_DOWN(vma->vm_start), PFN_DOWN(faddr & PMD_MASK)); end = min3(rpfn, PFN_DOWN(vma->vm_end), -- Gitee From 0443442725d16c6c563d61dd420a711645540565 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 May 2024 16:12:29 +0800 Subject: [PATCH 256/484] mm,swap: remove struct vma_swap_readahead commit dce08dd2e86b600d2c53b0d6cdb851f94fb455b2 upstream Conflicts: none Backport-reason: SWAP cleanup When VMA based swap readahead is introduced in commit ec560175c0b6 ("mm, swap: VMA based swap readahead"), "struct vma_swap_readahead" is defined to describe the readahead window. Because we wanted to save the PTE entries in the struct at that time. But after commit 4f8fcf4ced0b ("mm/swap: swap_vma_readahead() do the pte_offset_map()"), we no longer save PTE entries in the struct. The size of the struct becomes so small, that it's better to use the fields of the struct directly. This can simplify the code to improve the code readability. The line number of source code reduces too. No functionality change is expected in this patch. 
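For readers following the cleanup, a minimal user-space sketch of the calling-convention change may help; the names and constant values below are illustrative only and are not taken from the kernel.

        #include <stdio.h>

        /* old shape: a dedicated struct existed only to carry three small values */
        struct ra_window {
                unsigned short win;
                unsigned short offset;
                unsigned short nr_pte;
        };

        static void calc_window_old(struct ra_window *ra)
        {
                ra->win = 4;            /* placeholder window of 4 pages */
                ra->offset = 1;         /* fault page sits 1 page into the window */
                ra->nr_pte = 4;         /* PTEs to scan */
        }

        /* new shape: return the window size, pass the rest back directly */
        static unsigned short calc_window_new(unsigned short *offset, unsigned short *nr_pte)
        {
                *offset = 1;
                *nr_pte = 4;
                return 4;
        }

        int main(void)
        {
                struct ra_window ra;
                unsigned short offset, nr_pte, win;

                calc_window_old(&ra);
                win = calc_window_new(&offset, &nr_pte);
                printf("old: win=%d  new: win=%d offset=%d nr_pte=%d\n",
                       ra.win, win, offset, nr_pte);
                return 0;
        }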
Link: https://lkml.kernel.org/r/20240531081230.310128-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Mel Gorman Cc: Miaohe Lin Cc: Minchan Kim Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Kairui Song Cc: Barry Song Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index e315bb065716..88dc61454a29 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -41,6 +41,8 @@ struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; +#define SWAP_RA_ORDER_CEILING 5 + #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK @@ -686,16 +688,9 @@ void exit_swap_address_space(unsigned int type) swapper_spaces[type] = NULL; } -#define SWAP_RA_ORDER_CEILING 5 - -struct vma_swap_readahead { - unsigned short win; - unsigned short offset; - unsigned short nr_pte; -}; - -static void swap_ra_info(struct vm_fault *vmf, - struct vma_swap_readahead *ra_info) +static unsigned short swap_vma_ra_win(struct vm_fault *vmf, + unsigned short *offset, + unsigned short *nr_pte) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; @@ -705,10 +700,8 @@ static void swap_ra_info(struct vm_fault *vmf, max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); - if (max_win == 1) { - ra_info->win = 1; - return; - } + if (max_win == 1) + return 1; faddr = vmf->address; fpfn = PFN_DOWN(faddr); @@ -716,12 +709,11 @@ static void swap_ra_info(struct vm_fault *vmf, pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); prev_win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); - ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, - max_win, prev_win); + win = __swapin_nr_pages(pfn, fpfn, hits, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); if (win == 1) - return; + return 1; if (fpfn == pfn + 1) { lpfn = fpfn; @@ -742,8 +734,10 @@ static void swap_ra_info(struct vm_fault *vmf, end = min3(rpfn, PFN_DOWN(vma->vm_end), PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); - ra_info->nr_pte = end - start; - ra_info->offset = fpfn - start; + *nr_pte = end - start; + *offset = fpfn - start; + + return win; } /** @@ -774,19 +768,17 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, pgoff_t ilx; unsigned int i; bool page_allocated; - struct vma_swap_readahead ra_info = { - .win = 1, - }; + unsigned short win, nr_pte, offset; - swap_ra_info(vmf, &ra_info); - if (ra_info.win == 1) + win = swap_vma_ra_win(vmf, &offset, &nr_pte); + if (win == 1) goto skip; - addr = vmf->address - (ra_info.offset * PAGE_SIZE); - ilx = targ_ilx - ra_info.offset; + addr = vmf->address - offset * PAGE_SIZE; + ilx = targ_ilx - offset; blk_start_plug(&plug); - for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { + for (i = 0; i < nr_pte; i++, ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -806,7 +798,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, continue; if (page_allocated) { swap_read_folio(folio, false, &splug); - if (i != ra_info.offset) { + if (i != offset) { 
folio_set_readahead(folio); count_vm_event(SWAP_RA); } -- Gitee From 5773797af4dbc7c7ca25362c4fbeb49e69ef25f9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 May 2024 16:12:30 +0800 Subject: [PATCH 257/484] mm,swap: simplify VMA based swap readahead window calculation commit ba518f4d4b84c2c99fd03e77479e5c7f6767faf6 upstream Conflicts: none Backport-reason: SWAP cleanup Replace PFNs with addresses in readahead window calculation. This simplified the logic and reduce the code line number. No functionality change is expected. Link: https://lkml.kernel.org/r/20240531081230.310128-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Mel Gorman Cc: Miaohe Lin Cc: Minchan Kim Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Kairui Song Cc: Barry Song Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 66 +++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 88dc61454a29..233e08e33ecc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -688,54 +688,40 @@ void exit_swap_address_space(unsigned int type) swapper_spaces[type] = NULL; } -static unsigned short swap_vma_ra_win(struct vm_fault *vmf, - unsigned short *offset, - unsigned short *nr_pte) +static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, + unsigned long *end) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; - unsigned long faddr, pfn, fpfn, lpfn, rpfn; - unsigned long start, end; + unsigned long faddr, prev_faddr, left, right; unsigned int max_win, hits, prev_win, win; - max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), - SWAP_RA_ORDER_CEILING); + max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); if (max_win == 1) return 1; faddr = vmf->address; - fpfn = PFN_DOWN(faddr); ra_val = GET_SWAP_RA_VAL(vma); - pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); + prev_faddr = SWAP_RA_ADDR(ra_val); prev_win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); - win = __swapin_nr_pages(pfn, fpfn, hits, max_win, prev_win); - atomic_long_set(&vma->swap_readahead_info, - SWAP_RA_VAL(faddr, win, 0)); + win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits, + max_win, prev_win); + atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); if (win == 1) return 1; - if (fpfn == pfn + 1) { - lpfn = fpfn; - rpfn = fpfn + win; - } else if (pfn == fpfn + 1) { - lpfn = fpfn - win + 1; - rpfn = fpfn + 1; - } else { - unsigned int left = (win - 1) / 2; - - lpfn = fpfn - left; - rpfn = fpfn + win - left; - } - if ((long)lpfn < 0) - lpfn = 0; - start = max3(lpfn, PFN_DOWN(vma->vm_start), - PFN_DOWN(faddr & PMD_MASK)); - end = min3(rpfn, PFN_DOWN(vma->vm_end), - PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); - - *nr_pte = end - start; - *offset = fpfn - start; + if (faddr == prev_faddr + PAGE_SIZE) + left = faddr; + else if (prev_faddr == faddr + PAGE_SIZE) + left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE; + else + left = faddr - (((win - 1) / 2) << PAGE_SHIFT); + right = left + (win << PAGE_SHIFT); + if ((long)left < 0) + left = 0; + *start = max3(left, vma->vm_start, faddr & PMD_MASK); + *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE); return win; } @@ -763,22 +749,20 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, struct swap_iocb *splug = NULL; struct folio *folio; pte_t *pte = NULL, pentry; - unsigned 
long addr; + int win; + unsigned long start, end, addr; swp_entry_t entry; pgoff_t ilx; - unsigned int i; bool page_allocated; - unsigned short win, nr_pte, offset; - win = swap_vma_ra_win(vmf, &offset, &nr_pte); + win = swap_vma_ra_win(vmf, &start, &end); if (win == 1) goto skip; - addr = vmf->address - offset * PAGE_SIZE; - ilx = targ_ilx - offset; + ilx = targ_ilx - PFN_DOWN(vmf->address - start); blk_start_plug(&plug); - for (i = 0; i < nr_pte; i++, ilx++, addr += PAGE_SIZE) { + for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -798,7 +782,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, continue; if (page_allocated) { swap_read_folio(folio, false, &splug); - if (i != offset) { + if (addr != vmf->address) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } -- Gitee From 39fab9edc7a7a187e0dd39951e4e687ce09f0ad0 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 7 Jun 2024 04:55:15 +0000 Subject: [PATCH 258/484] mm: swap: remove 'synchronous' argument to swap_read_folio() commit b2d1f38b524121130befa3a9b37dca790cfa9ab9 upstream Conflicts: minor, we still have bdev_swapin_folio and shrinked si refcount scope. Backport-reason: SWAP cleanup Commit [1] introduced IO polling support duding swapin to reduce swap read latency for block devices that can be polled. However later commit [2] removed polling support. Commit [3] removed the remnants of polling support from read_swap_cache_async() and __read_swap_cache_async(). However, it left behind some remnants in swap_read_folio(), the 'synchronous' argument. swap_read_folio() reads the folio synchronously if synchronous=true or if SWP_SYNCHRONOUS_IO is set in swap_info_struct. The only caller that passes synchronous=true is in do_swap_page() in the SWP_SYNCHRONOUS_IO case. Hence, the argument is redundant, it is only set to true when the swap read would have been synchronous anyway. Remove it. 
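The redundancy can be seen in isolation with a small user-space sketch; the flag value and helper names below are stand-ins, not the kernel's definitions. The point is that the OR with 'synchronous' never changes the result once the only caller passing true is the one that already checked the flag.

        #include <stdbool.h>
        #include <stdio.h>

        #define TOY_SYNCHRONOUS_IO      0x1     /* stand-in flag, not the kernel's SWP_SYNCHRONOUS_IO value */

        /* old shape: the extra argument can only be true when the flag is set anyway */
        static bool read_is_sync_old(unsigned int flags, bool synchronous)
        {
                return synchronous || (flags & TOY_SYNCHRONOUS_IO);
        }

        /* new shape: derive the answer from the swap device flags alone */
        static bool read_is_sync_new(unsigned int flags)
        {
                return flags & TOY_SYNCHRONOUS_IO;
        }

        int main(void)
        {
                /* the one caller that passed true did so exactly when the flag was set,
                 * so both shapes agree for every reachable combination */
                printf("flag set:   old=%d new=%d\n",
                       read_is_sync_old(TOY_SYNCHRONOUS_IO, true),
                       read_is_sync_new(TOY_SYNCHRONOUS_IO));
                printf("flag clear: old=%d new=%d\n",
                       read_is_sync_old(0, false),
                       read_is_sync_new(0));
                return 0;
        }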
[1] Commit 23955622ff8d ("swap: add block io poll in swapin path") [2] Commit 9650b453a3d4 ("block: ignore RWF_HIPRI hint for sync dio") [3] Commit b243dcbf2f13 ("swap: remove remnants of polling from read_swap_cache_async") Link: https://lkml.kernel.org/r/20240607045515.1836558-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: "Huang, Ying" Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 2 +- mm/page_io.c | 6 +++--- mm/swap.h | 6 ++---- mm/swap_state.c | 10 +++++----- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 1cdeaa1a2957..38cd39f118bf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4033,7 +4033,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* To provide entry to swap_read_folio() */ folio->swap = entry; - swap_read_folio(folio, true, NULL); + swap_read_folio(folio, NULL); folio->private = NULL; } } else { diff --git a/mm/page_io.c b/mm/page_io.c index 7cb539cf2c5c..e78cad4198f1 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -504,10 +504,10 @@ static void swap_read_folio_bdev_async(struct folio *folio, submit_bio(bio); } -void swap_read_folio(struct folio *folio, bool synchronous, - struct swap_iocb **plug) +void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); + bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO; bool workingset = folio_test_workingset(folio); unsigned long pflags; bool in_thrashing; @@ -531,7 +531,7 @@ void swap_read_folio(struct folio *folio, bool synchronous, folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_read_folio_fs(folio, plug); - } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { + } else if (synchronous) { int ret = bdev_swapin_folio(sis->bdev, swap_folio_sector(folio), folio); if (ret != -EOPNOTSUPP) { if (!ret) diff --git a/mm/swap.h b/mm/swap.h index c668591d1e10..a586f480757e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -11,8 +11,7 @@ struct mempolicy; /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; -void swap_read_folio(struct folio *folio, bool do_poll, - struct swap_iocb **plug); +void swap_read_folio(struct folio *folio, struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { @@ -82,8 +81,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) } #else /* CONFIG_SWAP */ struct swap_iocb; -static inline void swap_read_folio(struct folio *folio, bool do_poll, - struct swap_iocb **plug) +static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } static inline void swap_write_unplug(struct swap_iocb *sio) diff --git a/mm/swap_state.c b/mm/swap_state.c index 233e08e33ecc..12d4fc4c5618 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -506,7 +506,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol_cond_put(mpol); if (page_allocated) - swap_read_folio(folio, false, plug); + swap_read_folio(folio, plug); put_swap_device(si); return folio; @@ -626,7 +626,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_read_folio(folio, false, &splug); + swap_read_folio(folio, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -643,7 +643,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, &page_allocated, false); if 
(unlikely(page_allocated)) { zswap_folio_swapin(folio); - swap_read_folio(folio, false, NULL); + swap_read_folio(folio, NULL); } return folio; } @@ -781,7 +781,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_read_folio(folio, false, &splug); + swap_read_folio(folio, &splug); if (addr != vmf->address) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -800,7 +800,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, &page_allocated, false); if (unlikely(page_allocated)) { zswap_folio_swapin(folio); - swap_read_folio(folio, false, NULL); + swap_read_folio(folio, NULL); } return folio; } -- Gitee From 6c9698a2c695978c1c0ac40abb6cfbc7f8248a3f Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Aug 2024 16:22:42 -0700 Subject: [PATCH 259/484] zswap: implement a second chance algorithm for dynamic zswap shrinker commit e31c38e037621c445bb4393fd77e0a76e6e0899a upstream Conflicts: none Backport-reason: ZSWAP mass update Patch series "improving dynamic zswap shrinker protection scheme", v3. When experimenting with the memory-pressure based (i.e "dynamic") zswap shrinker in production, we observed a sharp increase in the number of swapins, which led to performance regression. We were able to trace this regression to the following problems with the shrinker's warm pages protection scheme: 1. The protection decays way too rapidly, and the decaying is coupled with zswap stores, leading to anomalous patterns, in which a small batch of zswap stores effectively erase all the protection in place for the warmer pages in the zswap LRU. This observation has also been corroborated upstream by Takero Funaki (in [1]). 2. We inaccurately track the number of swapped in pages, missing the non-pivot pages that are part of the readahead window, while counting the pages that are found in the zswap pool. To alleviate these two issues, this patch series improve the dynamic zswap shrinker in the following manner: 1. Replace the protection size tracking scheme with a second chance algorithm. This new scheme removes the need for haphazard stats decaying, and automatically adjusts the pace of pages aging with memory pressure, and writeback rate with pool activities: slowing down when the pool is dominated with zswpouts, and speeding up when the pool is dominated with stale entries. 2. Fix the tracking of the number of swapins to take into account non-pivot pages in the readahead window. With these two changes in place, in a kernel-building benchmark without any cold data added, the number of swapins is reduced by 64.12%. This translate to a 10.32% reduction in build time. We also observe a 3% reduction in kernel CPU time. In another benchmark, with cold data added (to gauge the new algorithm's ability to offload cold data), the new second chance scheme outperforms the old protection scheme by around 0.7%, and actually written back around 21% more pages to backing swap device. So the new scheme is just as good, if not even better than the old scheme on this front as well. [1]: https://lore.kernel.org/linux-mm/CAPpodddcGsK=0Xczfuk8usgZ47xeyf4ZjiofdT+ujiyz6V2pFQ@mail.gmail.com/ This patch (of 2): Current zswap shrinker's heuristics to prevent overshrinking is brittle and inaccurate, specifically in the way we decay the protection size (i.e making pages in the zswap LRU eligible for reclaim). We currently decay protection aggressively in zswap_lru_add() calls. 
This leads to the following unfortunate effect: when a new batch of pages enter zswap, the protection size rapidly decays to below 25% of the zswap LRU size, which is way too low. We have observed this effect in production, when experimenting with the zswap shrinker: the rate of shrinking shoots up massively right after a new batch of zswap stores. This is somewhat the opposite of what we want originally - when new pages enter zswap, we want to protect both these new pages AND the pages that are already protected in the zswap LRU. Replace existing heuristics with a second chance algorithm 1. When a new zswap entry is stored in the zswap pool, its referenced bit is set. 2. When the zswap shrinker encounters a zswap entry with the referenced bit set, give it a second chance - only flips the referenced bit and rotate it in the LRU. 3. If the shrinker encounters the entry again, this time with its referenced bit unset, then it can reclaim the entry. In this manner, the aging of the pages in the zswap LRUs are decoupled from zswap stores, and picks up the pace with increasing memory pressure (which is what we want). The second chance scheme allows us to modulate the writeback rate based on recent pool activities. Entries that recently entered the pool will be protected, so if the pool is dominated by such entries the writeback rate will reduce proportionally, protecting the workload's workingset.On the other hand, stale entries will be written back quickly, which increases the effective writeback rate. The referenced bit is added at the hole after the `length` field of struct zswap_entry, so there is no extra space overhead for this algorithm. We will still maintain the count of swapins, which is consumed and subtracted from the lru size in zswap_shrinker_count(), to further penalize past overshrinking that led to disk swapins. The idea is that had we considered this many more pages in the LRU active/protected, they would not have been written back and we would not have had to swapped them in. To test this new heuristics, I built the kernel under a cgroup with memory.max set to 2G, on a host with 36 cores: With the old shrinker: real: 263.89s user: 4318.11s sys: 673.29s swapins: 227300.5 With the second chance algorithm: real: 244.85s user: 4327.22s sys: 664.39s swapins: 94663 (average over 5 runs) We observe an 1.3% reduction in kernel CPU usage, and around 7.2% reduction in real time. Note that the number of swapped in pages dropped by 58%. [nphamcs@gmail.com: fix a small mistake in the referenced bit documentation] Link: https://lkml.kernel.org/r/20240806003403.3142387-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240805232243.2896283-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240805232243.2896283-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Shakeel Butt Cc: Takero Funaki Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zswap.h | 16 +++---- mm/zswap.c | 108 ++++++++++++++++++++++++------------------ 2 files changed, 70 insertions(+), 54 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index bf83ae5e285d..3c308cc845d6 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages; struct zswap_lruvec_state { /* - * Number of pages in zswap that should be protected from the shrinker. 
- * This number is an estimate of the following counts: + * Number of swapped in pages from disk, i.e not found in the zswap pool. * - * a) Recent page faults. - * b) Recent insertion to the zswap LRU. This includes new zswap stores, - * as well as recent zswap LRU rotations. - * - * These pages are likely to be warm, and might incur IO if the are written - * to swap. + * This is consumed and subtracted from the lru size in + * zswap_shrinker_count() to penalize past overshrinking that led to disk + * swapins. The idea is that had we considered this many more pages in the + * LRU active/protected and not written them back, we would not have had to + * swapped them in. */ - atomic_long_t nr_zswap_protected; + atomic_long_t nr_disk_swapins; }; unsigned long zswap_total_pages(void); diff --git a/mm/zswap.c b/mm/zswap.c index 164acdf50831..7775ca968348 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -186,6 +186,10 @@ static struct shrinker *zswap_shrinker; * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. + * referenced - true if the entry recently entered the zswap pool. Unset by the + * writeback logic. The entry is only reclaimed by the writeback + * logic if referenced is unset. See comments in the shrinker + * section for context. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content @@ -195,6 +199,7 @@ static struct shrinker *zswap_shrinker; struct zswap_entry { swp_entry_t swpentry; unsigned int length; + bool referenced; struct zswap_pool *pool; union { unsigned long handle; @@ -699,11 +704,8 @@ static inline int entry_to_nid(struct zswap_entry *entry) static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; - struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of @@ -721,19 +723,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); - - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. - * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? 
old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -751,7 +740,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) void zswap_lruvec_state_init(struct lruvec *lruvec) { - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); + atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0); } void zswap_folio_swapin(struct folio *folio) @@ -760,7 +749,7 @@ void zswap_folio_swapin(struct folio *folio) if (folio) { lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); } } @@ -1090,6 +1079,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /********************************* * shrinker functions **********************************/ +/* + * The dynamic shrinker is modulated by the following factors: + * + * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving + * the entry a second chance) before rotating it in the LRU list. If the + * entry is considered again by the shrinker, with its referenced bit unset, + * it is written back. The writeback rate as a result is dynamically + * adjusted by the pool activities - if the pool is dominated by new entries + * (i.e lots of recent zswapouts), these entries will be protected and + * the writeback rate will slow down. On the other hand, if the pool has a + * lot of stagnant entries, these entries will be reclaimed immediately, + * effectively increasing the writeback rate. + * + * 2. Swapins counter: If we observe swapins, it is a sign that we are + * overshrinking and should slow down. We maintain a swapins counter, which + * is consumed and subtract from the number of eligible objects on the LRU + * in zswap_shrinker_count(). + * + * 3. Compression ratio. The better the workload compresses, the less gains we + * can expect from writeback. We scale down the number of objects available + * for reclaim by this ratio. + */ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { @@ -1099,6 +1110,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; + /* + * Second chance algorithm: if the entry has its referenced bit set, give it + * a second chance. Only clear the referenced bit and rotate it in the + * zswap's LRU list. + */ + if (entry->referenced) { + entry->referenced = false; + return LRU_ROTATE; + } + /* * As soon as we drop the LRU lock, the entry can be freed by * a concurrent invalidation. This means the following: @@ -1165,8 +1186,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; + unsigned long shrink_ret; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1175,25 +1195,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&zswap_list_lru, sc); - - /* - * Abort if we are shrinking into the protected region. 
- * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). - */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); @@ -1208,7 +1209,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, { struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + atomic_long_t *nr_disk_swapins = + &lruvec->zswap_lruvec_state.nr_disk_swapins; + unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur, + nr_remain; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; @@ -1241,14 +1245,27 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!nr_stored) return 0; - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + if (!nr_freeable) + return 0; + /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. + * Subtract from the lru size the number of pages that are recently swapped + * in from disk. The idea is that had we protect the zswap's LRU by this + * amount of pages, these disk swapins would not have happened. */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins); + do { + if (nr_freeable >= nr_disk_swapins_cur) + nr_remain = 0; + else + nr_remain = nr_disk_swapins_cur - nr_freeable; + } while (!atomic_long_try_cmpxchg( + nr_disk_swapins, &nr_disk_swapins_cur, nr_remain)); + + nr_freeable -= nr_disk_swapins_cur - nr_remain; + if (!nr_freeable) + return 0; /* * Scale the number of freeable pages by the memory saving factor. @@ -1501,6 +1518,7 @@ bool zswap_store(struct folio *folio) store_entry: entry->swpentry = swp; entry->objcg = objcg; + entry->referenced = true; old = xa_store(tree, offset, entry, GFP_KERNEL); if (xa_is_err(old)) { -- Gitee From 36ee82e9f6fce6a6c9a3f69d3099ce1ced546c6f Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Aug 2024 16:22:43 -0700 Subject: [PATCH 260/484] zswap: track swapins from disk more accurately commit 0e400844724222a67b962b0f12c7964b5e9a236c upstream Conflicts: minor Backport-reason: ZSWAP mass update Currently, there are a couple of issues with our disk swapin tracking for dynamic zswap shrinker heuristics: 1. We only increment the swapin counter on pivot pages. This means we are not taking into account pages that also need to be swapped in, but are already taken care of as part of the readahead window. 2. We are also incrementing when the pages are read from the zswap pool, which is inaccurate. This patch rectifies these issues by incrementing the counter whenever we need to perform a non-zswap read. Note that we are slightly overcounting, as a page might be read into memory by the readahead algorithm even though it will not be neeeded by users - however, this is an acceptable inaccuracy, as the readahead logic itself will adapt to these kind of scenarios. 
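A compact user-space model of the scheme described in these two patches may make the mechanics clearer: every stored entry starts with its referenced bit set, the first shrinker pass only clears the bit, a second pass reclaims it, and recent disk swapins are consumed to reduce how many objects a pass may scan. All of the types, sizes and counters below are invented for illustration; this is not the kernel implementation.

        #include <stdbool.h>
        #include <stdio.h>

        #define NENTRIES 6

        struct entry {
                int id;
                bool referenced;        /* set on store, cleared on the first shrinker pass */
                bool present;           /* still resident in the toy pool */
        };

        static struct entry pool[NENTRIES];
        static long nr_disk_swapins;    /* bumped on every read the pool could not serve */

        /* past disk swapins shrink the number of objects this pass may consider */
        static long objects_to_scan(void)
        {
                long freeable = 0;

                for (int i = 0; i < NENTRIES; i++)
                        freeable += pool[i].present;
                if (freeable <= nr_disk_swapins) {
                        nr_disk_swapins -= freeable;
                        return 0;
                }
                freeable -= nr_disk_swapins;
                nr_disk_swapins = 0;
                return freeable;
        }

        /* one walk: give referenced entries a second chance, reclaim the rest */
        static int shrink(void)
        {
                long budget = objects_to_scan();
                int reclaimed = 0;

                for (int i = 0; i < NENTRIES && budget > 0; i++) {
                        struct entry *e = &pool[i];

                        if (!e->present)
                                continue;
                        budget--;
                        if (e->referenced) {
                                e->referenced = false;  /* rotate instead of writing back */
                                continue;
                        }
                        e->present = false;             /* second encounter: write back */
                        reclaimed++;
                }
                return reclaimed;
        }

        int main(void)
        {
                for (int i = 0; i < NENTRIES; i++)
                        pool[i] = (struct entry){ .id = i, .referenced = true, .present = true };

                nr_disk_swapins = 2;    /* pretend two recent faults had to go to disk */

                for (int pass = 1; pass <= 3; pass++)
                        printf("pass %d reclaimed %d entries\n", pass, shrink());
                return 0;
        }

Run as-is, pass 1 reclaims nothing (it only clears referenced bits and is further throttled by the two recorded swapins), while passes 2 and 3 reclaim the now-unreferenced entries.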
To test this change, I built the kernel under a cgroup with its memory.max set to 2 GB: real: 236.66s user: 4286.06s sys: 652.86s swapins: 81552 For comparison, with just the new second chance algorithm, the build time is as follows: real: 244.85s user: 4327.22s sys: 664.39s swapins: 94663 Without neither: real: 263.89s user: 4318.11s sys: 673.29s swapins: 227300.5 (average over 5 runs) With this change, the kernel CPU time reduces by a further 1.7%, and the real time is reduced by another 3.3%, compared to just the second chance algorithm by itself. The swapins count also reduces by another 13.85%. Combinng the two changes, we reduce the real time by 10.32%, kernel CPU time by 3%, and number of swapins by 64.12%. To gauge the new scheme's ability to offload cold data, I ran another benchmark, in which the kernel was built under a cgroup with memory.max set to 3 GB, but with 0.5 GB worth of cold data allocated before each build (in a shmem file). Under the old scheme: real: 197.18s user: 4365.08s sys: 289.02s zswpwb: 72115.2 Under the new scheme: real: 195.8s user: 4362.25s sys: 290.14s zswpwb: 87277.8 (average over 5 runs) Notice that we actually observe a 21% increase in the number of written back pages - so the new scheme is just as good, if not better at offloading pages from the zswap pool when they are cold. Build time reduces by around 0.7% as a result. [nphamcs@gmail.com: squeeze a comment into a single line] Link: https://lkml.kernel.org/r/20240806004518.3183562-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240805232243.2896283-3-nphamcs@gmail.com Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Yosry Ahmed Acked-by: Johannes Weiner Cc: Chengming Zhou Cc: Shakeel Butt Cc: Takero Funaki Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 12 +++++++++--- mm/swap_state.c | 8 ++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index e78cad4198f1..744fc6201f2b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -529,21 +529,27 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) if (zswap_load(folio)) { folio_unlock(folio); - } else if (data_race(sis->flags & SWP_FS_OPS)) { + goto finish; + } + + /* We have to read from slower devices. Increase zswap protection. 
*/ + zswap_folio_swapin(folio); + + if (data_race(sis->flags & SWP_FS_OPS)) { swap_read_folio_fs(folio, plug); } else if (synchronous) { int ret = bdev_swapin_folio(sis->bdev, swap_folio_sector(folio), folio); if (ret != -EOPNOTSUPP) { if (!ret) count_vm_event(PSWPIN); - goto out; + goto finish; } swap_read_folio_bdev_sync(folio, sis); } else { swap_read_folio_bdev_async(folio, sis); } -out: +finish: if (workingset) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); diff --git a/mm/swap_state.c b/mm/swap_state.c index 12d4fc4c5618..8e17b8cab4d0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -641,10 +641,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (unlikely(page_allocated)) { - zswap_folio_swapin(folio); + if (unlikely(page_allocated)) swap_read_folio(folio, NULL); - } return folio; } @@ -798,10 +796,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); - if (unlikely(page_allocated)) { - zswap_folio_swapin(folio); + if (unlikely(page_allocated)) swap_read_folio(folio, NULL); - } return folio; } -- Gitee From e65958cd02f72f31fd705fa9dbfd0b92fa7afa22 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 7 Aug 2024 20:37:32 +0100 Subject: [PATCH 261/484] mm: return the folio from swapin_readahead commit 94dc8bffd8b7fe83ba8382a3410a2f218dc20cb0 upstream Conflicts: none Backport-reason: Memory: more folios The unuse_pte_range() caller only wants the folio while do_swap_page() wants both the page and the folio. Since do_swap_page() already has logic for handling both the folio and the page, move the folio-to-page logic there. This also lets us allocate larger folios in the SWP_SYNCHRONOUS_IO path in future. 
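As a rough user-space analogy for this interface change (the structures below are toys, not the kernel's struct page or struct folio): the helper used to pick the single element out of the container for every caller, while now it hands back the container and the one caller that needs an element indexes into it itself.

        #include <stdio.h>

        /* toy stand-ins: a "folio" is a run of 4 pages, a "page" is one slot in it */
        struct page { int pfn; };
        struct folio { struct page pages[4]; };

        /* old shape: the helper extracted the page for every caller */
        static struct page *lookup_old(struct folio *f, int offset_in_folio)
        {
                return &f->pages[offset_in_folio & 3];
        }

        /* new shape: hand back the folio; the caller that wants a page indexes it */
        static struct folio *lookup_new(struct folio *f)
        {
                return f;
        }

        int main(void)
        {
                struct folio f = { .pages = { {100}, {101}, {102}, {103} } };
                int offset = 2;

                struct page *old_way = lookup_old(&f, offset);
                struct folio *folio = lookup_new(&f);
                struct page *new_way = &folio->pages[offset & 3];

                printf("old pfn=%d new pfn=%d\n", old_way->pfn, new_way->pfn);
                return 0;
        }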
Link: https://lkml.kernel.org/r/20240807193734.1865400-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 6 ++---- mm/swap.h | 6 +++--- mm/swap_state.c | 8 +++----- mm/swapfile.c | 5 +---- 4 files changed, 9 insertions(+), 16 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 38cd39f118bf..c4636615d315 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4012,7 +4012,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* skip swapcache */ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false); - page = &folio->page; if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); @@ -4037,10 +4036,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio->private = NULL; } } else { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - if (page) - folio = page_folio(page); swapcache = folio; } @@ -4061,6 +4058,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); + page = folio_file_page(folio, swp_offset(entry)); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing diff --git a/mm/swap.h b/mm/swap.h index a586f480757e..8f364a00d120 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -72,8 +72,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, bool skip_if_exists); struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); -struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); +struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); static inline unsigned int folio_swap_flags(struct folio *folio) { @@ -108,7 +108,7 @@ static inline struct folio *swap_cluster_readahead(swp_entry_t entry, return NULL; } -static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, +static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_fault *vmf) { return NULL; diff --git a/mm/swap_state.c b/mm/swap_state.c index 8e17b8cab4d0..6b43d8867d42 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -807,13 +807,13 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, * @gfp_mask: memory allocation flags * @vmf: fault information * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * * It's a main entry function for swap readahead. By the configuration, * it will read ahead blocks by cluster-based(ie, physical disk based) * or vma-based(ie, virtual address based on faulty address) readahead. 
*/ -struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, +struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { struct mempolicy *mpol; @@ -826,9 +826,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); - if (!folio) - return NULL; - return folio_file_page(folio, swp_offset(entry)); + return folio; } #ifdef CONFIG_SYSFS diff --git a/mm/swapfile.c b/mm/swapfile.c index 879469ab3b5e..91979634e4f5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2064,7 +2064,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { - struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, @@ -2072,10 +2071,8 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, .pmd = pmd, }; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); - if (page) - folio = page_folio(page); } if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); -- Gitee From ad75d65229630c489e9afed570fc261d4c933087 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 8 Aug 2024 09:58:59 +1200 Subject: [PATCH 262/484] mm: attempt to batch free swap entries for zap_pte_range() commit bea67dcc5eea0f6a213897a59b9e644977cd07e6 upstream Conflicts: major, rewrite batch freeing. Backport-reason: Swap cleanup during allocator development Zhiguo reported that swap release could be a serious bottleneck during process exits[1]. With mTHP, we have the opportunity to batch free swaps. Thanks to the work of Chris and Kairui[2], I was able to achieve this optimization with minimal code changes by building on their efforts. If swap_count is 1, which is likely true as most anon memory are private, we can free all contiguous swap slots all together. Ran the below test program for measuring the bandwidth of munmap using zRAM and 64KiB mTHP: #include #include #include unsigned long long tv_to_ms(struct timeval tv) { return tv.tv_sec * 1000 + tv.tv_usec / 1000; } main() { struct timeval tv_b, tv_e; int i; #define SIZE 1024*1024*1024 void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (!p) { perror("fail to get memory"); exit(-1); } madvise(p, SIZE, MADV_HUGEPAGE); memset(p, 0x11, SIZE); /* write to get mem */ madvise(p, SIZE, MADV_PAGEOUT); gettimeofday(&tv_b, NULL); munmap(p, SIZE); gettimeofday(&tv_e, NULL); printf("munmap in bandwidth: %ld bytes/ms\n", SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b))); } The result is as below (munmap bandwidth): mm-unstable mm-unstable-with-patch round1 21053761 63161283 round2 21053761 63161283 round3 21053761 63161283 round4 20648881 67108864 round5 20648881 67108864 munmap bandwidth becomes 3X faster. 
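The fast path introduced here can be modelled with a short user-space sketch. The swap_map array, helper names and values are simplified stand-ins (the real swap_is_last_map() also tracks SWAP_HAS_CACHE and runs under the cluster lock); the sketch only shows the shape of the optimization: verify the whole contiguous run has a single owner, then free it in one sweep, otherwise fall back to per-entry freeing.

        #include <stdbool.h>
        #include <stdio.h>

        #define NSLOTS 16

        static unsigned char swap_map[NSLOTS];  /* toy per-slot reference counts */

        /* true if every slot in [off, off + nr) has exactly one owner left */
        static bool range_is_last_map(int off, int nr)
        {
                for (int i = 0; i < nr; i++)
                        if (swap_map[off + i] != 1)
                                return false;
                return true;
        }

        static void free_entries(int off, int nr)
        {
                if (range_is_last_map(off, nr)) {
                        /* fast path: release the whole contiguous run in one sweep */
                        for (int i = 0; i < nr; i++)
                                swap_map[off + i] = 0;
                        printf("batch-freed %d slots starting at %d\n", nr, off);
                        return;
                }
                /* fallback: drop one reference at a time, as before the patch */
                for (int i = 0; i < nr; i++)
                        if (swap_map[off + i])
                                swap_map[off + i]--;
                printf("fell back to per-entry freeing at %d\n", off);
        }

        int main(void)
        {
                for (int i = 0; i < 8; i++)
                        swap_map[i] = 1;        /* private anon pages: one owner each */
                swap_map[8] = 2;                /* a shared entry forces the fallback */
                swap_map[9] = 1;

                free_entries(0, 8);             /* e.g. a contiguous run unmapped at once */
                free_entries(8, 2);
                return 0;
        }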
[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/ [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/ [v-songbaohua@oppo.com: check all swaps belong to same swap_cgroup in swap_pte_batch()] Link: https://lkml.kernel.org/r/20240815215308.55233-1-21cnbao@gmail.com [hughd@google.com: add mem_cgroup_disabled() check] Link: https://lkml.kernel.org/r/33f34a88-0130-5444-9b84-93198eeb50e7@google.com [21cnbao@gmail.com: add missing zswap_invalidate()] Link: https://lkml.kernel.org/r/20240821054921.43468-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240807215859.57491-3-21cnbao@gmail.com Signed-off-by: Barry Song Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: Kairui Song Cc: Chris Li Cc: "Huang, Ying" Cc: Kalesh Singh Cc: Ryan Roberts Cc: Barry Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/internal.h | 9 ++++-- mm/swap_cgroup.c | 2 ++ mm/swapfile.c | 71 ++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 69 insertions(+), 13 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 4cb81b87c639..93c78e0e66bf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,6 +13,7 @@ #include #include #include +#include #include struct folio_batch; @@ -304,18 +305,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; + swp_entry_t entry = pte_to_swp_entry(pte); pte_t *ptep = start_ptep + 1; + unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); - VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); + VM_WARN_ON(non_swap_entry(entry)); + cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; - + if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) + break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index db6c4a26cf59..da1278f0563b 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -161,6 +161,8 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { + if (mem_cgroup_disabled()) + return 0; return lookup_swap_cgroup(ent, NULL)->id; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 91979634e4f5..7273d581594c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -186,6 +186,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si, return true; } +static bool swap_is_last_map(struct swap_info_struct *si, + unsigned long offset, int nr_pages, bool *has_cache) +{ + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + unsigned char count = *map; + + if (swap_count(count) != 1) + return false; + + while (++map < map_end) { + if (*map != count) + return false; + } + + *has_cache = !!(count & SWAP_HAS_CACHE); + return true; +} + /* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. 
If 0, no @@ -1451,6 +1470,46 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, return usage; } +static bool __swap_entries_free(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + unsigned long offset = swp_offset(entry); + unsigned int type = swp_type(entry); + struct swap_cluster_info *ci; + bool has_cache = false; + unsigned char count; + int i; + + if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + goto fallback; + /* cross into another cluster */ + if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) + goto fallback; + + ci = lock_cluster(si, offset); + if (!swap_is_last_map(si, offset, nr, &has_cache)) { + unlock_cluster(ci); + goto fallback; + } + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); + if (!has_cache) + swap_entry_range_free(si, ci, entry, nr); + unlock_cluster(ci); + return has_cache; +fallback: + for (i = 0; i < nr; i++) { + if (data_race(si->swap_map[offset + i])) { + count = __swap_entry_free(si, swp_entry(type, offset + i)); + if (count == SWAP_HAS_CACHE) + has_cache = true; + } else { + WARN_ON_ONCE(1); + } + } + return has_cache; +} + /* * Drop the last HAS_CACHE flag of swap entries, caller have to * ensure all entries belong to the same cgroup. @@ -1725,11 +1784,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; - unsigned int type = swp_type(entry); struct swap_info_struct *si; bool any_only_cache = false; unsigned long offset; - unsigned char count; if (non_swap_entry(entry)) return; @@ -1744,15 +1801,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - for (offset = start_offset; offset < end_offset; offset++) { - if (data_race(si->swap_map[offset])) { - count = __swap_entry_free(si, swp_entry(type, offset)); - if (count == SWAP_HAS_CACHE) - any_only_cache = true; - } else { - WARN_ON_ONCE(1); - } - } + any_only_cache = __swap_entries_free(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their -- Gitee From 7f8a14713869479ecb4460a2be48afade1893878 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:02 +0800 Subject: [PATCH 263/484] mm: swap: extend swap_shmem_alloc() to support batch SWAP_MAP_SHMEM flag setting commit 650180760be6bb448609d4d155eef3b728ace641 upstream Conflicts: minor, we have removed slot cache. Backport-reason: Swap cleanup and optimizations during allocator development Patch series "support large folio swap-out and swap-in for shmem", v5. Shmem will support large folio allocation [1] [2] to get a better performance, however, the memory reclaim still splits the precious large folios when trying to swap-out shmem, which may lead to the memory fragmentation issue and can not take advantage of the large folio for shmeme. Moreover, the swap code already supports for swapping out large folio without split, and large folio swap-in[3] series is queued into mm-unstable branch. Hence this patch set also supports the large folio swap-out and swap-in for shmem. This patch (of 9): To support shmem large folio swap operations, add a new parameter to swap_shmem_alloc() that allows batch SWAP_MAP_SHMEM flag setting for shmem swap entries. While we are at it, using folio_nr_pages() to get the number of pages of the folio as a preparation. 
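In essence (the exact hunks follow below), swap_shmem_alloc() now takes the number of entries, and the shmem writeback path passes the folio size instead of assuming a single page:

    nr_pages = folio_nr_pages(folio);
    ...
    shmem_recalc_inode(inode, 0, nr_pages);
    swap_shmem_alloc(folio->swap, nr_pages);

so a large shmem folio marks all of its contiguous swap entries as SWAP_MAP_SHMEM in one call.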
Link: https://lkml.kernel.org/r/cover.1723434324.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/99f64115d04b285e009580eb177352c57119ffd0.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Barry Song Cc: Chris Li Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 4 ++-- mm/shmem.c | 6 ++++-- mm/swapfile.c | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 82d2d684e70a..a085b4b6be22 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,7 @@ bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t); +extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); @@ -580,7 +580,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp) +static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } diff --git a/mm/shmem.c b/mm/shmem.c index c8369e760bce..8879b12aaa1b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1445,6 +1445,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; + int nr_pages; /* * Our capabilities prevent regular writeback or sync from ever calling @@ -1477,6 +1478,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } index = folio->index; + nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC @@ -1523,8 +1525,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) list_add(&info->swaplist, &shmem_swaplist); if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - shmem_recalc_inode(inode, 0, 1); - swap_shmem_alloc(folio->swap); + shmem_recalc_inode(inode, 0, nr_pages); + swap_shmem_alloc(folio->swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); mutex_unlock(&shmem_swaplist_mutex); diff --git a/mm/swapfile.c b/mm/swapfile.c index 7273d581594c..c304fe1229fb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3590,9 +3590,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). 
*/ -void swap_shmem_alloc(swp_entry_t entry) +void swap_shmem_alloc(swp_entry_t entry, int nr) { - __swap_duplicate(entry, SWAP_MAP_SHMEM, 1); + __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); } /* -- Gitee From bab4b763f661a34eb400472c39a2f3b92367a7a8 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:03 +0800 Subject: [PATCH 264/484] mm: shmem: extend shmem_partial_swap_usage() to support large folio swap commit 50f381eccefda0cdaf7aa617587dc04cb6652085 upstream Conflicts: none Backport-reason: Swap cleanup and optimizations during allocator development To support shmem large folio swapout in the following patches, using xa_get_order() to get the order of the swap entry to calculate the swap usage of shmem. Link: https://lkml.kernel.org/r/60b130b9fc3e422bb91293a172c2113c85e9233a.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8879b12aaa1b..8818d9aac1d5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -870,7 +870,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) - swapped++; + swapped += 1 << xa_get_order(xas.xa, xas.xa_index); if (xas.xa_index == max) break; if (need_resched()) { -- Gitee From 00634355ca13e1982cdcb834638bb5c5573e968a Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Mon, 12 Aug 2024 15:42:04 +0800 Subject: [PATCH 265/484] mm: shmem: return number of pages beeing freed in shmem_free_swap commit 6ea0d1ccb110387244e04637f28a1d2eda54e3fb upstream Conflicts: none Backport-reason: Swap cleanup and optimizations during allocator development Both shmem_free_swap callers expect the number of pages being freed. In the large folios context, this needs to support larger values other than 0 (used as 1 page being freed) and -ENOENT (used as 0 pages being freed). In preparation for large folios adoption, make shmem_free_swap routine return the number of pages being freed. So, returning 0 in this context, means 0 pages being freed. While we are at it, changing to use free_swap_and_cache_nr() to free large order swap entry by Baolin Wang. Link: https://lkml.kernel.org/r/9623e863c83d749d5ab407f6fdf0a8e5a3bdf052.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Daniel Gomez Signed-off-by: Baolin Wang Suggested-by: Matthew Wilcox Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8818d9aac1d5..90a51eae48b6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -836,18 +836,22 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) } /* - * Remove swap entry from page cache, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. Returns + * the number of pages being freed. 0 means entry not found in XArray (0 pages + * being freed). 
*/ -static int shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) +static long shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) { + int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) - return -ENOENT; - free_swap_and_cache(radix_to_swp_entry(radswap)); - return 0; + return 0; + free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + + return 1 << order; } /* @@ -999,7 +1003,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, + nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } @@ -1066,14 +1070,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio = fbatch.folios[i]; if (xa_is_value(folio)) { + long swaps_freed; + if (unfalloc) continue; - if (shmem_free_swap(mapping, indices[i], folio)) { + swaps_freed = shmem_free_swap(mapping, indices[i], folio); + if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } - nr_swaps_freed++; + nr_swaps_freed += swaps_freed; continue; } -- Gitee From c07d9349c12652e5b94549cb9694752180b6980b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:05 +0800 Subject: [PATCH 266/484] mm: filemap: use xa_get_order() to get the swap entry order commit fb72415938d109fe8cca339a4f1423f76ba213c5 upstream Conflicts: none Backport-reason: Swap cleanup and optimizations during allocator development In the following patches, shmem will support the swap out of large folios, which means the shmem mappings may contain large order swap entries, so using xa_get_order() to get the folio order of the shmem swap entry to update the '*start' correctly. 
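As a concrete example (values are illustrative): for an order-4 swap entry occupying indices 32..47,

    nr     = 1 << xa_get_order(&mapping->i_pages, index);   /* 16 slots      */
    *start = round_down(index + nr, nr);                     /* advances to 48 */

so the next lookup resumes past the whole 16-slot entry instead of stepping a single index at a time.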
[hughd@google.com: use xa_get_order() to get the swap entry order] Link: https://lkml.kernel.org/r/c336e6e4-da7f-b714-c0f1-12df715f2611@google.com Link: https://lkml.kernel.org/r/6876d55145c1cc80e79df7884aa3a62e397b101d.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Hugh Dickins Cc: Barry Song Cc: Chris Li Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/filemap.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 0860266001b4..1aaac1c2ad2c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2150,17 +2150,20 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, if (!folio_batch_add(fbatch, folio)) break; } - rcu_read_unlock(); if (folio_batch_count(fbatch)) { - unsigned long nr = 1; + unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); - *start = indices[idx] + nr; + else + nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); + *start = round_down(indices[idx] + nr, nr); } + rcu_read_unlock(); + return folio_batch_count(fbatch); } @@ -2192,10 +2195,17 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + unsigned long base; + unsigned long nr; + if (!xa_is_value(folio)) { - if (folio->index < *start) + nr = folio_nr_pages(folio); + base = folio->index; + /* Omit large folio which begins before the start */ + if (base < *start) goto put; - if (folio_next_index(folio) - 1 > end) + /* Omit large folio which extends beyond the end */ + if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; @@ -2204,7 +2214,19 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); + } else { + nr = 1 << xa_get_order(&mapping->i_pages, xas.xa_index); + base = xas.xa_index & ~(nr - 1); + /* Omit order>0 value which begins before the start */ + if (base < *start) + continue; + /* Omit order>0 value which extends beyond the end */ + if (base + nr - 1 > end) + break; } + + /* Update start now so that last update is correct on return */ + *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; @@ -2216,15 +2238,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, } rcu_read_unlock(); - if (folio_batch_count(fbatch)) { - unsigned long nr = 1; - int idx = folio_batch_count(fbatch) - 1; - - folio = fbatch->folios[idx]; - if (!xa_is_value(folio)) - nr = folio_nr_pages(folio); - *start = indices[idx] + nr; - } return folio_batch_count(fbatch); } -- Gitee From d9d7ed2f4f2d00da9ee7803f51ffca0c27d6294a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:06 +0800 Subject: [PATCH 267/484] mm: shmem: use swap_free_nr() to free shmem swap entries commit 40ff2d11bd58a3908897ccc689ed5d8ac498f173 upstream Conflicts: trivial due to SLI Backport-reason: Swap cleanup and optimizations during allocator development As a preparation for supporting shmem large folio swapout, use swap_free_nr() to free some continuous swap entries of the shmem large folio when the large folio was swapped in from the swap cache. 
In addition, the index should also be round down to the number of pages when adding the swapin folio into the pagecache. Link: https://lkml.kernel.org/r/342207fa679fc88a447dac2e101ad79e6050fe79.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 90a51eae48b6..120f23a214f3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1829,6 +1829,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; + int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, @@ -1837,6 +1838,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, if (old != swp_to_radix_entry(swap)) return; + nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); delete_from_swap_cache(folio); /* @@ -1844,8 +1846,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ - shmem_recalc_inode(inode, -1, -1); - swap_free(swap); + shmem_recalc_inode(inode, -nr_pages, -nr_pages); + swap_free_nr(swap, nr_pages); } /* @@ -1864,7 +1866,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; - int error; + int error, nr_pages; #ifdef CONFIG_CGROUP_SLI u64 start; #endif @@ -1920,6 +1922,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } folio_wait_writeback(folio); + nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the @@ -1933,19 +1936,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } - error = shmem_add_to_page_cache(folio, mapping, index, + error = shmem_add_to_page_cache(folio, mapping, + round_down(index, nr_pages), swp_to_radix_entry(swap), gfp); if (error) goto failed; - shmem_recalc_inode(inode, 0, -1); + shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); delete_from_swap_cache(folio); folio_mark_dirty(folio); - swap_free(swap); + swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; -- Gitee From eb0d366e363f12a74cce52727b26bd249045a59f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 13 Jun 2024 16:21:19 +0800 Subject: [PATCH 268/484] mm: shmem: fix getting incorrect lruvec when replacing a shmem folio commit 9094b4a1c76cfe84b906cc152bab34d4ba26fa5c upstream Conflicts: none Backport-reason: Dependency of swap cleanup and optimizations during allocator development When testing shmem swapin, I encountered the warning below on my machine. The reason is that replacing an old shmem folio with a new one causes mem_cgroup_migrate() to clear the old folio's memcg data. As a result, the old folio cannot get the correct memcg's lruvec needed to remove itself from the LRU list when it is being freed. This could lead to possible serious problems, such as LRU list crashes due to holding the wrong LRU lock, and incorrect LRU statistics. 
To fix this issue, we can fallback to use the mem_cgroup_replace_folio() to replace the old shmem folio. [ 5241.100311] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5d9960 [ 5241.100317] head: order:4 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 5241.100319] flags: 0x17fffe0000040068(uptodate|lru|head|swapbacked|node=0|zone=2|lastcpupid=0x3ffff) [ 5241.100323] raw: 17fffe0000040068 fffffdffd6687948 fffffdffd69ae008 0000000000000000 [ 5241.100325] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100326] head: 17fffe0000040068 fffffdffd6687948 fffffdffd69ae008 0000000000000000 [ 5241.100327] head: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100328] head: 17fffe0000000204 fffffdffd6665801 ffffffffffffffff 0000000000000000 [ 5241.100329] head: 0000000a00000010 0000000000000000 00000000ffffffff 0000000000000000 [ 5241.100330] page dumped because: VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled()) [ 5241.100338] ------------[ cut here ]------------ [ 5241.100339] WARNING: CPU: 19 PID: 78402 at include/linux/memcontrol.h:775 folio_lruvec_lock_irqsave+0x140/0x150 [...] [ 5241.100374] pc : folio_lruvec_lock_irqsave+0x140/0x150 [ 5241.100375] lr : folio_lruvec_lock_irqsave+0x138/0x150 [ 5241.100376] sp : ffff80008b38b930 [...] [ 5241.100398] Call trace: [ 5241.100399] folio_lruvec_lock_irqsave+0x140/0x150 [ 5241.100401] __page_cache_release+0x90/0x300 [ 5241.100404] __folio_put+0x50/0x108 [ 5241.100406] shmem_replace_folio+0x1b4/0x240 [ 5241.100409] shmem_swapin_folio+0x314/0x528 [ 5241.100411] shmem_get_folio_gfp+0x3b4/0x930 [ 5241.100412] shmem_fault+0x74/0x160 [ 5241.100414] __do_fault+0x40/0x218 [ 5241.100417] do_shared_fault+0x34/0x1b0 [ 5241.100419] do_fault+0x40/0x168 [ 5241.100420] handle_pte_fault+0x80/0x228 [ 5241.100422] __handle_mm_fault+0x1c4/0x440 [ 5241.100424] handle_mm_fault+0x60/0x1f0 [ 5241.100426] do_page_fault+0x120/0x488 [ 5241.100429] do_translation_fault+0x4c/0x68 [ 5241.100431] do_mem_abort+0x48/0xa0 [ 5241.100434] el0_da+0x38/0xc0 [ 5241.100436] el0t_64_sync_handler+0x68/0xc0 [ 5241.100437] el0t_64_sync+0x14c/0x150 [ 5241.100439] ---[ end trace 0000000000000000 ]--- [baolin.wang@linux.alibaba.com: remove less helpful comments, per Matthew] Link: https://lkml.kernel.org/r/ccad3fe1375b468ebca3227b6b729f3eaf9d8046.1718423197.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/3c11000dd6c1df83015a8321a859e9775ebbc23e.1718266112.git.baolin.wang@linux.alibaba.com Fixes: 85ce2c517ade ("memcontrol: only transfer the memcg data for migration") Signed-off-by: Baolin Wang Reviewed-by: Shakeel Butt Cc: Matthew Wilcox (Oracle) Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nhat Pham Cc: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memcontrol.c | 3 +-- mm/shmem.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f48ab74f3b05..2d0e731a4fe9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -9539,8 +9539,7 @@ void __mem_cgroup_uncharge_folios(struct folio_batch *folios) * @new: Replacement folio. * * Charge @new as a replacement folio for @old. @old will - * be uncharged upon free. This is only used by the page cache - * (in replace_page_cache_folio()). + * be uncharged upon free. * * Both folios must be locked, @new->mapping must be set up. 
*/ diff --git a/mm/shmem.c b/mm/shmem.c index 120f23a214f3..2d2a97e940ba 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1795,7 +1795,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - mem_cgroup_migrate(old, new); + mem_cgroup_replace_folio(old, new); __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); __lruvec_stat_mod_folio(new, NR_SHMEM, 1); __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); -- Gitee From 73cb34a081b99518eee7c4a77064e77f7202d4bd Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:07 +0800 Subject: [PATCH 269/484] mm: shmem: support large folio allocation for shmem_replace_folio() commit 736f0e03564729a5ba609c2c4fcb37ff1e92ede4 upstream Conflicts: none Backport-reason: Swap cleanup and optimizations during allocator development To support large folio swapin for shmem in the following patches, add large folio allocation for the new replacement folio in shmem_replace_folio(). Moreover large folios occupy N consecutive entries in the swap cache instead of using multi-index entries like the page cache, therefore we should replace each consecutive entries in the swap cache instead of using the shmem_replace_entry(). As well as updating statistics and folio reference count using the number of pages in the folio. [baolin.wang@linux.alibaba.com: fix the gfp flag for large folio allocation] Link: https://lkml.kernel.org/r/5b1e9c5a-7f61-4d97-a8d7-41767ca04c77@linux.alibaba.com [baolin.wang@linux.alibaba.com: fix build without CONFIG_TRANSPARENT_HUGEPAGE] Link: https://lkml.kernel.org/r/8c03467c-63b2-43b4-9851-222d4188725c@linux.alibaba.com Link: https://lkml.kernel.org/r/a41138ecc857ef13e7c5ffa0174321e9e2c9970a.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 74 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2d2a97e940ba..977be86fce9c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -150,7 +150,7 @@ static unsigned long shmem_default_max_inodes(void) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct mm_struct *fault_mm, vm_fault_t *fault_type); + struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -1755,30 +1755,35 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + struct vm_area_struct *vma) { - struct folio *old, *new; - struct address_space *swap_mapping; - swp_entry_t entry; - pgoff_t swap_index; - int error; - - old = *foliop; - entry = old->swap; - swap_index = swap_cache_index(entry); - swap_mapping = swap_address_space(entry); + struct folio *new, *old = *foliop; + swp_entry_t entry = old->swap; + struct address_space *swap_mapping = swap_address_space(entry); + pgoff_t swap_index = swap_cache_index(entry); + XA_STATE(xas, &swap_mapping->i_pages, swap_index); + int nr_pages = folio_nr_pages(old); 
+ int error = 0, i; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, 0, info, index); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages > 1) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } +#endif + + new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; - folio_get(new); + folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); @@ -1788,18 +1793,25 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* - * Our caller will very soon move newpage out of swapcache, but it's - * a nice clean interface for us to replace oldpage by newpage there. - */ + /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, old, new); + for (i = 0; i < nr_pages; i++) { + void *item = xas_load(&xas); + + if (item != old) { + error = -ENOENT; + break; + } + + xas_store(&xas, new); + xas_next(&xas); + } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); - __lruvec_stat_mod_folio(new, NR_SHMEM, 1); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); - __lruvec_stat_mod_folio(old, NR_SHMEM, -1); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); + __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1819,7 +1831,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, old->private = NULL; folio_unlock(old); - folio_put_refs(old, 2); + /* + * The old folio are removed from swap cache, drop the 'nr_pages' + * reference, as well as one temporary reference getting from swap + * cache. + */ + folio_put_refs(old, nr_pages + 1); return error; } @@ -1858,10 +1875,11 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct mm_struct *fault_mm, + gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; + struct mm_struct *fault_mm = vma ? 
vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; @@ -1931,7 +1949,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_folio(&folio, gfp, info, index); + error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } @@ -2008,7 +2026,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, fault_mm, fault_type); + sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; -- Gitee From 6cb4deb4189e51f87837b6b2743f8b097d884b5c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:08 +0800 Subject: [PATCH 270/484] mm: shmem: drop folio reference count using 'nr_pages' in shmem_delete_from_page_cache() commit 872339c31f3b2dd466320a5bed54abeccf0db47b upstream Conflicts: none Backport-reason: Swap cleanup and optimizations during allocator development To support large folio swapin/swapout for shmem in the following patches, drop the folio's reference count by the number of pages contained in the folio when a shmem folio is deleted from shmem pagecache after adding into swap cache. Link: https://lkml.kernel.org/r/b371eadb27f42fc51261c51008fbb9a334985b4c.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 977be86fce9c..ceab3915fac5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -831,7 +831,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); xa_unlock_irq(&mapping->i_pages); - folio_put(folio); + folio_put_refs(folio, nr); BUG_ON(error); } -- Gitee From 6d76a4c27aa9cf35368e6c3100d32e0d5d1a01c3 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:09 +0800 Subject: [PATCH 271/484] mm: shmem: split large entry if the swapin folio is not large commit 12885cbe88ddf6c5fc3306193a6449ac310f2331 upstream Conflicts: none Backport-reason: Swap cleanup and optimizations during allocator development Now the swap device can only swap-in order 0 folio, even though a large folio is swapped out. This requires us to split the large entry previously saved in the shmem pagecache to support the swap in of small folios. 
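For example (numbers are illustrative): if a fault at index 35 finds an order-4 entry that was stored for indices 32..47, shmem_split_large_entry() splits it into sixteen order-0 entries with consecutive swap offsets, and the fault path then recomputes the entry it actually needs:

    offset = index - round_down(index, 1 << split_order);    /* 35 - 32 = 3 */
    swap   = swp_entry(swp_type(swap), swp_offset(swap) + offset);

i.e. the order-0 folio is read from swap offset base + 3 of the original large entry.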
[hughd@google.com: fix warnings from kmalloc_fix_flags()] Link: https://lkml.kernel.org/r/e2a2ba5d-864c-50aa-7579-97cba1c7dd0c@google.com [baolin.wang@linux.alibaba.com: drop the 'new_order' parameter] Link: https://lkml.kernel.org/r/39c71ccf-669b-4d9f-923c-f6b9c4ceb8df@linux.alibaba.com Link: https://lkml.kernel.org/r/4a0f12f27c54a62eb4d9ca1265fed3a62531a63e.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Hugh Dickins Cc: Barry Song Cc: Chris Li Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index ceab3915fac5..55968b14562a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1867,6 +1867,84 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, swap_free_nr(swap, nr_pages); } +static int shmem_split_large_entry(struct inode *inode, pgoff_t index, + swp_entry_t swap, gfp_t gfp) +{ + struct address_space *mapping = inode->i_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); + void *alloced_shadow = NULL; + int alloced_order = 0, i; + + /* Convert user data gfp flags to xarray node gfp flags */ + gfp &= GFP_RECLAIM_MASK; + + for (;;) { + int order = -1, split_order = 0; + void *old = NULL; + + xas_lock_irq(&xas); + old = xas_load(&xas); + if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + + order = xas_get_order(&xas); + + /* Swap entry may have changed before we re-acquire the lock */ + if (alloced_order && + (old != alloced_shadow || order != alloced_order)) { + xas_destroy(&xas); + alloced_order = 0; + } + + /* Try to split large swap entry in pagecache */ + if (order > 0) { + if (!alloced_order) { + split_order = order; + goto unlock; + } + xas_split(&xas, old, order); + + /* + * Re-set the swap entry after splitting, and the swap + * offset of the original large entry must be continuous. + */ + for (i = 0; i < 1 << order; i++) { + pgoff_t aligned_index = round_down(index, 1 << order); + swp_entry_t tmp; + + tmp = swp_entry(swp_type(swap), swp_offset(swap) + i); + __xa_store(&mapping->i_pages, aligned_index + i, + swp_to_radix_entry(tmp), 0); + } + } + +unlock: + xas_unlock_irq(&xas); + + /* split needed, alloc here and retry. */ + if (split_order) { + xas_split_alloc(&xas, old, split_order, gfp); + if (xas_error(&xas)) + goto error; + alloced_shadow = old; + alloced_order = split_order; + xas_reset(&xas); + continue; + } + + if (!xas_nomem(&xas, gfp)) + break; + } + +error: + if (xas_error(&xas)) + return xas_error(&xas); + + return alloced_order; +} + /* * Swap in the folio pointed to by *foliop. * Caller has to make sure that *foliop contains a valid swapped folio. @@ -1907,12 +1985,37 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { + int split_order; + /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + + /* + * Now swap device can only swap in order 0 folio, then we + * should split the large swap entry stored in the pagecache + * if necessary. 
+ */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + /* Here we actually start the io */ #ifdef CONFIG_CGROUP_SLI sli_memlat_stat_start(&start); -- Gitee From 06600cbca042472113d3f7988849a6201d5d9139 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:10 +0800 Subject: [PATCH 272/484] mm: shmem: support large folio swap out commit 809bc86517cc408b5b8cb8e08e69096639432bc8 upstream Conflicts: minor, due to SLI and already backported folio_alloc_swap Backport-reason: mTHP swapout for shmem Shmem will support large folio allocation [1] [2] to get a better performance, however, the memory reclaim still splits the precious large folios when trying to swap out shmem, which may lead to the memory fragmentation issue and can not take advantage of the large folio for shmeme. Moreover, the swap code already supports for swapping out large folio without split, hence this patch set supports the large folio swap out for shmem. Note the i915_gem_shmem driver still need to be split when swapping, thus add a new flag 'split_large_folio' for writeback_control to indicate spliting the large folio. [1] https://lore.kernel.org/all/cover.1717495894.git.baolin.wang@linux.alibaba.com/ [2] https://lore.kernel.org/all/20240515055719.32577-1-da.gomez@samsung.com/ [hughd@google.com: shmem_writepage() split folio at EOF before swapout] Link: https://lkml.kernel.org/r/aef55f8d-6040-692d-65e3-16150cce4440@google.com [baolin.wang@linux.alibaba.com: remove the wbc->split_large_folio per Hugh] Link: https://lkml.kernel.org/r/1236a002daa301b3b9ba73d6c0fab348427cf295.1724833399.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/d80c21abd20e1b0f5ca66b330f074060fb2f082d.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Hugh Dickins Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/writeback.h | 3 +++ mm/shmem.c | 24 +++++++++++++++++++----- mm/vmscan.c | 31 ++++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 12 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 598cf6b2abd7..018b6c6f6172 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -96,6 +96,9 @@ struct writeback_control { */ struct swap_iocb **swap_plug; + /* Target list for splitting a large folio */ + struct list_head *list; + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ diff --git a/mm/shmem.c b/mm/shmem.c index 55968b14562a..bdd16adec1cc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -775,7 +775,6 @@ static int shmem_add_to_page_cache(struct folio *folio, VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - VM_BUG_ON(expected && folio_test_large(folio)); folio_ref_add(folio, nr); folio->mapping = mapping; @@ -1453,6 +1452,7 @@ 
static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; int nr_pages; + bool split = false; /* * Our capabilities prevent regular writeback or sync from ever calling @@ -1471,14 +1471,26 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) goto redirty; /* - * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or - * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, - * and its shmem_writeback() needs them to be split when swapping. + * If CONFIG_THP_SWAP is not enabled, the large folio should be + * split when swapping. + * + * And shrinkage of pages beyond i_size does not split swap, so + * swapout of a large folio crossing i_size needs to split too + * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { + index = shmem_fallocend(inode, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + if ((index > folio->index && index < folio_next_index(folio)) || + !IS_ENABLED(CONFIG_THP_SWAP)) + split = true; + } + + if (split) { +try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page(page) < 0) + if (split_huge_page_to_list_to_order(page, wbc->list, 0)) goto redirty; folio = page_folio(page); folio_clear_dirty(folio); @@ -1543,6 +1555,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!info->swapped) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); + if (nr_pages > 1) + goto try_split; redirty: folio_mark_dirty(folio); if (wbc->for_reclaim) diff --git a/mm/vmscan.c b/mm/vmscan.c index f7e8d59ffcea..9f9665c2f052 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -686,7 +686,8 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, - struct scan_control *sc, struct swap_iocb **plug) + struct scan_control *sc, struct swap_iocb **plug, + struct list_head *folio_list) { #ifdef CONFIG_CGROUP_SLI u64 start; @@ -737,6 +738,14 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, .swap_plug = plug, }; + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is + * not enabled or contiguous swap entries are failed to + * allocate. + */ + if (shmem_mapping(mapping) && folio_test_large(folio)) + wbc.list = folio_list; + folio_set_reclaim(folio); #ifdef CONFIG_CGROUP_SLI if (!current_is_kswapd()) @@ -1362,11 +1371,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, */ folio_mark_dirty(folio); } - } else if (folio_test_swapbacked(folio) && - folio_test_large(folio)) { - /* Split shmem folio */ - if (split_folio_to_list(folio, folio_list)) - goto keep_locked; } /* @@ -1453,12 +1457,25 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(folio, mapping, sc, &plug)) { + switch (pageout(folio, mapping, sc, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: + /* + * If shmem folio is split when writeback to swap, + * the tail pages will make their own pass through + * this function and be accounted then. 
+ */ + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } goto activate_locked; case PAGE_SUCCESS: + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } stat->nr_pageout += nr_pages; if (folio_test_writeback(folio)) -- Gitee From f5b76375320888a275c9312480e9fa84c7c1392f Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 23 Aug 2024 20:04:39 +0100 Subject: [PATCH 273/484] mm: store zero pages to be swapped out in a bitmap commit 0ca0c24e3211586d44a31b3c4767fe3f43a008a7 upstream Conflicts: minor, we have zswap_invalidate in the loop already. Backport-reason: Swap Table merges zeromap so backport zeromap first Patch series "mm: store zero pages to be swapped out in a bitmap", v8. As shown in the patch series that introduced the zswap same-filled optimization [1], 10-20% of the pages stored in zswap are same-filled. This is also observed across Meta's server fleet. By using VM counters in swap_writepage (not included in this patchseries) it was found that less than 1% of the same-filled pages to be swapped out are non-zero pages. For conventional swap setup (without zswap), rather than reading/writing these pages to flash resulting in increased I/O and flash wear, a bitmap can be used to mark these pages as zero at write time, and the pages can be filled at read time if the bit corresponding to the page is set. When using zswap with swap, this also means that a zswap_entry does not need to be allocated for zero filled pages resulting in memory savings which would offset the memory used for the bitmap. A similar attempt was made earlier in [2] where zswap would only track zero-filled pages instead of same-filled. This patchseries adds zero-filled pages optimization to swap (hence it can be used even if zswap is disabled) and removes the same-filled code from zswap (as only 1% of the same-filled pages are non-zero), simplifying code. [1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/ [2] https://lore.kernel.org/lkml/20240325235018.2028408-1-yosryahmed@google.com/ This patch (of 2): Approximately 10-20% of pages to be swapped out are zero pages [1]. Rather than reading/writing these pages to flash resulting in increased I/O and flash wear, a bitmap can be used to mark these pages as zero at write time, and the pages can be filled at read time if the bit corresponding to the page is set. With this patch, NVMe writes in Meta server fleet decreased by almost 10% with conventional swap setup (zswap disabled). 
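In outline (simplified; the exact logic is in the hunks below), the swap-out and swap-in paths become:

    swap_writepage():
        if (is_folio_zero_filled(folio))
                swap_zeromap_folio_set(folio);    /* set bits, skip the write */
        else
                swap_zeromap_folio_clear(folio);  /* clear any stale bits     */

    swap_read_folio():
        if (swap_read_folio_zeromap(folio))       /* bits set for the folio   */
                goto finish;                      /* zero-fill it, no device read */

and swap_range_free() clears the bits again once the entries are freed.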
[1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/ Link: https://lkml.kernel.org/r/20240823190545.979059-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20240823190545.979059-2-usamaarif642@gmail.com Signed-off-by: Usama Arif Reviewed-by: Chengming Zhou Reviewed-by: Yosry Ahmed Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Cc: Usama Arif Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 1 + mm/page_io.c | 118 ++++++++++++++++++++++++++++++++++++++++++- mm/swapfile.c | 38 +++++++++++--- 3 files changed, 150 insertions(+), 7 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index a085b4b6be22..e9433c2aa375 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -303,6 +303,7 @@ struct swap_info_struct { signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ diff --git a/mm/page_io.c b/mm/page_io.c index 744fc6201f2b..4edb9f0d0cfb 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -171,6 +171,80 @@ int generic_swapfile_activate(struct swap_info_struct *sis, goto out; } +static bool is_folio_zero_filled(struct folio *folio) +{ + unsigned int pos, last_pos; + unsigned long *data; + unsigned int i; + + last_pos = PAGE_SIZE / sizeof(*data) - 1; + for (i = 0; i < folio_nr_pages(folio); i++) { + data = kmap_local_folio(folio, i * PAGE_SIZE); + /* + * Check last word first, incase the page is zero-filled at + * the start and has non-zero data at the end, which is common + * in real-world workloads. + */ + if (data[last_pos]) { + kunmap_local(data); + return false; + } + for (pos = 0; pos < last_pos; pos++) { + if (data[pos]) { + kunmap_local(data); + return false; + } + } + kunmap_local(data); + } + + return true; +} + +static void swap_zeromap_folio_set(struct folio *folio) +{ + struct swap_info_struct *sis = swp_swap_info(folio->swap); + swp_entry_t entry; + unsigned int i; + + for (i = 0; i < folio_nr_pages(folio); i++) { + entry = page_swap_entry(folio_page(folio, i)); + set_bit(swp_offset(entry), sis->zeromap); + } +} + +static void swap_zeromap_folio_clear(struct folio *folio) +{ + struct swap_info_struct *sis = swp_swap_info(folio->swap); + swp_entry_t entry; + unsigned int i; + + for (i = 0; i < folio_nr_pages(folio); i++) { + entry = page_swap_entry(folio_page(folio, i)); + clear_bit(swp_offset(entry), sis->zeromap); + } +} + +/* + * Return the index of the first subpage which is not zero-filled + * according to swap_info_struct->zeromap. + * If all pages are zero-filled according to zeromap, it will return + * folio_nr_pages(folio). + */ +static unsigned int swap_zeromap_folio_test(struct folio *folio) +{ + struct swap_info_struct *sis = swp_swap_info(folio->swap); + swp_entry_t entry; + unsigned int i; + + for (i = 0; i < folio_nr_pages(folio); i++) { + entry = page_swap_entry(folio_page(folio, i)); + if (!test_bit(swp_offset(entry), sis->zeromap)) + return i; + } + return i; +} + /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. 
@@ -194,6 +268,25 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_unlock(folio); return ret; } + + /* + * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages. + * The bits in zeromap are protected by the locked swapcache folio + * and atomic updates are used to protect against read-modify-write + * corruption due to other zero swap entries seeing concurrent updates. + */ + if (is_folio_zero_filled(folio)) { + swap_zeromap_folio_set(folio); + folio_unlock(folio); + return 0; + } else { + /* + * Clear bits this folio occupies in the zeromap to prevent + * zero data being read in from any previous zero writes that + * occupied the same swap entries. + */ + swap_zeromap_folio_clear(folio); + } if (zswap_store(folio)) { folio_unlock(folio); return 0; @@ -437,6 +530,26 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } +static bool swap_read_folio_zeromap(struct folio *folio) +{ + unsigned int idx = swap_zeromap_folio_test(folio); + + if (idx == 0) + return false; + + /* + * Swapping in a large folio that is partially in the zeromap is not + * currently handled. Return true without marking the folio uptodate so + * that an IO error is emitted (e.g. do_swap_page() will sigbus). + */ + if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) + return true; + + folio_zero_range(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); + return true; +} + static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); @@ -527,7 +640,10 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) } delayacct_swapin_start(); - if (zswap_load(folio)) { + if (swap_read_folio_zeromap(folio)) { + folio_unlock(folio); + goto finish; + } else if (zswap_load(folio)) { folio_unlock(folio); goto finish; } diff --git a/mm/swapfile.c b/mm/swapfile.c index c304fe1229fb..21e43e7e2a6a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1124,8 +1124,14 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; - for (i = 0; i < nr_entries; i++) + /* + * Use atomic clear_bit operations only on zeromap instead of non-atomic + * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. 
+ */ + for (i = 0; i < nr_entries; i++) { + clear_bit(offset + i, si->zeromap); zswap_invalidate(swp_entry(si->type, offset + i)); + } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = @@ -2542,7 +2548,8 @@ static int swap_node(struct swap_info_struct *si) static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, - struct swap_cluster_info *cluster_info) + struct swap_cluster_info *cluster_info, + unsigned long *zeromap) { int i; @@ -2567,6 +2574,7 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, } si->swap_map = swap_map; si->cluster_info = cluster_info; + si->zeromap = zeromap; } static void _enable_swap_info(struct swap_info_struct *si) @@ -2593,11 +2601,12 @@ static void _enable_swap_info(struct swap_info_struct *si) static void enable_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, - struct swap_cluster_info *cluster_info) + struct swap_cluster_info *cluster_info, + unsigned long *zeromap) { spin_lock(&swap_lock); spin_lock(&si->lock); - setup_swap_info(si, prio, swap_map, cluster_info); + setup_swap_info(si, prio, swap_map, cluster_info, zeromap); spin_unlock(&si->lock); spin_unlock(&swap_lock); /* @@ -2615,7 +2624,7 @@ static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); - setup_swap_info(si, si->prio, si->swap_map, si->cluster_info); + setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); @@ -2665,6 +2674,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; @@ -2775,6 +2785,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; + zeromap = p->zeromap; + p->zeromap = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; spin_unlock(&p->lock); @@ -2785,6 +2797,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) kfree(p->global_cluster); p->global_cluster = NULL; vfree(swap_map); + kvfree(zeromap); kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); @@ -3270,6 +3283,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + unsigned long *zeromap = NULL; struct swap_cluster_info *cluster_info = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -3361,6 +3375,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } + /* + * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might + * be above MAX_PAGE_ORDER incase of a large swap file. + */ + zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), + GFP_KERNEL | __GFP_ZERO); + if (!zeromap) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } + if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |= SWP_STABLE_WRITES; @@ -3436,7 +3461,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(si, prio, swap_map, cluster_info); + enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. 
Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, @@ -3469,6 +3494,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) si->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); + kvfree(zeromap); kvfree(cluster_info); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); -- Gitee From c963e3b87c984b2b3f81e1813aa5b03ec9b22576 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 May 2024 03:38:16 +0000 Subject: [PATCH 274/484] mm: zswap: use sg_set_folio() in zswap_{compress/decompress}() commit 5d19f5de673bdee5e711c85737dd5ce5d438d8a3 upstream Conflicts: none Backport-reason: ZSWAP mass update Patch series "mm: zswap: trivial folio conversions". Some trivial folio conversions in zswap code. This patch (of 3): sg_set_folio() is equivalent to sg_set_page() for order-0 folios, which are the only ones supported by zswap. Now zswap_decompress() can take in a folio directly. Link: https://lkml.kernel.org/r/20240524033819.1953587-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20240524033819.1953587-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7775ca968348..1796e4dcd2b1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -902,7 +902,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) dst = acomp_ctx->buffer; sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + sg_set_folio(&input, folio, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, @@ -956,7 +956,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) return comp_ret == 0 && alloc_ret == 0; } -static void zswap_decompress(struct zswap_entry *entry, struct page *page) +static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) { struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; @@ -975,7 +975,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); + sg_set_folio(&output, folio, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); @@ -1055,7 +1055,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -ENOMEM; } - zswap_decompress(entry, &folio->page); + zswap_decompress(entry, folio); count_vm_event(ZSWPWB); if (entry->objcg) @@ -1636,7 +1636,7 @@ bool zswap_load(struct folio *folio) return false; if (entry->length) - zswap_decompress(entry, page); + zswap_decompress(entry, folio); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); -- Gitee From 8d29c9b90928764398bdaa22f87d3a19c2be06c7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 May 2024 03:38:17 +0000 Subject: [PATCH 275/484] mm :zswap: use kmap_local_folio() in zswap_load() commit 30a28baafc882fc2cf0f512256081dba03218c8f upstream Conflicts: none Backport-reason: ZSWAP mass update Eliminate the last explicit 'struct page' reference in mm/zswap.c. 
Link: https://lkml.kernel.org/r/20240524033819.1953587-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 1796e4dcd2b1..912b3a1349fd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1593,7 +1593,6 @@ bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; bool swapcache = folio_test_swapcache(folio); struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; @@ -1638,7 +1637,7 @@ bool zswap_load(struct folio *folio) if (entry->length) zswap_decompress(entry, folio); else { - dst = kmap_local_page(page); + dst = kmap_local_folio(folio, 0); zswap_fill_page(dst, entry->value); kunmap_local(dst); } -- Gitee From 05f09d17f91c88ab9b462d1a42e518d313b0a671 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 24 May 2024 03:38:18 +0000 Subject: [PATCH 276/484] mm: zswap: make same_filled functions folio-friendly commit 5a3f572a592850e8b0a4a8668781820391a2e2e7 upstream Conflicts: none Backport-reason: ZSWAP mass update A variable name 'page' is used in zswap_is_folio_same_filled() and zswap_fill_page() to point at the kmapped data in a folio. Use 'data' instead to avoid confusion and stop it from showing up when searching for 'page' references in mm/zswap.c. While we are at it, move the kmap/kunmap calls into zswap_fill_page(), make it take in a folio, and rename it to zswap_fill_folio(). Link: https://lkml.kernel.org/r/20240524033819.1953587-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 912b3a1349fd..194dd4add1c6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1416,35 +1416,35 @@ static void shrink_worker(struct work_struct *w) **********************************/ static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) { - unsigned long *page; + unsigned long *data; unsigned long val; - unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1; + unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; bool ret = false; - page = kmap_local_folio(folio, 0); - val = page[0]; + data = kmap_local_folio(folio, 0); + val = data[0]; - if (val != page[last_pos]) + if (val != data[last_pos]) goto out; for (pos = 1; pos < last_pos; pos++) { - if (val != page[pos]) + if (val != data[pos]) goto out; } *value = val; ret = true; out: - kunmap_local(page); + kunmap_local(data); return ret; } -static void zswap_fill_page(void *ptr, unsigned long value) +static void zswap_fill_folio(struct folio *folio, unsigned long value) { - unsigned long *page; + unsigned long *data = kmap_local_folio(folio, 0); - page = (unsigned long *)ptr; - memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); + memset_l(data, value, PAGE_SIZE / sizeof(unsigned long)); + kunmap_local(data); } /********************************* @@ -1596,7 +1596,6 @@ bool zswap_load(struct folio *folio) bool swapcache = folio_test_swapcache(folio); struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; - u8 *dst; VM_WARN_ON_ONCE(!folio_test_locked(folio)); @@ -1636,11 +1635,8 @@ bool 
zswap_load(struct folio *folio) if (entry->length) zswap_decompress(entry, folio); - else { - dst = kmap_local_folio(folio, 0); - zswap_fill_page(dst, entry->value); - kunmap_local(dst); - } + else + zswap_fill_folio(folio, entry->value); count_vm_event(ZSWPIN); if (entry->objcg) -- Gitee From 6a3b84ef6aee1e9f67d61ea91f189c27bf45df2e Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 23 Aug 2024 20:04:40 +0100 Subject: [PATCH 277/484] mm: remove code to handle same filled pages commit 20a5532ffa53d6ecf41ded920a7b0ff9c65a7dcf upstream Conflicts: none Backport-reason: Swap Table merges zeromap so backport zeromap first With an earlier commit to handle zero-filled pages in swap directly, and with only 1% of the same-filled pages being non-zero, zswap no longer needs to handle same-filled pages and can just work on compressed pages. Link: https://lkml.kernel.org/r/20240823190545.979059-3-usamaarif642@gmail.com Signed-off-by: Usama Arif Reviewed-by: Chengming Zhou Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: Andi Kleen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 85 +++++------------------------------------------------- 1 file changed, 8 insertions(+), 77 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 194dd4add1c6..d5a136405834 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -44,8 +44,6 @@ **********************************/ /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); -/* The number of same-value filled pages currently stored in zswap */ -static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -184,8 +182,7 @@ static struct shrinker *zswap_shrinker; * * swpentry - associated swap entry, the offset indexes into the red-black tree * length - the length in bytes of the compressed page data. Needed during - * decompression. For a same value filled page length is 0, and both - * pool and lru are invalid and must be ignored. + * decompression. * referenced - true if the entry recently entered the zswap pool. Unset by the * writeback logic. The entry is only reclaimed by the writeback * logic if referenced is unset. See comments in the shrinker @@ -201,10 +198,7 @@ struct zswap_entry { unsigned int length; bool referenced; struct zswap_pool *pool; - union { - unsigned long handle; - unsigned long value; - }; + unsigned long handle; struct obj_cgroup *objcg; struct list_head lru; }; @@ -800,13 +794,9 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) */ static void zswap_entry_free(struct zswap_entry *entry) { - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zswap_lru_del(&zswap_list_lru, entry); - zpool_free(entry->pool->zpool, entry->handle); - zswap_pool_put(entry->pool); - } + zswap_lru_del(&zswap_list_lru, entry); + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); @@ -1272,11 +1262,6 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). 
- * - * The memory saving factor calculated here takes same-filled pages into - * account, but those are not freeable since they almost occupy no - * space. Hence, we may scale nr_freeable down a little bit more than we - * should if we have a lot of same-filled pages. */ return mult_frac(nr_freeable, nr_backing, nr_stored); } @@ -1411,42 +1396,6 @@ static void shrink_worker(struct work_struct *w) } while (zswap_total_pages() > thr); } -/********************************* -* same-filled functions -**********************************/ -static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) -{ - unsigned long *data; - unsigned long val; - unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; - bool ret = false; - - data = kmap_local_folio(folio, 0); - val = data[0]; - - if (val != data[last_pos]) - goto out; - - for (pos = 1; pos < last_pos; pos++) { - if (val != data[pos]) - goto out; - } - - *value = val; - ret = true; -out: - kunmap_local(data); - return ret; -} - -static void zswap_fill_folio(struct folio *folio, unsigned long value) -{ - unsigned long *data = kmap_local_folio(folio, 0); - - memset_l(data, value, PAGE_SIZE / sizeof(unsigned long)); - kunmap_local(data); -} - /********************************* * main API **********************************/ @@ -1458,7 +1407,6 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - unsigned long value; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1491,13 +1439,6 @@ bool zswap_store(struct folio *folio) goto reject; } - if (zswap_is_folio_same_filled(folio, &value)) { - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto store_entry; - } - /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) @@ -1515,7 +1456,6 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; -store_entry: entry->swpentry = swp; entry->objcg = objcg; entry->referenced = true; @@ -1564,13 +1504,9 @@ bool zswap_store(struct folio *folio) return true; store_failed: - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zpool_free(entry->pool->zpool, entry->handle); + zpool_free(entry->pool->zpool, entry->handle); put_pool: - zswap_pool_put(entry->pool); - } + zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); reject: @@ -1633,10 +1569,7 @@ bool zswap_load(struct folio *folio) if (!entry) return false; - if (entry->length) - zswap_decompress(entry, folio); - else - zswap_fill_folio(folio, entry->value); + zswap_decompress(entry, folio); count_vm_event(ZSWPIN); if (entry->objcg) @@ -1739,8 +1672,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); - debugfs_create_atomic_t("same_filled_pages", 0444, - zswap_debugfs_root, &zswap_same_filled_pages); return 0; } -- Gitee From 5b43aa216895a747e23af9fdf7dff7b2833d000d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 9 Sep 2024 11:21:17 +1200 Subject: [PATCH 278/484] mm: fix swap_read_folio_zeromap() for large folios with partial zeromap commit 9d57090e73d5e00e946d7fdd6398c2c0bc3b5525 upstream Conflicts: none Backport-reason: Swap Table merges zeromap so backport zeromap first Patch series "mm: enable large folios swap-in support", v9. 
Currently, we support mTHP swapout but not swapin. This means that once mTHP is swapped out, it will come back as small folios when swapped in. This is particularly detrimental for devices like Android, where more than half of the memory is in swap. The lack of mTHP swapin functionality makes mTHP a showstopper in scenarios that heavily rely on swap. This patchset introduces mTHP swap-in support. It starts with synchronous devices similar to zRAM, aiming to benefit as many users as possible with minimal changes. This patch (of 3): There could be a corner case where the first entry is non-zeromap, but a subsequent entry is zeromap. In this case, we should not let swap_read_folio_zeromap() return false since we will still read corrupted data. Additionally, the iteration of test_bit() is unnecessary and can be replaced with bitmap operations, which are more efficient. We can adopt the style of swap_pte_batch() and folio_pte_batch() to introduce swap_zeromap_batch() which seems to provide the greatest flexibility for the caller. This approach allows the caller to either check if the zeromap status of all entries is consistent or determine the number of contiguous entries with the same status. Since swap_read_folio() can't handle reading a large folio that's partially zeromap and partially non-zeromap, we've moved the code to mm/swap.h so that others, like those working on swap-in, can access it. Link: https://lkml.kernel.org/r/20240908232119.2157-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240908232119.2157-2-21cnbao@gmail.com Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap") Signed-off-by: Barry Song Reviewed-by: Yosry Ahmed Reviewed-by: Usama Arif Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: Chuanhua Han Cc: David Hildenbrand Cc: Gao Xiang Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Kairui Song Cc: Kalesh Singh Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Nhat Pham Cc: Ryan Roberts Cc: Sergey Senozhatsky Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Kanchana P Sridhar Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 32 +++++++------------------------- mm/swap.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 4edb9f0d0cfb..859726b85b03 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -225,26 +225,6 @@ static void swap_zeromap_folio_clear(struct folio *folio) } } -/* - * Return the index of the first subpage which is not zero-filled - * according to swap_info_struct->zeromap. - * If all pages are zero-filled according to zeromap, it will return - * folio_nr_pages(folio). - */ -static unsigned int swap_zeromap_folio_test(struct folio *folio) -{ - struct swap_info_struct *sis = swp_swap_info(folio->swap); - swp_entry_t entry; - unsigned int i; - - for (i = 0; i < folio_nr_pages(folio); i++) { - entry = page_swap_entry(folio_page(folio, i)); - if (!test_bit(swp_offset(entry), sis->zeromap)) - return i; - } - return i; -} - /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. 
@@ -532,19 +512,21 @@ static void sio_read_complete(struct kiocb *iocb, long ret) static bool swap_read_folio_zeromap(struct folio *folio) { - unsigned int idx = swap_zeromap_folio_test(folio); - - if (idx == 0) - return false; + int nr_pages = folio_nr_pages(folio); + bool is_zeromap; /* * Swapping in a large folio that is partially in the zeromap is not * currently handled. Return true without marking the folio uptodate so * that an IO error is emitted (e.g. do_swap_page() will sigbus). */ - if (WARN_ON_ONCE(idx < folio_nr_pages(folio))) + if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages, + &is_zeromap) != nr_pages)) return true; + if (!is_zeromap) + return false; + folio_zero_range(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); return true; diff --git a/mm/swap.h b/mm/swap.h index 8f364a00d120..faa41d242cd7 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -79,6 +79,32 @@ static inline unsigned int folio_swap_flags(struct folio *folio) { return swp_swap_info(folio->swap)->flags; } + +/* + * Return the count of contiguous swap entries that share the same + * zeromap status as the starting entry. If is_zeromap is not NULL, + * it will return the zeromap status of the starting entry. + */ +static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *is_zeromap) +{ + struct swap_info_struct *sis = swp_swap_info(entry); + unsigned long start = swp_offset(entry); + unsigned long end = start + max_nr; + bool first_bit; + + first_bit = test_bit(start, sis->zeromap); + if (is_zeromap) + *is_zeromap = first_bit; + + if (max_nr <= 1) + return max_nr; + if (first_bit) + return find_next_zero_bit(sis->zeromap, end, start) - start; + else + return find_next_bit(sis->zeromap, end, start) - start; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -165,6 +191,13 @@ static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } + +static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *has_zeromap) +{ + return 0; +} + #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ -- Gitee From 24162b190d29b9972bca1f8833fca9b54826983f Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 9 Sep 2024 11:21:18 +1200 Subject: [PATCH 279/484] mm: add nr argument in mem_cgroup_swapin_uncharge_swap() helper to support large folios commit 325efb16da2c840e165d9b620fec8049d4d664cc upstream Conflicts: none Backport-reason: Swap Allocator Development With large folios swap-in, we might need to uncharge multiple entries all together, add nr argument in mem_cgroup_swapin_uncharge_swap(). For the existing two users, just pass nr=1. 
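As a rough sketch of the intended usage (mirroring the large folio swap-in path added later in this series), a caller that swaps in a large folio uncharges the whole range of entries with a single call:

    nr_pages = folio_nr_pages(folio);
    if (folio_test_large(folio))
        entry.val = ALIGN_DOWN(entry.val, nr_pages);
    /* one call covers all nr_pages swap entries backing the folio */
    mem_cgroup_swapin_uncharge_swap(entry, nr_pages);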
Link: https://lkml.kernel.org/r/20240908232119.2157-3-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: Chris Li Reviewed-by: Yosry Ahmed Cc: Shakeel Butt Cc: Baolin Wang Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Kairui Song Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Nhat Pham Cc: Ryan Roberts Cc: Sergey Senozhatsky Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Chuanhua Han Cc: Kanchana P Sridhar Cc: Usama Arif Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 7 ++++--- mm/memory.c | 2 +- mm/swap_state.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 21871192b84b..9e0316a327ba 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -821,7 +821,8 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); + +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); void __mem_cgroup_uncharge(struct folio *folio); @@ -1479,7 +1480,7 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, return 0; } -static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d0e731a4fe9..d5c3100c902f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -9379,14 +9379,15 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, /* * mem_cgroup_swapin_uncharge_swap - uncharge swap slot - * @entry: swap entry for which the page is charged + * @entry: the first swap entry for which the pages are charged + * @nr_pages: number of pages which will be uncharged * * Call this function after successfully adding the charged page to swapcache. * * Note: This function assumes the page for which swap slot is being uncharged * is order 0 page. */ -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { /* * Cgroup1's unified memory+swap counter has been charged with the @@ -9406,7 +9407,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. 
*/ - mem_cgroup_uncharge_swap(entry, 1); + mem_cgroup_uncharge_swap(entry, nr_pages); } } diff --git a/mm/memory.c b/mm/memory.c index c4636615d315..bad6f596ae19 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4022,7 +4022,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = VM_FAULT_OOM; goto out_page; } - mem_cgroup_swapin_uncharge_swap(entry); + mem_cgroup_swapin_uncharge_swap(entry, 1); shadow = get_shadow_from_swap_cache(entry); if (shadow) diff --git a/mm/swap_state.c b/mm/swap_state.c index 6b43d8867d42..623147db9e1f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -454,7 +454,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - mem_cgroup_swapin_uncharge_swap(entry); + mem_cgroup_swapin_uncharge_swap(entry, 1); if (shadow) workingset_refault(new_folio, shadow); -- Gitee From 44744c6dda66235a9ca1216b4438fd2521e36369 Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Mon, 9 Sep 2024 11:21:19 +1200 Subject: [PATCH 280/484] mm: support large folios swap-in for sync io devices commit 242d12c98174584a18965cfab95778893872d650 upstream Conflicts: none Backport-reason: mTHP swapin Currently, we have mTHP features, but unfortunately, without support for large folio swap-ins, once these large folios are swapped out, they are lost because mTHP swap is a one-way process. The lack of mTHP swap-in functionality prevents mTHP from being used on devices like Android that heavily rely on swap. This patch introduces mTHP swap-in support. It starts from sync devices such as zRAM. This is probably the simplest and most common use case, benefiting billions of Android phones and similar devices with minimal implementation cost. In this straightforward scenario, large folios are always exclusive, eliminating the need to handle complex rmap and swapcache issues. It offers several benefits: 1. Enables bidirectional mTHP swapping, allowing retrieval of mTHP after swap-out and swap-in. Large folios in the buddy system are also preserved as much as possible, rather than being fragmented due to swap-in. 2. Eliminates fragmentation in swap slots and supports successful THP_SWPOUT. 
w/o this patch (Refer to the data from Chris's and Kairui's latest swap allocator optimization while running ./thp_swap_allocator_test w/o "-a" option [1]): ./thp_swap_allocator_test Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 2: swpout inc: 131, swpout fallback inc: 101, Fallback percentage: 43.53% Iteration 3: swpout inc: 71, swpout fallback inc: 155, Fallback percentage: 68.58% Iteration 4: swpout inc: 55, swpout fallback inc: 168, Fallback percentage: 75.34% Iteration 5: swpout inc: 35, swpout fallback inc: 191, Fallback percentage: 84.51% Iteration 6: swpout inc: 25, swpout fallback inc: 199, Fallback percentage: 88.84% Iteration 7: swpout inc: 23, swpout fallback inc: 205, Fallback percentage: 89.91% Iteration 8: swpout inc: 9, swpout fallback inc: 219, Fallback percentage: 96.05% Iteration 9: swpout inc: 13, swpout fallback inc: 213, Fallback percentage: 94.25% Iteration 10: swpout inc: 12, swpout fallback inc: 216, Fallback percentage: 94.74% Iteration 11: swpout inc: 16, swpout fallback inc: 213, Fallback percentage: 93.01% Iteration 12: swpout inc: 10, swpout fallback inc: 210, Fallback percentage: 95.45% Iteration 13: swpout inc: 16, swpout fallback inc: 212, Fallback percentage: 92.98% Iteration 14: swpout inc: 12, swpout fallback inc: 212, Fallback percentage: 94.64% Iteration 15: swpout inc: 15, swpout fallback inc: 211, Fallback percentage: 93.36% Iteration 16: swpout inc: 15, swpout fallback inc: 200, Fallback percentage: 93.02% Iteration 17: swpout inc: 9, swpout fallback inc: 220, Fallback percentage: 96.07% w/ this patch (always 0%): Iteration 1: swpout inc: 948, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 2: swpout inc: 953, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 3: swpout inc: 950, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 4: swpout inc: 952, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 5: swpout inc: 950, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 6: swpout inc: 950, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 7: swpout inc: 947, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 8: swpout inc: 950, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 9: swpout inc: 950, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 10: swpout inc: 945, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 11: swpout inc: 947, swpout fallback inc: 0, Fallback percentage: 0.00% ... 3. With both mTHP swap-out and swap-in supported, we offer the option to enable zsmalloc compression/decompression with larger granularity[2]. The upcoming optimization in zsmalloc will significantly increase swap speed and improve compression efficiency. Tested by running 100 iterations of swapping 100MiB of anon memory, the swap speed improved dramatically: time consumption of swapin(ms) time consumption of swapout(ms) lz4 4k 45274 90540 lz4 64k 22942 55667 zstdn 4k 85035 186585 zstdn 64k 46558 118533 The compression ratio also improved, as evaluated with 1 GiB of data: granularity orig_data_size compr_data_size 4KiB-zstd 1048576000 246876055 64KiB-zstd 1048576000 199763892 Without mTHP swap-in, the potential optimizations in zsmalloc cannot be realized. 4. Even mTHP swap-in itself can reduce swap-in page faults by a factor of nr_pages. Swapping in content filled with the same data 0x11, w/o and w/ the patch for five rounds (Since the content is the same, decompression will be very fast. 
This primarily assesses the impact of reduced page faults): swp in bandwidth(bytes/ms) w/o w/ round1 624152 1127501 round2 631672 1127501 round3 620459 1139756 round4 606113 1139756 round5 624152 1152281 avg 621310 1137359 +83% 5. With both mTHP swap-out and swap-in supported, we offer the option to enable hardware accelerators(Intel IAA) to do parallel decompression with which Kanchana reported 7X improvement on zRAM read latency[3]. [1] https://lore.kernel.org/all/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/ [2] https://lore.kernel.org/all/20240327214816.31191-1-21cnbao@gmail.com/ [3] https://lore.kernel.org/all/cover.1714581792.git.andre.glover@linux.intel.com/ Link: https://lkml.kernel.org/r/20240908232119.2157-4-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Nhat Pham Cc: Ryan Roberts Cc: Sergey Senozhatsky Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Cc: Usama Arif Cc: Kanchana P Sridhar Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 261 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 234 insertions(+), 27 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index bad6f596ae19..0d36fe555fda 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3907,6 +3907,194 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +static struct folio *__alloc_swap_folio(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + swp_entry_t entry; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + vmf->address, false); + if (!folio) + return NULL; + + entry = pte_to_swp_entry(vmf->orig_pte); + if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, + GFP_KERNEL, entry)) { + folio_put(folio); + return NULL; + } + + return folio; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing swap_read_folio, which is + * the case the being faulted pte doesn't have swapcache. We need to + * ensure all PTEs have no cache as well, otherwise, we might go to + * swap devices while the content is in swapcache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + +/* + * Check if the PTEs within a range are contiguous swap entries + * and have consistent swapcache, zeromap. + */ +static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) +{ + unsigned long addr; + swp_entry_t entry; + int idx; + pte_t pte; + + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); + idx = (vmf->address - addr) / PAGE_SIZE; + pte = ptep_get(ptep); + + if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) + return false; + entry = pte_to_swp_entry(pte); + if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) + return false; + + /* + * swap_read_folio() can't handle the case a large folio is hybridly + * from different backends. And they are likely corner cases. Similar + * things might be added once zswap support large folios. 
+ */ + if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) + return false; + if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) + return false; + + return true; +} + +static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, + unsigned long addr, + unsigned long orders) +{ + int order, nr; + + order = highest_order(orders); + + /* + * To swap in a THP with nr pages, we require that its first swap_offset + * is aligned with that number, as it was when the THP was swapped out. + * This helps filter out most invalid entries. + */ + while (orders) { + nr = 1 << order; + if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr) + break; + order = next_order(&orders, order); + } + + return orders; +} + +static struct folio *alloc_swap_folio(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long orders; + struct folio *folio; + unsigned long addr; + swp_entry_t entry; + spinlock_t *ptl; + pte_t *pte; + gfp_t gfp; + int order; + + /* + * If uffd is active for the vma we need per-page fault fidelity to + * maintain the uffd semantics. + */ + if (unlikely(userfaultfd_armed(vma))) + goto fallback; + + /* + * A large swapped out folio could be partially or fully in zswap. We + * lack handling for such cases, so fallback to swapping in order-0 + * folio. + */ + if (!zswap_never_enabled()) + goto fallback; + + entry = pte_to_swp_entry(vmf->orig_pte); + /* + * Get a list of all the (large) orders below PMD_ORDER that are enabled + * and suitable for swapping THP. + */ + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + orders = thp_swap_suitable_orders(swp_offset(entry), + vmf->address, orders); + + if (!orders) + goto fallback; + + pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address & PMD_MASK, &ptl); + if (unlikely(!pte)) + goto fallback; + + /* + * For do_swap_page, find the highest order where the aligned range is + * completely swap entries with contiguous swap offsets. + */ + order = highest_order(orders); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) + break; + order = next_order(&orders, order); + } + + pte_unmap_unlock(pte, ptl); + + /* Try allocating the highest of the remaining orders. */ + gfp = vma_thp_gfp_mask(vma); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vma, addr, true); + if (folio) { + if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, + gfp, entry)) + return folio; + folio_put(folio); + } + order = next_order(&orders, order); + } + +fallback: + return __alloc_swap_folio(vmf); +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) +{ + return false; +} + +static struct folio *alloc_swap_folio(struct vm_fault *vmf) +{ + return __alloc_swap_folio(vmf); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3995,34 +4183,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { - /* - * Prevent parallel swapin from proceeding with - * the cache flag. 
Otherwise, another thread may - * finish swapin first, free the entry, and swapout - * reusing the same entry. It's undetectable as - * pte_same() returns true due to entry reuse. - */ - if (swapcache_prepare(entry, 1)) { - /* Relax a bit to prevent rapid repeated page faults */ - schedule_timeout_uninterruptible(1); - goto out; - } - need_clear_cache = true; - /* skip swapcache */ - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, - vma, vmf->address, false); + folio = alloc_swap_folio(vmf); if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_folio(folio, - vma->vm_mm, GFP_KERNEL, - entry)) { - ret = VM_FAULT_OOM; + nr_pages = folio_nr_pages(folio); + if (folio_test_large(folio)) + entry.val = ALIGN_DOWN(entry.val, nr_pages); + /* + * Prevent parallel swapin from proceeding with + * the cache flag. Otherwise, another thread + * may finish swapin first, free the entry, and + * swapout reusing the same entry. It's + * undetectable as pte_same() returns true due + * to entry reuse. + */ + if (swapcache_prepare(entry, nr_pages)) { + /* + * Relax a bit to prevent rapid + * repeated page faults. + */ + schedule_timeout_uninterruptible(1); goto out_page; } - mem_cgroup_swapin_uncharge_swap(entry, 1); + need_clear_cache = true; + + mem_cgroup_swapin_uncharge_swap(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) @@ -4128,6 +4316,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } + /* allocated large folios for SWP_SYNCHRONOUS_IO */ + if (folio_test_large(folio) && !folio_test_swapcache(folio)) { + unsigned long nr = folio_nr_pages(folio); + unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); + unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; + pte_t *folio_ptep = vmf->pte - idx; + pte_t folio_pte = ptep_get(folio_ptep); + + if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || + swap_pte_batch(folio_ptep, nr, folio_pte) != nr) + goto out_nomap; + + page_idx = idx; + address = folio_start; + ptep = folio_ptep; + goto check_folio; + } + nr_pages = 1; page_idx = 0; address = vmf->address; @@ -4255,11 +4461,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_add_lru_vma(folio, vma); } else if (!folio_test_anon(folio)) { /* - * We currently only expect small !anon folios, which are either - * fully exclusive or fully shared. If we ever get large folios - * here, we have to be careful. + * We currently only expect small !anon folios which are either + * fully exclusive or fully shared, or new allocated large + * folios which are fully exclusive. If we ever get large + * folios within swapcache here, we have to be careful. 
*/ - VM_WARN_ON_ONCE(folio_test_large(folio)); + VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { @@ -4302,7 +4509,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) - swapcache_clear(si, entry, 1); + swapcache_clear(si, entry, nr_pages); if (si) put_swap_device(si); return ret; @@ -4318,7 +4525,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_put(swapcache); } if (need_clear_cache) - swapcache_clear(si, entry, 1); + swapcache_clear(si, entry, nr_pages); if (si) put_swap_device(si); return ret; -- Gitee From 2ba86cc5af2ec7fe545f389c9e549bee03846c2a Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Wed, 2 Oct 2024 12:42:13 -0700 Subject: [PATCH 281/484] mm: zswap: delete comments for "value" member of 'struct zswap_entry'. commit aa5f0fa6af38d96bc6f1b7e1534f5b5c025930a6 upstream Conflicts: none Backport-reason: Swap Table merges zeromap so backport zeromap first Made a minor edit in the comments for 'struct zswap_entry' to delete the description of the 'value' member that was deleted in commit 20a5532ffa53 ("mm: remove code to handle same filled pages"). Link: https://lkml.kernel.org/r/20241002194213.30041-1-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Fixes: 20a5532ffa53 ("mm: remove code to handle same filled pages") Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Usama Arif Cc: Chengming Zhou Cc: Huang Ying Cc: Johannes Weiner Cc: Kanchana P Sridhar Cc: Ryan Roberts Cc: Wajdi Feghali Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index d5a136405834..e1501bcc1378 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -189,7 +189,6 @@ static struct shrinker *zswap_shrinker; * section for context. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data - * value - value of the same-value filled pages which have same content * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */ -- Gitee From c2c2f437159b2a82801390ec74ab0a5360509442 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 24 Oct 2024 10:02:01 +1300 Subject: [PATCH 282/484] mm: fix PSWPIN counter for large folios swap-in commit b54e1bfecc4b2775c184d2edb319232b853a686d upstream Conflicts: none Backport-reason: Swap Allocator & mTHP statistic Similar to PSWPOUT, we should count the number of base pages instead of large folios. Link: https://lkml.kernel.org/r/20241023210201.2798-1-21cnbao@gmail.com Fixes: 242d12c98174 ("mm: support large folios swap-in for sync io devices") Signed-off-by: Barry Song Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Chris Li Cc: Yosry Ahmed Cc: "Huang, Ying" Cc: Kairui Song Cc: Ryan Roberts Cc: Kanchana P Sridhar Cc: Usama Arif Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 859726b85b03..d9bbf94d2ae1 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -580,7 +580,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio, * attempt to access it in the page fault retry time check. 
*/ get_task_struct(current); - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); __end_swap_bio_read(&bio); put_task_struct(current); @@ -595,7 +595,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); } -- Gitee From 9a86ebfbb4f3615a9ee292b982b192eb7665fa60 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 1 Sep 2025 19:44:08 +0800 Subject: [PATCH 283/484] mm: fix PSWPIN counter for large folios swap-in for SYNC_IO devices Upstream: no We have different swapin path for SYNC_IO device so fix it differently. Signed-off-by: Kairui Song --- mm/page_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_io.c b/mm/page_io.c index d9bbf94d2ae1..e9f8245399f0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -639,7 +639,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) int ret = bdev_swapin_folio(sis->bdev, swap_folio_sector(folio), folio); if (ret != -EOPNOTSUPP) { if (!ret) - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); goto finish; } swap_read_folio_bdev_sync(folio, sis); -- Gitee From de4699977e36deb0e689c874c382911498d17180 Mon Sep 17 00:00:00 2001 From: Jingxiang Zeng Date: Fri, 30 Aug 2024 16:22:44 +0800 Subject: [PATCH 284/484] mm/memcontrol: add per-memcg pgpgin/pswpin counter commit 15ff4d409e1a6f939d94d2005ae275c26b2b0d9d upstream Conflicts: minor, SYNC_IO path is different Backport-reason: mTHP swap support In proactive memory reclamation scenarios, it is necessary to estimate the pswpin and pswpout metrics of the cgroup to determine whether to continue reclaiming anonymous pages in the current batch. This patch will collect these metrics and expose them. 
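For example, with this change a userspace reclaimer is expected to be able to read the per-cgroup values along these lines (illustrative cgroup path and numbers):

    $ grep -E 'pswpin|pswpout' /sys/fs/cgroup/<memcg>/memory.stat
    pswpin 1024
    pswpout 4096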
[linuszeng@tencent.com: v2] Link: https://lkml.kernel.org/r/20240830082244.156923-1-jingxiangzeng.cas@gmail.com Li nk: https://lkml.kernel.org/r/20240913084453.3605621-1-jingxiangzeng.cas@gmail.com Link: https://lkml.kernel.org/r/20240830082244.156923-1-jingxiangzeng.cas@gmail.com Signed-off-by: Jingxiang Zeng Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memcontrol.c | 2 ++ mm/page_io.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5c3100c902f..694d8ceb80d9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -612,6 +612,8 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) static const unsigned int memcg_vm_event_stat[] = { PGPGIN, PGPGOUT, + PSWPIN, + PSWPOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, diff --git a/mm/page_io.c b/mm/page_io.c index e9f8245399f0..112571c2b1ca 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -291,6 +291,7 @@ static inline void count_swpout_vm_event(struct folio *folio) } count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT); #endif + count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio)); count_vm_events(PSWPOUT, folio_nr_pages(folio)); } @@ -495,6 +496,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); + count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); folio_mark_uptodate(folio); folio_unlock(folio); } @@ -580,6 +582,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio, * attempt to access it in the page fault retry time check. */ get_task_struct(current); + count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); __end_swap_bio_read(&bio); @@ -595,6 +598,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); + count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); } @@ -638,8 +642,10 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) } else if (synchronous) { int ret = bdev_swapin_folio(sis->bdev, swap_folio_sector(folio), folio); if (ret != -EOPNOTSUPP) { - if (!ret) + if (!ret) { + count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); + } goto finish; } swap_read_folio_bdev_sync(folio, sis); -- Gitee From 3298d37f15c112adf5bcd7d35b192b66cbcbe31b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 23 May 2024 10:36:39 +0800 Subject: [PATCH 285/484] mm: drop the 'anon_' prefix for swap-out mTHP counters commit 0d648dd5c899f33154b98a6aef6e3dab0f4de613 upstream Conflicts: none Backport-reason: Swap Allocator: better statistic The mTHP swap related counters: 'anon_swpout' and 'anon_swpout_fallback' are confusing with an 'anon_' prefix, since the shmem can swap out non-anonymous pages. So drop the 'anon_' prefix to keep consistent with the old swap counter names. This is needed in 6.10-rcX to avoid having an inconsistent ABI out in the field. 
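After the rename, the per-order counters are expected to be exposed under the existing mTHP stats directory as, for example (illustrative; <size> depends on the enabled orders):

    $ cat /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/swpout
    $ cat /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/swpout_fallback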
Link: https://lkml.kernel.org/r/7a8989c13299920d7589007a30065c3e2c19f0e0.1716431702.git.baolin.wang@linux.alibaba.com Fixes: d0f048ac39f6 ("mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters") Fixes: 42248b9d34ea ("mm: add docs for per-order mTHP counters and transhuge_page ABI") Signed-off-by: Baolin Wang Suggested-by: "Huang, Ying" Acked-by: Barry Song Cc: David Hildenbrand Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 4 ++-- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 8 ++++---- mm/page_io.c | 2 +- mm/vmscan.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index f82300b9193f..9ca1f3a7f5bb 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -466,11 +466,11 @@ anon_fault_fallback_charge instead falls back to using huge pages with lower orders or small pages even though the allocation was successful. -anon_swpout +swpout is incremented every time a huge page is swapped out in one piece without splitting. -anon_swpout_fallback +swpout_fallback is incremented if a huge page has to be split before swapout. Usually because failed to allocate some continuous swap space for the huge page. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c56d42e28fd9..e54b874524b0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -276,8 +276,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, - MTHP_STAT_ANON_SWPOUT, - MTHP_STAT_ANON_SWPOUT_FALLBACK, + MTHP_STAT_SWPOUT, + MTHP_STAT_SWPOUT_FALLBACK, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e1b8943a3fc8..a62983d7f6a5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -639,15 +639,15 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); -DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT); -DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); +DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, - &anon_swpout_attr.attr, - &anon_swpout_fallback_attr.attr, + &swpout_attr.attr, + &swpout_fallback_attr.attr, NULL, }; diff --git a/mm/page_io.c b/mm/page_io.c index 112571c2b1ca..53df7c517354 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -289,7 +289,7 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } - count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); #endif count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio)); count_vm_events(PSWPOUT, folio_nr_pages(folio)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9f9665c2f052..48fe4a6ef843 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1353,7 +1353,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, 
THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } - count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); #endif if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) goto activate_locked_split; -- Gitee From cbed22d525629fc83adefdad70602b7abc1acd75 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Wed, 2 Oct 2024 15:58:22 -0700 Subject: [PATCH 286/484] mm: swap: make some count_mthp_stat() call-sites be THP-agnostic. commit e26060d1fbd31a8583e2e79addc772249c1e22b4 upstream Conflicts: minor, we removed swap slot cache Backport-reason: Swap Allocator: better statistic In commit 246d3aa3e531 ("mm: cleanup count_mthp_stat() definition"), Ryan Roberts has pointed out the merits of mm code that does not require THP, to be compile-able without requiring THP ifdefs. As a step in that direction, he has moved count_mthp_stat() to be always defined, resolving to a no-op if THP is not defined. Barry Song referred me to Ryan's commit when I was working on the "mm: zswap swap-out of large folios" patch-series [1]. This patch propagates the benefits of the above change to page_io.c and vmscan.c. As a result, there is one less reason to have the ifdef THP in these code sections. [1]: https://patchwork.kernel.org/project/linux-mm/list/?series=894347 Link: https://lkml.kernel.org/r/20241002225822.9006-1-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Ryan Roberts Cc: Wajdi Feghali Cc: Yosry Ahmed Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_io.c | 2 +- mm/vmscan.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 53df7c517354..b94bbb686226 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -289,8 +289,8 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } - count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); #endif + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio)); count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 48fe4a6ef843..398929912c43 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1353,8 +1353,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } - count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); #endif + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) goto activate_locked_split; } -- Gitee From 8a9d63695d3d9466ac73233602458c41a145452d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sat, 12 Oct 2024 01:19:50 +0800 Subject: [PATCH 287/484] mm/zswap: avoid touching XArray for unnecessary invalidation commit 773ee2cda50c46e582a8ee2f8f00a5c8ac2923a7 upstream Conflicts: none Backport-reason: ZSWAP mass update zswap_invalidation simply calls xa_erase, which acquires the Xarray lock first, then does a look up. This has a higher overhead even if zswap is not used or the tree is empty. So instead, do a very lightweight xa_empty check first, if there is nothing to erase, don't touch the lock or the tree. Using xa_empty rather than zswap_never_enabled is more helpful as it cover both case where zswap wes never used or the particular range doesn't have any zswap entry. 
And it's safe as the swap slot should be currently pinned by caller with HAS_CACHE. Sequential SWAP in/out tests with zswap disabled showed a minor performance gain, SWAP in of zero page with zswap enabled also showed a performance gain. (swapout is basically unchanged so only test one case): Swapout of 2G zero page using brd as SWAP, zswap disabled (total time, 4 testrun, +0.1%): Before: 1705013 us 1703119 us 1704335 us 1705848 us. After: 1703579 us 1710640 us 1703625 us 1708699 us. Swapin of 2G zero page using brd as SWAP, zswap disabled (total time, 4 testrun, -3.5%): Before: 1912312 us 1915692 us 1905837 us 1912706 us. After: 1845354 us 1849691 us 1845868 us 1841828 us. Swapin of 2G zero page using brd as SWAP, zswap enabled (total time, 4 testrun, -3.3%): Before: 1897994 us 1894681 us 1899982 us 1898333 us After: 1835894 us 1834113 us 1832047 us 1833125 us Swapin of 2G random page using brd as SWAP, zswap enabled (total time, 4 testrun, -0.1%): Before: 4519747 us 4431078 us 4430185 us 4439999 us After: 4492176 us 4437796 us 4434612 us 4434289 us And the performance is very slightly better or unchanged for build kernel test with zswap enabled or disabled. Build Linux Kernel with defconfig and -j32 in 1G memory cgroup, using brd SWAP, zswap disabled (sys time in seconds, 6 testrun, -0.1%): Before: 1648.83 1653.52 1666.34 1665.95 1663.06 1656.67 After: 1651.36 1661.89 1645.70 1657.45 1662.07 1652.83 Build Linux Kernel with defconfig and -j32 in 2G memory cgroup, using brd SWAP zswap enabled (sys time in seconds, 6 testrun, -0.3%): Before: 1240.25 1254.06 1246.77 1265.92 1244.23 1227.74 After: 1226.41 1218.21 1249.12 1249.13 1244.39 1233.01 Link: https://lkml.kernel.org/r/20241011171950.62684-1-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Yosry Ahmed Cc: Barry Song Cc: Chengming Zhou Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index e1501bcc1378..624442752cc6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1589,6 +1589,9 @@ void zswap_invalidate(swp_entry_t swp) struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; + if (xa_empty(tree)) + return; + entry = xa_erase(tree, offset); if (entry) zswap_entry_free(entry); -- Gitee From 5efe4c88a8ab3501c38fce391d75a522edd97015 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 2 May 2024 19:50:24 +0100 Subject: [PATCH 288/484] cgroup: Add documentation for missing zswap memory.stat commit db5b4f3253ff73bc2e926ec76e1c0f662b38d9a4 upstream Conflicts: none Backport-reason: ZSWAP mass update This includes zswpin, zswpout and zswpwb. Signed-off-by: Usama Arif Acked-by: Nhat Pham Acked-by: Johannes Weiner Signed-off-by: Jonathan Corbet Message-ID: <20240502185307.3942173-2-usamaarif642@gmail.com> Signed-off-by: Kairui Song --- Documentation/admin-guide/cgroup-v2.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index a5beae80d3ca..a0cd05e81124 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1622,6 +1622,15 @@ PAGE_SIZE multiple when read back. pglazyfreed (npn) Amount of reclaimed lazyfree pages + zswpin + Number of pages moved in to memory from zswap. + + zswpout + Number of pages moved out of memory to zswap. + + zswpwb + Number of pages written from zswap to swap. 
+ thp_fault_alloc (npn) Number of transparent hugepages which were allocated to satisfy a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE -- Gitee From 2a81ac87fd527d797d9cfdd19468aebab18773b7 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 7 Nov 2024 14:12:46 +1300 Subject: [PATCH 289/484] mm: count zeromap read and set for swapout and swapin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e7ac4daeed91a25382091e73818ea0cddb1afd5e upstream Conflicts: trivial, we still have CONFIG KMEM Backport-reason: Swap Allocator: better statistic When the proportion of folios from the zeromap is small, missing their accounting may not significantly impact profiling. However, it's easy to construct a scenario where this becomes an issue—for example, allocating 1 GB of memory, writing zeros from userspace, followed by MADV_PAGEOUT, and then swapping it back in. In this case, the swap-out and swap-in counts seem to vanish into a black hole, potentially causing semantic ambiguity. On the other hand, Usama reported that zero-filled pages can exceed 10% in workloads utilizing zswap, while Hailong noted that some app in Android have more than 6% zero-filled pages. Before commit 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap"), both zswap and zRAM implemented similar optimizations, leading to these optimized-out pages being counted in either zswap or zRAM counters (with pswpin/pswpout also increasing for zRAM). With zeromap functioning prior to both zswap and zRAM, userspace will no longer detect these swap-out and swap-in actions. We have three ways to address this: 1. Introduce a dedicated counter specifically for the zeromap. 2. Use pswpin/pswpout accounting, treating the zero map as a standard backend. This approach aligns with zRAM's current handling of same-page fills at the device level. However, it would mean losing the optimized-out page counters previously available in zRAM and would not align with systems using zswap. Additionally, as noted by Nhat Pham, pswpin/pswpout counters apply only to I/O done directly to the backend device. 3. Count zeromap pages under zswap, aligning with system behavior when zswap is enabled. However, this would not be consistent with zRAM, nor would it align with systems lacking both zswap and zRAM. Given the complications with options 2 and 3, this patch selects option 1. We can find these counters from /proc/vmstat (counters for the whole system) and memcg's memory.stat (counters for the interested memcg). For example: $ grep -E 'swpin_zero|swpout_zero' /proc/vmstat swpin_zero 1648 swpout_zero 33536 $ grep -E 'swpin_zero|swpout_zero' /sys/fs/cgroup/system.slice/memory.stat swpin_zero 3905 swpout_zero 3985 This patch does not address any specific zeromap bug, but the missing swpout and swpin counts for zero-filled pages can be highly confusing and may mislead user-space agents that rely on changes in these counters as indicators. Therefore, we add a Fixes tag to encourage the inclusion of this counter in any kernel versions with zeromap. Many thanks to Kanchana for the contribution of changing count_objcg_event() to count_objcg_events() to support large folios[1], which has now been incorporated into this patch. 
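As a sketch of the helper rename mentioned above (the new signature appears in the diff below), call sites simply gain an explicit count, which large folios can set to more than one (illustrative call site):

    /* old */
    count_objcg_event(entry->objcg, ZSWPIN);
    /* new: same event, with an explicit count (1 here, nr_pages for a large folio) */
    count_objcg_events(entry->objcg, ZSWPIN, 1);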
[1] https://lkml.kernel.org/r/20241001053222.6944-5-kanchana.p.sridhar@intel.com Link: https://lkml.kernel.org/r/20241107011246.59137-1-21cnbao@gmail.com Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap") Co-developed-by: Kanchana P Sridhar Signed-off-by: Barry Song Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Usama Arif Cc: Yosry Ahmed Cc: Hailong Liu Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Cc: Andi Kleen Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Kairui Song Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/cgroup-v2.rst | 9 +++++++++ include/linux/memcontrol.h | 12 +++++++----- include/linux/vm_event_item.h | 2 ++ mm/memcontrol.c | 4 ++++ mm/page_io.c | 16 ++++++++++++++++ mm/vmstat.c | 2 ++ mm/zswap.c | 6 +++--- 7 files changed, 43 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index a0cd05e81124..c9df14a23707 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1622,6 +1622,15 @@ PAGE_SIZE multiple when read back. pglazyfreed (npn) Amount of reclaimed lazyfree pages + swpin_zero + Number of pages swapped into memory and filled with zero, where I/O + was optimized out because the page content was detected to be zero + during swapout. + + swpout_zero + Number of zero-filled pages swapped out with I/O skipped due to the + content being detected as zero. + zswpin Number of pages moved in to memory from zswap. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9e0316a327ba..7d8a8e2e6def 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -2068,8 +2068,9 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_obj(void *p); struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); -static inline void count_objcg_event(struct obj_cgroup *objcg, - enum vm_event_item idx) +static inline void count_objcg_events(struct obj_cgroup *objcg, + enum vm_event_item idx, + unsigned long count) { struct mem_cgroup *memcg; @@ -2078,7 +2079,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); - count_memcg_events(memcg, idx, 1); + count_memcg_events(memcg, idx, count); rcu_read_unlock(); } @@ -2138,8 +2139,9 @@ static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) return NULL; } -static inline void count_objcg_event(struct obj_cgroup *objcg, - enum vm_event_item idx) +static inline void count_objcg_events(struct obj_cgroup *objcg, + enum vm_event_item idx, + unsigned long count) { } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index b61796a35d2b..b33c73e24b41 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -135,6 +135,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, + SWPIN_ZERO, + SWPOUT_ZERO, #ifdef CONFIG_KSM KSM_SWPIN_COPY, #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 694d8ceb80d9..73bc3ca42fa1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -627,6 +627,10 @@ static const unsigned int memcg_vm_event_stat[] = { PGDEACTIVATE, PGLAZYFREE, PGLAZYFREED, +#ifdef CONFIG_SWAP + SWPIN_ZERO, + SWPOUT_ZERO, +#endif #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) ZSWPIN, ZSWPOUT, diff --git a/mm/page_io.c b/mm/page_io.c index 
b94bbb686226..7c1d5f182626 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -203,7 +203,9 @@ static bool is_folio_zero_filled(struct folio *folio) static void swap_zeromap_folio_set(struct folio *folio) { + struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); struct swap_info_struct *sis = swp_swap_info(folio->swap); + int nr_pages = folio_nr_pages(folio); swp_entry_t entry; unsigned int i; @@ -211,6 +213,12 @@ static void swap_zeromap_folio_set(struct folio *folio) entry = page_swap_entry(folio_page(folio, i)); set_bit(swp_offset(entry), sis->zeromap); } + + count_vm_events(SWPOUT_ZERO, nr_pages); + if (objcg) { + count_objcg_events(objcg, SWPOUT_ZERO, nr_pages); + obj_cgroup_put(objcg); + } } static void swap_zeromap_folio_clear(struct folio *folio) @@ -515,6 +523,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) static bool swap_read_folio_zeromap(struct folio *folio) { int nr_pages = folio_nr_pages(folio); + struct obj_cgroup *objcg; bool is_zeromap; /* @@ -529,6 +538,13 @@ static bool swap_read_folio_zeromap(struct folio *folio) if (!is_zeromap) return false; + objcg = get_obj_cgroup_from_folio(folio); + count_vm_events(SWPIN_ZERO, nr_pages); + if (objcg) { + count_objcg_events(objcg, SWPIN_ZERO, nr_pages); + obj_cgroup_put(objcg); + } + folio_zero_range(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); return true; diff --git a/mm/vmstat.c b/mm/vmstat.c index 2c736afc0325..dac920df1955 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1391,6 +1391,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", + "swpin_zero", + "swpout_zero", #ifdef CONFIG_KSM "ksm_swpin_copy", #endif diff --git a/mm/zswap.c b/mm/zswap.c index 624442752cc6..478d8aa56881 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1048,7 +1048,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, count_vm_event(ZSWPWB); if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); + count_objcg_events(entry->objcg, ZSWPWB, 1); zswap_entry_free(entry); @@ -1478,7 +1478,7 @@ bool zswap_store(struct folio *folio) if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); - count_objcg_event(objcg, ZSWPOUT); + count_objcg_events(objcg, ZSWPOUT, 1); } /* @@ -1572,7 +1572,7 @@ bool zswap_load(struct folio *folio) count_vm_event(ZSWPIN); if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPIN); + count_objcg_events(entry->objcg, ZSWPIN, 1); if (swapcache) { zswap_entry_free(entry); -- Gitee From 01b748d66cb56c2915e3fcff685634250a41fd77 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:17 -0700 Subject: [PATCH 290/484] mm: zswap: modify zswap_compress() to accept a page instead of a folio commit 3d0f560a367ee2bc9ec369f5e844d8116d850f1c upstream Conflicts: none Backport-reason: ZSWAP mass update For zswap_store() to be able to store a large folio by compressing it one page at a time, zswap_compress() needs to accept a page as input. This will allow us to iterate through each page in the folio in zswap_store(), compress it and store it in the zpool. 
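To make the intent concrete, here is a minimal sketch (kernel context, not self-contained code) of the call pattern this signature change enables; the real loop is added later in this series by the "support large folios in zswap_store()" patch.

/* Sketch: compress a large folio one page at a time, which is only
 * possible once zswap_compress() accepts a page rather than a folio. */
long nr_pages = folio_nr_pages(folio);
long i;

for (i = 0; i < nr_pages; i++) {
        struct page *page = folio_page(folio, i);

        if (!zswap_compress(page, entry))
                goto store_failed;      /* reject the whole folio */
}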
Link: https://lkml.kernel.org/r/20241001053222.6944-3-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 478d8aa56881..1890a538eb5e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -873,7 +873,7 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +static bool zswap_compress(struct page *page, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; @@ -891,7 +891,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) dst = acomp_ctx->buffer; sg_init_table(&input, 1); - sg_set_folio(&input, folio, PAGE_SIZE, 0); + sg_set_page(&input, page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, @@ -1452,7 +1452,7 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - if (!zswap_compress(folio, entry)) + if (!zswap_compress(&folio->page, entry)) goto put_pool; entry->swpentry = swp; -- Gitee From f20ffacce50c295ea489c98d8e5f54b1eabc27c1 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:18 -0700 Subject: [PATCH 291/484] mm: zswap: rename zswap_pool_get() to zswap_pool_tryget() commit 0201c054c2a38c53e8949700468ae91623f8cea9 upstream Conflicts: none Backport-reason: ZSWAP mass update Modify the name of the existing zswap_pool_get() to zswap_pool_tryget() to be representative of the call it makes to percpu_ref_tryget(). A subsequent patch will introduce a new zswap_pool_get() that calls percpu_ref_get(). The intent behind this change is for higher level zswap API such as zswap_store() to call zswap_pool_tryget() to check upfront if the pool's refcount is "0" (which means it could be getting destroyed) and to handle this as an error condition. zswap_store() would proceed only if zswap_pool_tryget() returns success, and any additional pool refcounts that need to be obtained for compressing sub-pages in a large folio could simply call zswap_pool_get(). 
Link: https://lkml.kernel.org/r/20241001053222.6944-4-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 1890a538eb5e..6842b9858a53 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -401,7 +401,7 @@ static void __zswap_pool_empty(struct percpu_ref *ref) spin_unlock_bh(&zswap_pools_lock); } -static int __must_check zswap_pool_get(struct zswap_pool *pool) +static int __must_check zswap_pool_tryget(struct zswap_pool *pool) { if (!pool) return 0; @@ -439,7 +439,7 @@ static struct zswap_pool *zswap_pool_current_get(void) rcu_read_lock(); pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) + if (!zswap_pool_tryget(pool)) pool = NULL; rcu_read_unlock(); @@ -460,7 +460,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) if (strcmp(zpool_get_type(pool->zpool), type)) continue; /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) + if (!zswap_pool_tryget(pool)) continue; return pool; } -- Gitee From 54ebd2cb66a50ba21ee6f29e88491cd03324a8a5 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:20 -0700 Subject: [PATCH 292/484] mm: zswap: modify zswap_stored_pages to be atomic_long_t commit 6e1fa555ec772046ec3b903f507ff7fed5323796 upstream Conflicts: none Backport-reason: ZSWAP mass update For zswap_store() to support large folios, we need to be able to do a batch update of zswap_stored_pages upon successful store of all pages in the folio. For this, we need to add folio_nr_pages(), which returns a long, to zswap_stored_pages. 
Link: https://lkml.kernel.org/r/20241001053222.6944-6-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Acked-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/proc/meminfo.c | 2 +- include/linux/zswap.h | 2 +- mm/zswap.c | 19 +++++++++++++------ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7bad4ce3a1d0..d463e013dbd1 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -93,7 +93,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_ZSWAP show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", - (unsigned long)atomic_read(&zswap_stored_pages) << + (unsigned long)atomic_long_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); #endif show_val_kb(m, "Dirty: ", diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 3c308cc845d6..e9db943ad25a 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -7,7 +7,7 @@ struct lruvec; -extern atomic_t zswap_stored_pages; +extern atomic_long_t zswap_stored_pages; #ifdef CONFIG_ZSWAP diff --git a/mm/zswap.c b/mm/zswap.c index 6842b9858a53..d41f3b9e433e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,7 +43,7 @@ * statistics **********************************/ /* The number of compressed pages currently stored in zswap */ -atomic_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_long_t zswap_stored_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -801,7 +801,7 @@ static void zswap_entry_free(struct zswap_entry *entry) obj_cgroup_put(entry->objcg); } zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); + atomic_long_dec(&zswap_stored_pages); } /********************************* @@ -1228,7 +1228,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); } else { nr_backing = zswap_total_pages(); - nr_stored = atomic_read(&zswap_stored_pages); + nr_stored = atomic_long_read(&zswap_stored_pages); } if (!nr_stored) @@ -1497,7 +1497,7 @@ bool zswap_store(struct folio *folio) } /* update stats */ - atomic_inc(&zswap_stored_pages); + atomic_long_inc(&zswap_stored_pages); count_vm_event(ZSWPOUT); return true; @@ -1649,6 +1649,13 @@ static int debugfs_get_total_size(void *data, u64 *val) } DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n"); +static int debugfs_get_stored_pages(void *data, u64 *val) +{ + *val = atomic_long_read(&zswap_stored_pages); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1672,8 +1679,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_file("pool_total_size", 0444, zswap_debugfs_root, NULL, &total_size_fops); - debugfs_create_atomic_t("stored_pages", 0444, - zswap_debugfs_root, &zswap_stored_pages); + debugfs_create_file("stored_pages", 0444, + zswap_debugfs_root, NULL, &stored_pages_fops); return 0; } -- Gitee From 9263c36db3e811bac5142bed0484997b8c195be2 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:21 -0700 Subject: [PATCH 293/484] mm: zswap: support large folios in zswap_store() 
commit b7c0ccdfbafdec98699ddb6f164beebf16f0bc45 upstream Conflicts: none Backport-reason: ZSWAP mass update This series enables zswap_store() to accept and store large folios. The most significant contribution in this series is from the earlier RFC submitted by Ryan Roberts [1]. Ryan's original RFC has been migrated to mm-unstable as of 9-30-2024 in patch 6 of this series, and adapted based on code review comments received for the current patch-series. [1]: [RFC PATCH v1] mm: zswap: Store large folios without splitting https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u The first few patches do the prep work for supporting large folios in zswap_store. Patch 6 provides the main functionality to swap-out large folios in zswap. Patch 7 adds sysfs per-order hugepages "zswpout" counters that get incremented upon successful zswap_store of large folios, and also updates the documentation for this: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout This series is a pre-requisite for zswap compress batching of large folio swap-out and decompress batching of swap-ins based on swapin_readahead(), using Intel IAA hardware acceleration, which we would like to submit in subsequent patch-series, with performance improvement data. Thanks to Ying Huang for pre-posting review feedback and suggestions! Thanks also to Nhat, Yosry, Johannes, Barry, Chengming, Usama, Ying and Matthew for their helpful feedback, code/data reviews and suggestions! I would like to thank Ryan Roberts for his original RFC [1]. System setup for testing: ========================= Testing of this series was done with mm-unstable as of 9-27-2024, commit de2fbaa6d9c3576ec7133ed02a370ec9376bf000 (without this patch-series) and mm-unstable 9-30-2024 commit c121617e3606be6575cdacfdb63cc8d67b46a568 (with this patch-series). Data was gathered on an Intel Sapphire Rapids server, dual-socket 56 cores per socket, 4 IAA devices per socket, 503 GiB RAM and 525G SSD disk partition swap. Core frequency was fixed at 2500MHz. The vm-scalability "usemem" test was run in a cgroup whose memory.high was fixed at 150G. The is no swap limit set for the cgroup. 30 usemem processes were run, each allocating and writing 10G of memory, and sleeping for 10 sec before exiting: usemem --init-time -w -O -s 10 -n 30 10g Other kernel configuration parameters: zswap compressors : zstd, deflate-iaa zswap allocator : zsmalloc vm.page-cluster : 2 In the experiments where "deflate-iaa" is used as the zswap compressor, IAA "compression verification" is enabled by default (cat /sys/bus/dsa/drivers/crypto/verify_compress). Hence each IAA compression will be decompressed internally by the "iaa_crypto" driver, the crc-s returned by the hardware will be compared and errors reported in case of mismatches. Thus "deflate-iaa" helps ensure better data integrity as compared to the software compressors, and the experimental data listed below is with verify_compress set to "1". Metrics reporting methodology: ============================== Total and average throughput are derived from the individual 30 processes' throughputs reported by usemem. elapsed/sys times are measured with perf. All percentage changes are "new" vs. "old"; hence a positive value denotes an increase in the metric, whether it is throughput or latency, and a negative value denotes a reduction in the metric. Positive throughput change percentages and negative latency change percentages denote improvements. 
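In other words, every percentage in the tables below is a plain new-versus-old relative change, as in this trivial helper:

/* Reporting convention used in the tables: positive = metric went up,
 * negative = metric went down, always "new" relative to "old". */
static double pct_change(double old_val, double new_val)
{
        return (new_val - old_val) / old_val * 100.0;
}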
The vm stats and sysfs hugepages stats included with the performance data provide details on the swapout activity to zswap/swap device. Testing labels used in data summaries: ====================================== The data refers to these test configurations and the before/after comparisons that they do: before-case1: ------------- mm-unstable 9-27-2024, CONFIG_THP_SWAP=N (compares zswap 4K vs. zswap 64K) In this scenario, CONFIG_THP_SWAP=N results in 64K/2M folios to be split into 4K folios that get processed by zswap. before-case2: ------------- mm-unstable 9-27-2024, CONFIG_THP_SWAP=Y (compares SSD swap large folios vs. zswap large folios) In this scenario, CONFIG_THP_SWAP=Y results in zswap rejecting large folios, which will then be stored by the SSD swap device. after: ------ v10 of this patch-series, CONFIG_THP_SWAP=Y The "after" is CONFIG_THP_SWAP=Y and v10 of this patch-series, that results in 64K/2M folios to not be split, and to be processed by zswap_store. Regression Testing: =================== I ran vm-scalability usemem without large folios, i.e., only 4K folios with mm-unstable and this patch-series. The main goal was to make sure that there is no functional or performance regression wrt the earlier zswap behavior for 4K folios, now that 4K folios will be processed by the new zswap_store() code. The data indicates there is no significant regression. ------------------------------------------------------------------------------- 4K folios: ========== zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 4,793,363 4,880,978 4,853,074 1% -1% Average throughput (KB/s) 159,778 162,699 161,769 1% -1% elapsed time (sec) 130.14 123.17 126.29 -3% 3% sys time (sec) 3,135.53 2,985.64 3,083.18 -2% 3% memcg_high 446,826 444,626 452,930 memcg_swap_fail 0 0 0 zswpout 48,932,107 48,931,971 48,931,820 zswpin 383 386 397 pswpout 0 0 0 pswpin 0 0 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 0 0 0 pgmajfault 3,063 3,077 3,479 swap_ra 93 94 96 swap_ra_hit 47 47 50 ZSWPOUT-64kB n/a n/a 0 SWPOUT-64kB 0 0 0 ------------------------------------------------------------------------------- Performance Testing: ==================== We list the data for 64K folios with before/after data per-compressor, followed by the same for 2M pmd-mappable folios. ------------------------------------------------------------------------------- 64K folios: zstd: ================= zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. 
case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,222,213 1,076,611 6,159,776 18% 472% Average throughput (KB/s) 174,073 35,887 205,325 18% 472% elapsed time (sec) 120.50 347.16 108.33 -10% -69% sys time (sec) 2,930.33 248.16 2,549.65 -13% 927% memcg_high 416,773 552,200 465,874 memcg_swap_fail 3,192,906 1,293 1,012 zswpout 48,931,583 20,903 48,931,218 zswpin 384 363 410 pswpout 0 40,778,448 0 pswpin 0 16 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 3,192,906 1,293 1,012 pgmajfault 3,452 3,072 3,061 swap_ra 90 87 107 swap_ra_hit 42 43 57 ZSWPOUT-64kB n/a n/a 3,057,173 SWPOUT-64kB 0 2,548,653 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 64K folios: deflate-iaa: ======================== zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,652,608 1,089,180 7,189,778 27% 560% Average throughput (KB/s) 188,420 36,306 239,659 27% 560% elapsed time (sec) 102.90 343.35 87.05 -15% -75% sys time (sec) 2,246.86 213.53 1,864.16 -17% 773% memcg_high 576,104 502,907 642,083 memcg_swap_fail 4,016,117 1,407 1,478 zswpout 61,163,423 22,444 57,798,716 zswpin 401 368 454 pswpout 0 40,862,080 0 pswpin 0 20 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 4,016,117 1,407 1,478 pgmajfault 3,063 3,153 3,122 swap_ra 96 93 156 swap_ra_hit 46 45 83 ZSWPOUT-64kB n/a n/a 3,611,032 SWPOUT-64kB 0 2,553,880 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 2M folios: zstd: ================ zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,895,500 1,109,694 6,484,224 10% 484% Average throughput (KB/s) 196,516 36,989 216,140 10% 484% elapsed time (sec) 108.77 334.28 106.33 -2% -68% sys time (sec) 2,657.14 94.88 2,376.13 -11% 2404% memcg_high 64,200 66,316 56,898 memcg_swap_fail 101,182 70 27 zswpout 48,931,499 36,507 48,890,640 zswpin 380 379 377 pswpout 0 40,166,400 0 pswpin 0 0 0 thp_swpout 0 78,450 0 thp_swpout_fallback 101,182 70 27 2MB-mthp_swpout_fallback 0 0 27 pgmajfault 3,067 3,417 3,311 swap_ra 91 90 854 swap_ra_hit 45 45 810 ZSWPOUT-2MB n/a n/a 95,459 SWPOUT-2MB 0 78,450 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 2M folios: deflate-iaa: ======================= zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10 before-case1 before-case2 after vs. vs. 
case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 6,286,587 1,126,785 7,073,464 13% 528% Average throughput (KB/s) 209,552 37,559 235,782 13% 528% elapsed time (sec) 96.19 333.03 85.79 -11% -74% sys time (sec) 2,141.44 99.96 1,826.67 -15% 1727% memcg_high 99,253 64,666 79,718 memcg_swap_fail 129,074 53 165 zswpout 61,312,794 28,321 56,045,120 zswpin 383 406 403 pswpout 0 40,048,128 0 pswpin 0 0 0 thp_swpout 0 78,219 0 thp_swpout_fallback 129,074 53 165 2MB-mthp_swpout_fallback 0 0 165 pgmajfault 3,430 3,077 31,468 swap_ra 91 103 84,373 swap_ra_hit 47 46 84,317 ZSWPOUT-2MB n/a n/a 109,229 SWPOUT-2MB 0 78,219 0 ------------------------------------------------------------------------------- And finally, this is a comparison of deflate-iaa vs. zstd with v10 of this patch-series: --------------------------------------------- zswap_store large folios v10 Impr w/ deflate-iaa vs. zstd 64K folios 2M folios --------------------------------------------- Throughput (KB/s) 17% 9% elapsed time (sec) -20% -19% sys time (sec) -27% -23% --------------------------------------------- Conclusions based on the performance results: ============================================= v10 wrt before-case1: --------------------- We see significant improvements in throughput, elapsed and sys time for zstd and deflate-iaa, when comparing before-case1 (THP_SWAP=N) vs. after (THP_SWAP=Y) with zswap_store large folios. v10 wrt before-case2: --------------------- We see even more significant improvements in throughput and elapsed time for zstd and deflate-iaa, when comparing before-case2 (large-folio-SSD) vs. after (large-folio-zswap). The sys time increases with large-folio-zswap as expected, due to the CPU compression time vs. asynchronous disk write times, as pointed out by Ying and Yosry. In before-case2, when zswap does not store large folios, only allocations and cgroup charging due to 4K folio zswap stores count towards the cgroup memory limit. However, in the after scenario, with the introduction of zswap_store() of large folios, there is an added component of the zswap compressed pool usage from large folio stores from potentially all 30 processes, that gets counted towards the memory limit. As a result, we see higher swapout activity in the "after" data. Summary: ======== The v10 data presented above shows that zswap_store of large folios demonstrates good throughput/performance improvements compared to conventional SSD swap of large folios with a sufficiently large 525G SSD swap device. Hence, it seems reasonable for zswap_store to support large folios, so that further performance improvements can be implemented. In the experimental setup used in this patchset, we have enabled IAA compress verification to ensure additional hardware data integrity CRC checks not currently done by the software compressors. We see good throughput/latency improvements with deflate-iaa vs. zstd with zswap_store of large folios. Some of the ideas for further reducing latency that have shown promise in our experiments, are: 1) IAA compress/decompress batching. 2) Distributing compress jobs across all IAA devices on the socket. The tests run for this patchset are using only 1 IAA device per core, that avails of 2 compress engines on the device. In our experiments with IAA batching, we distribute compress jobs from all cores to the 8 compress engines available per socket. We further compress the pages in each folio in parallel in the accelerator. 
As a result, we improve compress latency and reclaim throughput. In decompress batching, we use swapin_readahead to generate a prefetch batch of 4K folios that we decompress in parallel in IAA. ------------------------------------------------------------------------------ IAA compress/decompress batching Further improvements wrt v10 zswap_store Sequential subpage store using "deflate-iaa": "deflate-iaa" Batching "deflate-iaa-canned" [2] Batching Additional Impr Additional Impr 64K folios 2M folios 64K folios 2M folios ------------------------------------------------------------------------------ Throughput (KB/s) 19% 43% 26% 55% elapsed time (sec) -5% -14% -10% -21% sys time (sec) 4% -7% -4% -18% ------------------------------------------------------------------------------ With zswap IAA compress/decompress batching, we are able to demonstrate significant performance improvements and memory savings in server scalability experiments in highly contended system scenarios under significant memory pressure; as compared to software compressors. We hope to submit this work in subsequent patch series. The current patch-series is a prequisite for these future submissions. This patch (of 7): zswap_store() will store large folios by compressing them page by page. This patch provides a sequential implementation of storing a large folio in zswap_store() by iterating through each page in the folio to compress and store it in the zswap zpool. zswap_store() calls the newly added zswap_store_page() function for each page in the folio. zswap_store_page() handles compressing and storing each page. We check the global and per-cgroup limits once at the beginning of zswap_store(), and only check that the limit is not reached yet. This is racy and inaccurate, but it should be sufficient for now. We also obtain initial references to the relevant objcg and pool to guarantee that subsequent references can be acquired by zswap_store_page(). A new function zswap_pool_get() is added to facilitate this. If these one-time checks pass, we compress the pages of the folio, while maintaining a running count of compressed bytes for all the folio's pages. If all pages are successfully compressed and stored, we do the cgroup zswap charging with the total compressed bytes, and batch update the zswap_stored_pages atomic/zswpout event stats with folio_nr_pages() once, before returning from zswap_store(). If an error is encountered during the store of any page in the folio, all pages in that folio currently stored in zswap will be invalidated. Thus, a folio is either entirely stored in zswap, or entirely not stored in zswap. The most important value provided by this patch is it enables swapping out large folios to zswap without splitting them. Furthermore, it batches some operations while doing so (cgroup charging, stats updates). This patch also forms the basis for building compress batching of pages in a large folio in zswap_store() by compressing up to say, 8 pages of the folio in parallel in hardware using the Intel In-Memory Analytics Accelerator (Intel IAA). 
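Stripped of labels and error paths, the control flow added to zswap_store() by this patch boils down to the following sketch (see the full hunk below); either every subpage of the folio ends up in zswap, or none does.

/* Sketch: store each subpage, accumulating compressed bytes; any failure
 * aborts, and the entries already inserted are erased further down. */
for (index = 0; index < nr_pages; ++index) {
        ssize_t bytes = zswap_store_page(folio_page(folio, index),
                                         objcg, pool);

        if (bytes < 0)
                goto put_pool;          /* folio stays entirely out of zswap */
        compressed_bytes += bytes;
}

/* One batched charge and one batched stats update for the whole folio. */
if (objcg)
        obj_cgroup_charge_zswap(objcg, compressed_bytes);
atomic_long_add(nr_pages, &zswap_stored_pages);
count_vm_events(ZSWPOUT, nr_pages);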
This change reuses and adapts the functionality in Ryan Roberts' RFC patch [1]: "[RFC,v1] mm: zswap: Store large folios without splitting" [1] https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u Link: https://lkml.kernel.org/r/20241001053222.6944-1-kanchana.p.sridhar@intel.com Link: https://lkml.kernel.org/r/20241001053222.6944-7-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Originally-by: Ryan Roberts Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 189 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 121 insertions(+), 68 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index d41f3b9e433e..ecf4d67d1659 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -409,6 +409,12 @@ static int __must_check zswap_pool_tryget(struct zswap_pool *pool) return percpu_ref_tryget(&pool->ref); } +/* The caller must already have a reference. */ +static void zswap_pool_get(struct zswap_pool *pool) +{ + percpu_ref_get(&pool->ref); +} + static void zswap_pool_put(struct zswap_pool *pool) { percpu_ref_put(&pool->ref); @@ -1398,68 +1404,38 @@ static void shrink_worker(struct work_struct *w) /********************************* * main API **********************************/ -bool zswap_store(struct folio *folio) + +static ssize_t zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { - swp_entry_t swp = folio->swap; - pgoff_t offset = swp_offset(swp); - struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *old; - struct obj_cgroup *objcg = NULL; - struct mem_cgroup *memcg = NULL; - - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); - - /* Large folios aren't supported */ - if (folio_test_large(folio)) - return false; - - if (!zswap_enabled) - goto check_old; - - /* Check cgroup limits */ - objcg = get_obj_cgroup_from_folio(folio); - if (objcg && !obj_cgroup_may_zswap(objcg)) { - memcg = get_mem_cgroup_from_objcg(objcg); - if (shrink_memcg(memcg)) { - mem_cgroup_put(memcg); - goto reject; - } - mem_cgroup_put(memcg); - } - - if (zswap_check_limits()) - goto reject; /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } - /* if entry is successfully added, it keeps the reference */ - entry->pool = zswap_pool_current_get(); - if (!entry->pool) - goto freepage; + /* zswap_store() already holds a ref on 'objcg' and 'pool' */ + if (objcg) + obj_cgroup_get(objcg); + zswap_pool_get(pool); - if (objcg) { - memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { - mem_cgroup_put(memcg); - goto put_pool; - } - mem_cgroup_put(memcg); - } + /* if entry is successfully added, it keeps the reference */ + entry->pool = pool; - if (!zswap_compress(&folio->page, entry)) - goto put_pool; + if (!zswap_compress(page, entry)) + goto put_pool_objcg; - entry->swpentry = swp; + entry->swpentry = page_swap_entry(page); entry->objcg = objcg; entry->referenced = true; - old = xa_store(tree, offset, entry, GFP_KERNEL); + old = xa_store(swap_zswap_tree(entry->swpentry), + swp_offset(entry->swpentry), + entry, GFP_KERNEL); if 
(xa_is_err(old)) { int err = xa_err(old); @@ -1476,11 +1452,6 @@ bool zswap_store(struct folio *folio) if (old) zswap_entry_free(old); - if (objcg) { - obj_cgroup_charge_zswap(objcg, entry->length); - count_objcg_events(objcg, ZSWPOUT, 1); - } - /* * We finish initializing the entry while it's already in xarray. * This is safe because: @@ -1496,32 +1467,114 @@ bool zswap_store(struct folio *folio) zswap_lru_add(&zswap_list_lru, entry); } - /* update stats */ - atomic_long_inc(&zswap_stored_pages); - count_vm_event(ZSWPOUT); - - return true; + /* + * We shouldn't have any possibility of failure after the entry is + * added in the xarray. The pool/objcg refs obtained here will only + * be dropped if/when zswap_entry_free() gets called. + */ + return entry->length; store_failed: zpool_free(entry->pool->zpool, entry->handle); -put_pool: - zswap_pool_put(entry->pool); -freepage: +put_pool_objcg: + zswap_pool_put(pool); + obj_cgroup_put(objcg); zswap_entry_cache_free(entry); reject: + return -EINVAL; +} + +bool zswap_store(struct folio *folio) +{ + long nr_pages = folio_nr_pages(folio); + swp_entry_t swp = folio->swap; + struct obj_cgroup *objcg = NULL; + struct mem_cgroup *memcg = NULL; + struct zswap_pool *pool; + size_t compressed_bytes = 0; + bool ret = false; + long index; + + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); + + if (!zswap_enabled) + goto check_old; + + objcg = get_obj_cgroup_from_folio(folio); + if (objcg && !obj_cgroup_may_zswap(objcg)) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (shrink_memcg(memcg)) { + mem_cgroup_put(memcg); + goto put_objcg; + } + mem_cgroup_put(memcg); + } + + if (zswap_check_limits()) + goto put_objcg; + + pool = zswap_pool_current_get(); + if (!pool) + goto put_objcg; + + if (objcg) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { + mem_cgroup_put(memcg); + goto put_pool; + } + mem_cgroup_put(memcg); + } + + for (index = 0; index < nr_pages; ++index) { + struct page *page = folio_page(folio, index); + ssize_t bytes; + + bytes = zswap_store_page(page, objcg, pool); + if (bytes < 0) + goto put_pool; + compressed_bytes += bytes; + } + + if (objcg) { + obj_cgroup_charge_zswap(objcg, compressed_bytes); + count_objcg_events(objcg, ZSWPOUT, nr_pages); + } + + atomic_long_add(nr_pages, &zswap_stored_pages); + count_vm_events(ZSWPOUT, nr_pages); + + ret = true; + +put_pool: + zswap_pool_put(pool); +put_objcg: obj_cgroup_put(objcg); - if (zswap_pool_reached_full) + if (!ret && zswap_pool_reached_full) queue_work(shrink_wq, &zswap_shrink_work); check_old: /* - * If the zswap store fails or zswap is disabled, we must invalidate the - * possibly stale entry which was previously stored at this offset. - * Otherwise, writeback could overwrite the new data in the swapfile. + * If the zswap store fails or zswap is disabled, we must invalidate + * the possibly stale entries which were previously stored at the + * offsets corresponding to each page of the folio. Otherwise, + * writeback could overwrite the new data in the swapfile. 
*/ - entry = xa_erase(tree, offset); - if (entry) - zswap_entry_free(entry); - return false; + if (!ret) { + unsigned type = swp_type(swp); + pgoff_t offset = swp_offset(swp); + struct zswap_entry *entry; + struct xarray *tree; + + for (index = 0; index < nr_pages; ++index) { + tree = swap_zswap_tree(swp_entry(type, offset + index)); + entry = xa_erase(tree, offset + index); + if (entry) + zswap_entry_free(entry); + } + } + + return ret; } bool zswap_load(struct folio *folio) -- Gitee From f31d22971be13dcf8d2c53b0d8da9fc48c8465f0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:05 +0800 Subject: [PATCH 294/484] mm: memory: extend finish_fault() to support large folio commit 43e027e414232b1ce4fa6c96a582417e2c027f2d upstream Conflicts: none Backport-reason: mTHP for Shmem Patch series "add mTHP support for anonymous shmem", v5. Anonymous pages have already been supported for multi-size (mTHP) allocation through commit 19eaf44954df, that can allow THP to be configured through the sysfs interface located at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'. However, the anonymous shmem will ignore the anonymous mTHP rule configured through the sysfs interface, and can only use the PMD-mapped THP, that is not reasonable. Many implement anonymous page sharing through mmap(MAP_SHARED | MAP_ANONYMOUS), especially in database usage scenarios, therefore, users expect to apply an unified mTHP strategy for anonymous pages, also including the anonymous shared pages, in order to enjoy the benefits of mTHP. For example, lower latency than PMD-mapped THP, smaller memory bloat than PMD-mapped THP, contiguous PTEs on ARM architecture to reduce TLB miss etc. As discussed in the bi-weekly MM meeting[1], the mTHP controls should control all of shmem, not only anonymous shmem, but support will be added iteratively. Therefore, this patch set starts with support for anonymous shmem. The primary strategy is similar to supporting anonymous mTHP. Introduce a new interface '/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled', which can have almost the same values as the top-level '/sys/kernel/mm/transparent_hugepage/shmem_enabled', with adding a new additional "inherit" option and dropping the testing options 'force' and 'deny'. By default all sizes will be set to "never" except PMD size, which is set to "inherit". This ensures backward compatibility with the anonymous shmem enabled of the top level, meanwhile also allows independent control of anonymous shmem enabled for each mTHP. Use the page fault latency tool to measure the performance of 1G anonymous shmem with 32 threads on my machine environment with: ARM64 Architecture, 32 cores, 125G memory: base: mm-unstable user-time sys_time faults_per_sec_per_cpu faults_per_sec 0.04s 3.10s 83516.416 2669684.890 mm-unstable + patchset, anon shmem mTHP disabled user-time sys_time faults_per_sec_per_cpu faults_per_sec 0.02s 3.14s 82936.359 2630746.027 mm-unstable + patchset, anon shmem 64K mTHP enabled user-time sys_time faults_per_sec_per_cpu faults_per_sec 0.08s 0.31s 678630.231 17082522.495 From the data above, it is observed that the patchset has a minimal impact when mTHP is not enabled (some fluctuations observed during testing). When enabling 64K mTHP, there is a significant improvement of the page fault latency. 
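For context, the "anonymous shmem" case discussed throughout this series is the kind of mapping created in the illustrative userspace snippet below; such pages are backed by shmem internally, so they follow the shmem_enabled controls rather than the anonymous-THP 'enabled' knobs.

/* Illustration: an anonymous *shared* mapping is shmem-backed and is what
 * the new per-size shmem_enabled controls apply to. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 64 * 1024;         /* one 64K mTHP worth of memory */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0x5a, len);           /* fault the pages in */
        printf("mapped %zu bytes of anonymous shared (shmem) memory\n", len);
        munmap(p, len);
        return 0;
}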
[1] https://lore.kernel.org/all/f1783ff0-65bd-4b2b-8952-52b6822a0835@redhat.com/ This patch (of 6): Add large folio mapping establishment support for finish_fault() as a preparation, to support multi-size THP allocation of anonymous shmem pages in the following patches. Keep the same behavior (per-page fault) for non-anon shmem to avoid inflating the RSS unintentionally, and we can discuss what size of mapping to build when extending mTHP to control non-anon shmem in the future. [baolin.wang@linux.alibaba.com: avoid going beyond the PMD pagetable size] Link: https://lkml.kernel.org/r/b0e6a8b1-a32c-459e-ae67-fde5d28773e6@linux.alibaba.com [baolin.wang@linux.alibaba.com: use 'PTRS_PER_PTE' instead of 'PTRS_PER_PTE - 1'] Link: https://lkml.kernel.org/r/e1f5767a-2c9b-4e37-afe6-1de26fe54e41@linux.alibaba.com Link: https://lkml.kernel.org/r/cover.1718090413.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/3a190892355989d42f59cf9f2f98b94694b0d24d.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Zi Yan Reviewed-by: Kefeng Wang Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Barry Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 61 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0d36fe555fda..671112bb402d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4967,9 +4967,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; + struct folio *folio; vm_fault_t ret; bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); + int type, nr_pages; + unsigned long addr = vmf->address; /* Did we COW the page? */ if (is_cow) @@ -5000,24 +5003,62 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return VM_FAULT_OOM; } + folio = page_folio(page); + nr_pages = folio_nr_pages(folio); + + /* + * Using per-page fault to maintain the uffd semantics, and same + * approach also applies to non-anonymous-shmem faults to avoid + * inflating the RSS of the process. + */ + if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) { + nr_pages = 1; + } else if (nr_pages > 1) { + pgoff_t idx = folio_page_idx(folio, page); + /* The page offset of vmf->address within the VMA. */ + pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; + /* The index of the entry in the pagetable for fault page. */ + pgoff_t pte_off = pte_index(vmf->address); + + /* + * Fallback to per-page fault in case the folio size in page + * cache beyond the VMA limits and PMD pagetable limits. + */ + if (unlikely(vma_off < idx || + vma_off + (nr_pages - idx) > vma_pages(vma) || + pte_off < idx || + pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { + nr_pages = 1; + } else { + /* Now we can set mappings for the whole large folio. */ + addr = vmf->address - idx * PAGE_SIZE; + page = &folio->page; + } + } + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); + addr, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* Re-check under ptl */ - if (likely(!vmf_pte_changed(vmf))) { - struct folio *folio = page_folio(page); - int type = is_cow ? 
MM_ANONPAGES : mm_counter_file(folio); - - set_pte_range(vmf, folio, page, 1, vmf->address); - add_mm_counter(vma->vm_mm, type, 1); - ret = 0; - } else { - update_mmu_tlb(vma, vmf->address, vmf->pte); + if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) { + update_mmu_tlb(vma, addr, vmf->pte); ret = VM_FAULT_NOPAGE; + goto unlock; + } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { + update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); + ret = VM_FAULT_NOPAGE; + goto unlock; } + folio_ref_add(folio, nr_pages - 1); + set_pte_range(vmf, folio, page, nr_pages, addr); + type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); + add_mm_counter(vma->vm_mm, type, nr_pages); + ret = 0; + +unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } -- Gitee From 30015087432763f9eb25cf759c007e49df6e6e49 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:02 +0800 Subject: [PATCH 295/484] mm: add update_mmu_tlb_range() commit 23b1b44e6c61295084284aa7d87db863a7802b92 upstream Conflicts: none Backport-reason: Prepare for shmem mTHP swap support Patch series "Add update_mmu_tlb_range() to simplify code", v4. This series of commits mainly adds the update_mmu_tlb_range() to batch update tlb in an address range and implement update_mmu_tlb() using update_mmu_tlb_range(). After commit 19eaf44954df ("mm: thp: support allocation of anonymous multi-size THP"), We may need to batch update tlb of a certain address range by calling update_mmu_tlb() in a loop. Using the update_mmu_tlb_range(), we can simplify the code and possibly reduce the execution of some unnecessary code in some architectures. This patch (of 3): Add update_mmu_tlb_range(), we can batch update tlb of an address range. Link: https://lkml.kernel.org/r/20240522061204.117421-1-libang.li@antgroup.com Link: https://lkml.kernel.org/r/20240522061204.117421-2-libang.li@antgroup.com Signed-off-by: Bang Li Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Lance Yang Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- arch/loongarch/include/asm/pgtable.h | 2 ++ arch/mips/include/asm/pgtable.h | 2 ++ arch/riscv/include/asm/pgtable.h | 2 ++ arch/xtensa/include/asm/pgtable.h | 3 +++ arch/xtensa/mm/tlb.c | 6 ++++++ include/linux/pgtable.h | 7 +++++++ 6 files changed, 22 insertions(+) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 1bda6522f13a..b83584b492f1 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -455,6 +455,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache +#define update_mmu_tlb_range(vma, addr, ptep, nr) \ + update_mmu_cache_range(NULL, vma, addr, ptep, nr) static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 430b208c0130..58ada9791e5a 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -596,6 +596,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache +#define update_mmu_tlb_range(vma, address, ptep, nr) \ + update_mmu_cache_range(NULL, vma, address, ptep, nr) static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t 
*pmdp) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index cd5217a1a3f6..84582d6d30ff 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -493,6 +493,8 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache +#define update_mmu_tlb_range(vma, addr, ptep, nr) \ + update_mmu_cache_range(NULL, vma, addr, ptep, nr) static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 9a7e5e57ee9a..436158bd9030 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -413,6 +413,9 @@ typedef pte_t *pte_addr_t; void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #define __HAVE_ARCH_UPDATE_MMU_TLB +void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); +#define update_mmu_tlb_range update_mmu_tlb_range #endif /* !defined (__ASSEMBLY__) */ diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c index 4f974b74883c..b1e1f63de72b 100644 --- a/arch/xtensa/mm/tlb.c +++ b/arch/xtensa/mm/tlb.c @@ -169,6 +169,12 @@ void update_mmu_tlb(struct vm_area_struct *vma, local_flush_tlb_page(vma, address); } +void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) +{ + local_flush_tlb_range(vma, address, address + PAGE_SIZE * nr); +} + #ifdef CONFIG_DEBUG_TLB_SANITY static unsigned get_pte_for_vaddr(unsigned vaddr) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bfe4182e7940..decff72c2ff0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -710,6 +710,13 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ +#ifndef update_mmu_tlb_range +static inline void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) +{ +} +#endif + #ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) -- Gitee From 676eb1d7d9f7b037ed6a47441d4bc55e08459cc9 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:03 +0800 Subject: [PATCH 296/484] mm: implement update_mmu_tlb() using update_mmu_tlb_range() commit 8f65aa32239f1c3f11b7a25bd5921223bafc5fed upstream Conflicts: none Backport-reason: Prepare for shmem mTHP swap support Let's make update_mmu_tlb() simply a generic wrapper around update_mmu_tlb_range(). Only the latter can now be overridden by the architecture. We can now remove __HAVE_ARCH_UPDATE_MMU_TLB as well. 
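Condensed from the pgtable.h hunks of this and the previous patch, the generic side now reads roughly as follows; an architecture that needs something stronger overrides only update_mmu_tlb_range().

/* Recap sketch of the generic helpers after this change (kernel context,
 * not standalone code): */
#ifndef update_mmu_tlb_range
static inline void update_mmu_tlb_range(struct vm_area_struct *vma,
                unsigned long address, pte_t *ptep, unsigned int nr)
{
}
#endif

static inline void update_mmu_tlb(struct vm_area_struct *vma,
                unsigned long address, pte_t *ptep)
{
        update_mmu_tlb_range(vma, address, ptep, 1);
}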
Link: https://lkml.kernel.org/r/20240522061204.117421-3-libang.li@antgroup.com Signed-off-by: Bang Li Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Lance Yang Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- arch/loongarch/include/asm/pgtable.h | 2 -- arch/mips/include/asm/pgtable.h | 2 -- arch/riscv/include/asm/pgtable.h | 2 -- arch/xtensa/include/asm/pgtable.h | 3 --- arch/xtensa/mm/tlb.c | 6 ------ include/linux/pgtable.h | 4 +--- 6 files changed, 1 insertion(+), 18 deletions(-) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index b83584b492f1..4f4498ce2255 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -453,8 +453,6 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) -#define __HAVE_ARCH_UPDATE_MMU_TLB -#define update_mmu_tlb update_mmu_cache #define update_mmu_tlb_range(vma, addr, ptep, nr) \ update_mmu_cache_range(NULL, vma, addr, ptep, nr) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 58ada9791e5a..daa48f28ce5e 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -594,8 +594,6 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define update_mmu_cache(vma, address, ptep) \ update_mmu_cache_range(NULL, vma, address, ptep, 1) -#define __HAVE_ARCH_UPDATE_MMU_TLB -#define update_mmu_tlb update_mmu_cache #define update_mmu_tlb_range(vma, address, ptep, nr) \ update_mmu_cache_range(NULL, vma, address, ptep, nr) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 84582d6d30ff..fa22c1e56c20 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -491,8 +491,6 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) -#define __HAVE_ARCH_UPDATE_MMU_TLB -#define update_mmu_tlb update_mmu_cache #define update_mmu_tlb_range(vma, addr, ptep, nr) \ update_mmu_cache_range(NULL, vma, addr, ptep, nr) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 436158bd9030..1647a7cc3fbf 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -410,9 +410,6 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, typedef pte_t *pte_addr_t; -void update_mmu_tlb(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep); -#define __HAVE_ARCH_UPDATE_MMU_TLB void update_mmu_tlb_range(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int nr); #define update_mmu_tlb_range update_mmu_tlb_range diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c index b1e1f63de72b..f69feee19d59 100644 --- a/arch/xtensa/mm/tlb.c +++ b/arch/xtensa/mm/tlb.c @@ -163,12 +163,6 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end) } } -void update_mmu_tlb(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) -{ - local_flush_tlb_page(vma, address); -} - void update_mmu_tlb_range(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int nr) { diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index decff72c2ff0..26ba6169f320 100644 --- a/include/linux/pgtable.h +++ 
b/include/linux/pgtable.h @@ -717,13 +717,11 @@ static inline void update_mmu_tlb_range(struct vm_area_struct *vma, } #endif -#ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + update_mmu_tlb_range(vma, address, ptep, 1); } -#define __HAVE_ARCH_UPDATE_MMU_TLB -#endif /* * Some architectures may be able to avoid expensive synchronization -- Gitee From ca80e4e25b0f3059247abbdbdcda42aa3634c094 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:04 +0800 Subject: [PATCH 297/484] mm: use update_mmu_tlb_range() to simplify code commit 6faa49d1c4404e0b949fd92f1e891c24870d4f86 upstream Conflicts: none Backport-reason: Prepare for shmem mTHP swap support Let us simplify the code by update_mmu_tlb_range(). Link: https://lkml.kernel.org/r/20240522061204.117421-4-libang.li@antgroup.com Signed-off-by: Bang Li Reviewed-by: Lance Yang Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 671112bb402d..d4b1a8c2ec1f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4631,7 +4631,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vm_fault_t ret = 0; int nr_pages = 1; pte_t entry; - int i; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) @@ -4700,8 +4699,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) update_mmu_tlb(vma, addr, vmf->pte); goto release; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { - for (i = 0; i < nr_pages; i++) - update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i); + update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); goto release; } -- Gitee From 2ba63beb2761a09331f52848249efc8438a61ba2 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:06 +0800 Subject: [PATCH 298/484] mm: shmem: add THP validation for PMD-mapped THP related statistics commit 3d95bc21cea558c7cdb2942b4d0223a571e93f27 upstream Conflicts: none Backport-reason: mTHP for Shmem In order to extend support for mTHP, add THP validation for PMD-mapped THP related statistics to avoid statistical confusion. 
Link: https://lkml.kernel.org/r/c4b04cbd51e6951cc2436a87be8eaa4a1516faec.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index bdd16adec1cc..47d9803fe2ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1684,7 +1684,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, return ERR_PTR(-E2BIG); folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index); - if (!folio) + if (!folio && pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); } else { pages = 1; @@ -1702,7 +1702,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; - } else if (huge) { + } else if (pages == HPAGE_PMD_NR) { count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } @@ -2199,7 +2199,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, folio = shmem_alloc_and_add_folio(huge_gfp, inode, index, fault_mm, true); if (!IS_ERR(folio)) { - count_vm_event(THP_FILE_ALLOC); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_FILE_ALLOC); goto alloced; } if (PTR_ERR(folio) == -EEXIST) -- Gitee From edc122ad2d535d50cf6b6edaef9869780dbe4c3b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:07 +0800 Subject: [PATCH 299/484] mm: shmem: add multi-size THP sysfs interface for anonymous shmem commit 4b98995530b77a97912230d8e1564ba7738db19c upstream Conflicts: none Backport-reason: mTHP for Shmem To support the use of mTHP with anonymous shmem, add a new sysfs interface 'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-kB/' directory for each mTHP to control whether shmem is enabled for that mTHP, with a value similar to the top level 'shmem_enabled', which can be set to: "always", "inherit (to inherit the top level setting)", "within_size", "advise", "never". An 'inherit' option is added to ensure compatibility with these global settings, and the options 'force' and 'deny' are dropped, which are rather testing artifacts from the old ages. By default, PMD-sized hugepages have enabled="inherit" and all other hugepage sizes have enabled="never" for '/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'. In addition, if top level value is 'force', then only PMD-sized hugepages have enabled="inherit", otherwise configuration will be failed and vice versa. That means now we will avoid using non-PMD sized THP to override the global huge allocation. 
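The user-visible side of this change can be exercised with something like the illustrative snippet below; the 64kB directory is only an example, any mTHP size exposed under transparent_hugepage works the same way.

/* Illustration: read the per-size shmem_enabled knob added by this patch
 * (the currently selected value is printed in brackets by the kernel). */
#include <stdio.h>

int main(void)
{
        const char *knob =
                "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled";
        char buf[128];
        FILE *f = fopen(knob, "r");

        if (!f) {
                perror(knob);
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s: %s", knob, buf);
        fclose(f);
        return 0;
}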
[baolin.wang@linux.alibaba.com: fix transhuge.rst indentation] Link: https://lkml.kernel.org/r/b189d815-998b-4dfd-ba89-218ff51313f8@linux.alibaba.com [akpm@linux-foundation.org: reflow transhuge.rst addition to 80 cols] [baolin.wang@linux.alibaba.com: move huge_shmem_orders_lock under CONFIG_SYSFS] Link: https://lkml.kernel.org/r/eb34da66-7f12-44f3-a39e-2bcc90c33354@linux.alibaba.com [akpm@linux-foundation.org: huge_memory.c needs mm_types.h] Link: https://lkml.kernel.org/r/ffddfa8b3cb4266ff963099ab78cfd7184c57ac7.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 25 ++++++ include/linux/huge_mm.h | 10 +++ mm/huge_memory.c | 12 +-- mm/shmem.c | 96 ++++++++++++++++++++++ 4 files changed, 135 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 9ca1f3a7f5bb..99b606bc7762 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -331,6 +331,31 @@ deny force Force the huge option on for all - very useful for testing; +Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to +control mTHP allocation: +'/sys/kernel/mm/transparent_hugepage/hugepages-kB/shmem_enabled', +and its value for each mTHP is essentially consistent with the global +setting. An 'inherit' option is added to ensure compatibility with these +global settings. Conversely, the options 'force' and 'deny' are dropped, +which are rather testing artifacts from the old ages. + +always + Attempt to allocate huge pages every time we need a new page; + +inherit + Inherit the top-level "shmem_enabled" value. By default, PMD-sized hugepages + have enabled="inherit" and all other hugepage sizes have enabled="never"; + +never + Do not allocate huge pages; + +within_size + Only allocate huge page if it will be fully within i_size. 
+ Also respect fadvise()/madvise() hints; + +advise + Only allocate huge pages if requested with fadvise()/madvise(); + Need of application restart =========================== diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e54b874524b0..3e6a3f2316bb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,7 @@ #include #include /* only for vma_is_dax() */ +#include vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -67,6 +68,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; +extern struct kobj_attribute thpsize_shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1< #include #include +#include #include #include #include @@ -530,14 +531,6 @@ static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); static LIST_HEAD(thpsize_list); -struct thpsize { - struct kobject kobj; - struct list_head node; - int order; -}; - -#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) - static ssize_t thpsize_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -598,6 +591,9 @@ static struct kobj_attribute thpsize_enabled_attr = static struct attribute *thpsize_attrs[] = { &thpsize_enabled_attr.attr, +#ifdef CONFIG_SHMEM + &thpsize_shmem_enabled_attr.attr, +#endif NULL, }; diff --git a/mm/shmem.c b/mm/shmem.c index 47d9803fe2ef..79d029a072f1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -133,6 +133,13 @@ struct shmem_options { #define SHMEM_SEEN_QUOTA 32 }; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long huge_shmem_orders_always __read_mostly; +static unsigned long huge_shmem_orders_madvise __read_mostly; +static unsigned long huge_shmem_orders_inherit __read_mostly; +static unsigned long huge_shmem_orders_within_size __read_mostly; +#endif + #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -4799,6 +4806,12 @@ void __init shmem_init(void) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other multi-size THPs. 
+ */ + huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; @@ -4858,6 +4871,11 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (huge == SHMEM_HUGE_FORCE && + huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) + return -EINVAL; + shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; @@ -4865,6 +4883,84 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +static DEFINE_SPINLOCK(huge_shmem_orders_lock); + +static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_shmem_orders_always)) + output = "[always] inherit within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_inherit)) + output = "always [inherit] within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_within_size)) + output = "always inherit [within_size] advise never"; + else if (test_bit(order, &huge_shmem_orders_madvise)) + output = "always inherit within_size [advise] never"; + else + output = "always inherit within_size advise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_always); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (shmem_huge == SHMEM_HUGE_FORCE && + order != HPAGE_PMD_ORDER) + return -EINVAL; + + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_inherit); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "within_size")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + set_bit(order, &huge_shmem_orders_within_size); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "madvise")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + clear_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else { + ret = -EINVAL; + } + + return ret; +} + +struct kobj_attribute thpsize_shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); 
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #else /* !CONFIG_SHMEM */ -- Gitee From 961af301c8e94b243c26cdc0822187f3e6ecbb31 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:08 +0800 Subject: [PATCH 300/484] mm: shmem: add mTHP support for anonymous shmem commit e7a2ab7b3bb5d87f99f2ea3d4481d52fc5ceb52d upstream Conflicts: none Backport-reason: mTHP for Shmem Commit 19eaf44954df adds multi-size THP (mTHP) for anonymous pages, that can allow THP to be configured through the sysfs interface located at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'. However, the anonymous shmem will ignore the anonymous mTHP rule configured through the sysfs interface, and can only use the PMD-mapped THP, that is not reasonable. Users expect to apply the mTHP rule for all anonymous pages, including the anonymous shmem, in order to enjoy the benefits of mTHP. For example, lower latency than PMD-mapped THP, smaller memory bloat than PMD-mapped THP, contiguous PTEs on ARM architecture to reduce TLB miss etc. In addition, the mTHP interfaces can be extended to support all shmem/tmpfs scenarios in the future, especially for the shmem mmap() case. The primary strategy is similar to supporting anonymous mTHP. Introduce a new interface '/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled', which can have almost the same values as the top-level '/sys/kernel/mm/transparent_hugepage/shmem_enabled', with adding a new additional "inherit" option and dropping the testing options 'force' and 'deny'. By default all sizes will be set to "never" except PMD size, which is set to "inherit". This ensures backward compatibility with the anonymous shmem enabled of the top level, meanwhile also allows independent control of anonymous shmem enabled for each mTHP. Link: https://lkml.kernel.org/r/65796c1e72e51e15f3410195b5c2d5b6c160d411.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 10 +++ mm/shmem.c | 187 +++++++++++++++++++++++++++++++++------- 2 files changed, 167 insertions(+), 30 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 3e6a3f2316bb..9abc38159176 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -666,6 +666,16 @@ static inline bool thp_migration_supported(void) { return false; } + +static inline int highest_order(unsigned long orders) +{ + return 0; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/mm/shmem.c b/mm/shmem.c index 79d029a072f1..0d3c2213aec3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1647,6 +1647,107 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) return result; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + unsigned long mask = READ_ONCE(huge_shmem_orders_always); + unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); + unsigned long vm_flags = vma->vm_flags; + /* + * Check all the (large) orders below HPAGE_PMD_ORDER + 1 that + * are enabled for this vma. 
+ */ + unsigned long orders = BIT(PMD_ORDER + 1) - 1; + loff_t i_size; + int order; + + if ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return 0; + + /* If the hardware/firmware marked hugepage support disabled. */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) + return 0; + + /* + * Following the 'deny' semantics of the top level, force the huge + * option off from all mounts. + */ + if (shmem_huge == SHMEM_HUGE_DENY) + return 0; + + /* + * Only allow inherit orders if the top-level value is 'force', which + * means non-PMD sized THP can not override 'huge' mount option now. + */ + if (shmem_huge == SHMEM_HUGE_FORCE) + return READ_ONCE(huge_shmem_orders_inherit); + + /* Allow mTHP that will be fully within i_size. */ + order = highest_order(within_size_orders); + while (within_size_orders) { + index = round_up(index + 1, order); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= index) { + mask |= within_size_orders; + break; + } + + order = next_order(&within_size_orders, order); + } + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_shmem_orders_madvise); + + if (global_huge) + mask |= READ_ONCE(huge_shmem_orders_inherit); + + return orders & mask; +} + +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long pages; + int order; + + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + + /* Find the highest order that can add into the page cache */ + order = highest_order(orders); + while (orders) { + pages = 1UL << order; + index = round_down(index, pages); + if (!xa_find(&mapping->i_pages, &index, + index + pages - 1, XA_PRESENT)) + break; + order = next_order(&orders, order); + } + + return orders; +} +#else +static unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + return 0; +} + +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { @@ -1661,38 +1762,55 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, int order, return folio; } -static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, - struct inode *inode, pgoff_t index, - struct mm_struct *fault_mm, bool huge) +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, + gfp_t gfp, struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, unsigned long orders) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct folio *folio; + struct vm_area_struct *vma = vmf ? 
vmf->vma : NULL; + unsigned long suitable_orders = 0; + struct folio *folio = NULL; long pages; - int error; + int error, order; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - huge = false; + orders = 0; - if (huge) { - pages = HPAGE_PMD_NR; - index = round_down(index, HPAGE_PMD_NR); + if (orders > 0) { + if (vma && vma_is_anon_shmem(vma)) { + suitable_orders = shmem_suitable_orders(inode, vmf, + mapping, index, orders); + } else if (orders & BIT(HPAGE_PMD_ORDER)) { + pages = HPAGE_PMD_NR; + suitable_orders = BIT(HPAGE_PMD_ORDER); + index = round_down(index, HPAGE_PMD_NR); - /* - * Check for conflict before waiting on a huge allocation. - * Conflict might be that a huge page has just been allocated - * and added to page cache by a racing thread, or that there - * is already at least one small page in the huge extent. - * Be careful to retry when appropriate, but not forever! - * Elsewhere -EEXIST would be the right code, but not here. - */ - if (xa_find(&mapping->i_pages, &index, - index + HPAGE_PMD_NR - 1, XA_PRESENT)) - return ERR_PTR(-E2BIG); + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ + if (xa_find(&mapping->i_pages, &index, + index + HPAGE_PMD_NR - 1, XA_PRESENT)) + return ERR_PTR(-E2BIG); + } - folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index); - if (!folio && pages == HPAGE_PMD_NR) - count_vm_event(THP_FILE_FALLBACK); + order = highest_order(suitable_orders); + while (suitable_orders) { + pages = 1UL << order; + index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, index); + if (folio) + goto allocated; + + if (pages == HPAGE_PMD_NR) + count_vm_event(THP_FILE_FALLBACK); + order = next_order(&suitable_orders, order); + } } else { pages = 1; folio = shmem_alloc_folio(gfp, 0, info, index); @@ -1700,6 +1818,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, if (!folio) return ERR_PTR(-ENOMEM); +allocated: __folio_set_locked(folio); __folio_set_swapbacked(folio); @@ -2128,7 +2247,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct mm_struct *fault_mm; struct folio *folio; int error; - bool alloced; + bool alloced, huge; + unsigned long orders = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; @@ -2197,14 +2317,21 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return 0; } - if (shmem_is_huge(inode, index, false, fault_mm, - vma ? vma->vm_flags : 0)) { + huge = shmem_is_huge(inode, index, false, fault_mm, + vma ? vma->vm_flags : 0); + /* Find hugepage orders that are allowed for anonymous shmem. 
*/ + if (vma && vma_is_anon_shmem(vma)) + orders = shmem_allowable_huge_orders(inode, vma, index, huge); + else if (huge) + orders = BIT(HPAGE_PMD_ORDER); + + if (orders > 0) { gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); - folio = shmem_alloc_and_add_folio(huge_gfp, - inode, index, fault_mm, true); + folio = shmem_alloc_and_add_folio(vmf, huge_gfp, + inode, index, fault_mm, orders); if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); @@ -2214,7 +2341,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, goto repeat; } - folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false); + folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); if (IS_ERR(folio)) { error = PTR_ERR(folio); if (error == -EEXIST) @@ -2225,7 +2352,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, alloced: alloced = true; - if (folio_test_pmd_mappable(folio) && + if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio) - 1) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); -- Gitee From ea20c210785376f5639a159dabc90c40d0814e30 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:09 +0800 Subject: [PATCH 301/484] mm: shmem: add mTHP size alignment in shmem_get_unmapped_area commit 5a9dd10380a16b343aa87d80d5bcc24409a03f5b upstream Conflicts: none Backport-reason: mTHP for Shmem Although the top-level hugepage allocation can be turned off, anonymous shmem can still use mTHP by configuring the sysfs interface located at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled'. Therefore, add alignment for mTHP size to provide a suitable alignment address in shmem_get_unmapped_area(). Link: https://lkml.kernel.org/r/0c549b57cf7db07503af692d8546ecfad0fcce52.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Tested-by: Lance Yang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0d3c2213aec3..0af142b11970 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2539,6 +2539,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; + unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; @@ -2557,8 +2558,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (shmem_huge == SHMEM_HUGE_DENY) return addr; - if (len < HPAGE_PMD_SIZE) - return addr; if (flags & MAP_FIXED) return addr; /* @@ -2570,8 +2569,11 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (uaddr == addr) return addr; + hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; + unsigned long __maybe_unused hpage_orders; + int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); @@ -2584,18 +2586,38 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; + + /* + * Find the highest mTHP order used for anonymous shmem to + * provide a suitable alignment address. 
+ */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + hpage_orders = READ_ONCE(huge_shmem_orders_always); + hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); + hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); + if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); + + if (hpage_orders > 0) { + order = highest_order(hpage_orders); + hpage_size = PAGE_SIZE << order; + } +#endif } - if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } - offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); - if (offset && offset + len < 2 * HPAGE_PMD_SIZE) + if (len < hpage_size) + return addr; + + offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); + if (offset && offset + len < 2 * hpage_size) return addr; - if ((addr & (HPAGE_PMD_SIZE-1)) == offset) + if ((addr & (hpage_size - 1)) == offset) return addr; - inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; + inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) @@ -2607,10 +2629,10 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_addr & ~PAGE_MASK) return addr; - inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); + inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) - inflated_addr += HPAGE_PMD_SIZE; + inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; -- Gitee From c3ed893737948ef60695ba440f18b06f4ff5e384 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:10 +0800 Subject: [PATCH 302/484] mm: shmem: add mTHP counters for anonymous shmem commit 66f44583f9b617d74ffa2487e75a9c3adf344ddb upstream Conflicts: none Backport-reason: mTHP for Shmem Add mTHP counters for anonymous shmem. [baolin.wang@linux.alibaba.com: update Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/d86e2e7f-4141-432b-b2ba-c6691f36ef0b@linux.alibaba.com Link: https://lkml.kernel.org/r/4fd9e467d49ae4a747e428bcd821c7d13125ae67.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lance Yang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++ include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 6 ++++++ mm/shmem.c | 18 +++++++++++++++--- 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 99b606bc7762..a5984f895223 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -500,6 +500,19 @@ swpout_fallback Usually because failed to allocate some continuous swap space for the huge page. +file_alloc + is incremented every time a file huge page is successfully + allocated. + +file_fallback + is incremented if a file huge page is attempted to be allocated + but fails and instead falls back to using small pages. + +file_fallback_charge + is incremented if a file huge page cannot be charged and instead + falls back to using small pages even though the allocation was + successful. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. 
There are some counters in ``/proc/vmstat`` to help diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9abc38159176..5ccf5d997a7b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -288,6 +288,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_FILE_ALLOC, + MTHP_STAT_FILE_FALLBACK, + MTHP_STAT_FILE_FALLBACK_CHARGE, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index da946e12f1bb..71c905dd3e9a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -637,6 +637,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); +DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK); +DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -644,6 +647,9 @@ static struct attribute *stats_attrs[] = { &anon_fault_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, + &file_alloc_attr.attr, + &file_fallback_attr.attr, + &file_fallback_charge_attr.attr, NULL, }; diff --git a/mm/shmem.c b/mm/shmem.c index 0af142b11970..040dc58ed91b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1809,6 +1809,9 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK); +#endif order = next_order(&suitable_orders, order); } } else { @@ -1828,9 +1831,15 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; - } else if (pages == HPAGE_PMD_NR) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); + } else if (pages > 1) { + if (pages == HPAGE_PMD_NR) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK_CHARGE); +#endif } goto unlock; } @@ -2335,6 +2344,9 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC); +#endif goto alloced; } if (PTR_ERR(folio) == -EEXIST) -- Gitee From e39e9276c4431ceb64c4bec96fb5c19b19adb13c Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 28 Jun 2024 21:07:49 +0800 Subject: [PATCH 303/484] mm: add per-order mTHP split counters commit f216c845f3c772e54d27fe209fd300b10e7bf54a upstream Conflicts: none Backport-reason: mTHP statistic Patch series "mm: introduce per-order mTHP split counters", v3. At present, the split counters in THP statistics no longer include PTE-mapped mTHP. Therefore, we want to introduce per-order mTHP split counters to monitor the frequency of mTHP splits. This will assist developers in better analyzing and optimizing system performance. 
/sys/kernel/mm/transparent_hugepage/hugepages-/stats split split_failed split_deferred This patch (of 2): Currently, the split counters in THP statistics no longer include PTE-mapped mTHP. Therefore, we propose introducing per-order mTHP split counters to monitor the frequency of mTHP splits. This will help developers better analyze and optimize system performance. /sys/kernel/mm/transparent_hugepage/hugepages-/stats split split_failed split_deferred [ioworker0@gmail.com: make things more readable, per Barry and Baolin] Link: https://lkml.kernel.org/r/20240704012905.42971-2-ioworker0@gmail.com [ioworker0@gmail.com: use == for `order' test, per David] Link: https://lkml.kernel.org/r/20240705113119.82210-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240704012905.42971-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240704012905.42971-2-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-2-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Reviewed-by: Ryan Roberts Acked-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Bang Li Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 5ccf5d997a7b..d57038919bbb 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -291,6 +291,9 @@ enum mthp_stat_item { MTHP_STAT_FILE_ALLOC, MTHP_STAT_FILE_FALLBACK, MTHP_STAT_FILE_FALLBACK_CHARGE, + MTHP_STAT_SPLIT, + MTHP_STAT_SPLIT_FAILED, + MTHP_STAT_SPLIT_DEFERRED, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 71c905dd3e9a..87b671daf71b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -640,6 +640,9 @@ DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK); DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); +DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); +DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -650,6 +653,9 @@ static struct attribute *stats_attrs[] = { &file_alloc_attr.attr, &file_fallback_attr.attr, &file_fallback_charge_attr.attr, + &split_attr.attr, + &split_failed_attr.attr, + &split_deferred_attr.attr, NULL, }; @@ -3072,7 +3078,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; - bool is_thp = folio_test_pmd_mappable(folio); + int order = folio_order(folio); int extra_pins, ret; pgoff_t end; bool is_hzp; @@ -3248,8 +3254,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, i_mmap_unlock_read(mapping); out: xas_destroy(&xas); - if (is_thp) + if (order == HPAGE_PMD_ORDER) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); + count_mthp_stat(order, !ret ? 
MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); return ret; } @@ -3313,6 +3320,7 @@ void deferred_split_folio(struct folio *folio) if (list_empty(&folio->_deferred_list)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); + count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG -- Gitee From b86718caca31aa4aee326997f5f4f85994ff8bee Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 28 Jun 2024 21:07:50 +0800 Subject: [PATCH 304/484] mm: add docs for per-order mTHP split counters commit 9b89e018990de47c72ef8b2ca29204f88fda8f05 upstream Conflicts: none Backport-reason: mTHP statistic This commit introduces documentation for mTHP split counters in transhuge.rst. [ioworker0@gmail.com: improve the doc as suggested by Ryan] Link: https://lkml.kernel.org/r/20240704012905.42971-3-ioworker0@gmail.com [ioworker0@gmail.com: tweak Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/20240707013659.1151-1-ioworker0@gmail.com Link: https://lkml.kernel.org/r/20240628130750.73097-3-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Bang Li Cc: Baolin Wang Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index a5984f895223..2b57aabcb929 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -368,10 +368,6 @@ also applies to the regions registered in khugepaged. Monitoring usage ================ -.. note:: - Currently the below counters only record events relating to - PMD-sized THP. Events relating to other THP sizes are not included. - The number of PMD-sized anonymous transparent huge pages currently used by the system is available by reading the AnonHugePages field in ``/proc/meminfo``. To identify what applications are using PMD-sized anonymous transparent huge @@ -513,6 +509,21 @@ file_fallback_charge falls back to using small pages even though the allocation was successful. +split + is incremented every time a huge page is successfully split into + smaller orders. This can happen for a variety of reasons but a + common reason is that a huge page is old and is being reclaimed. + +split_failed + is incremented if kernel fails to split huge + page. This can happen if the page was pinned by somebody. + +split_deferred + is incremented when a huge page is put onto split queue. + This happens when a huge page is partially unmapped and splitting + it would free up some memory. Pages on split queue are going to + be split under memory pressure, if splitting is possible. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. 
There are some counters in ``/proc/vmstat`` to help -- Gitee From ba82a2f81c937e38cc7df9bc28cb0c57774ff71f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 10 Jul 2024 10:55:01 +0100 Subject: [PATCH 305/484] mm: shmem: rename mTHP shmem counters commit 63d9866ab01ffd0d0835d5564107283a4afc0a38 upstream Conflicts: none Backport-reason: mTHP statistic, kill the short lived file mthp stat The legacy PMD-sized THP counters at /proc/vmstat include thp_file_alloc, thp_file_fallback and thp_file_fallback_charge, which rather confusingly refer to shmem THP and do not include any other types of file pages. This is inconsistent since in most other places in the kernel, THP counters are explicitly separated for anon, shmem and file flavours. However, we are stuck with it since it constitutes a user ABI. Recently, commit 66f44583f9b6 ("mm: shmem: add mTHP counters for anonymous shmem") added equivalent mTHP stats for shmem, keeping the same "file_" prefix in the names. But in future, we may want to add extra stats to cover actual file pages, at which point, it would all become very confusing. So let's take the opportunity to rename these new counters "shmem_" before the change makes it upstream and the ABI becomes immutable. While we are at it, let's improve the documentation for the legacy counters to make it clear that they count shmem pages only. Link: https://lkml.kernel.org/r/20240710095503.3193901-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Baolin Wang Reviewed-by: Lance Yang Reviewed-by: Zi Yan Reviewed-by: Barry Song Acked-by: David Hildenbrand Cc: Daniel Gomez Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 29 ++++++++++++---------- include/linux/huge_mm.h | 6 ++--- mm/huge_memory.c | 12 ++++----- mm/shmem.c | 8 +++--- 4 files changed, 29 insertions(+), 26 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 2b57aabcb929..ee0e1b545007 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -412,20 +412,23 @@ thp_collapse_alloc_failed the allocation. thp_file_alloc - is incremented every time a file huge page is successfully - allocated. + is incremented every time a shmem huge page is successfully + allocated (Note that despite being named after "file", the counter + measures only shmem). thp_file_fallback - is incremented if a file huge page is attempted to be allocated - but fails and instead falls back to using small pages. + is incremented if a shmem huge page is attempted to be allocated + but fails and instead falls back to using small pages. (Note that + despite being named after "file", the counter measures only shmem). thp_file_fallback_charge - is incremented if a file huge page cannot be charged and instead + is incremented if a shmem huge page cannot be charged and instead falls back to using small pages even though the allocation was - successful. + successful. (Note that despite being named after "file", the + counter measures only shmem). thp_file_mapped - is incremented every time a file huge page is mapped into + is incremented every time a file or shmem huge page is mapped into user address space. thp_split_page @@ -496,16 +499,16 @@ swpout_fallback Usually because failed to allocate some continuous swap space for the huge page. 
-file_alloc - is incremented every time a file huge page is successfully +shmem_alloc + is incremented every time a shmem huge page is successfully allocated. -file_fallback - is incremented if a file huge page is attempted to be allocated +shmem_fallback + is incremented if a shmem huge page is attempted to be allocated but fails and instead falls back to using small pages. -file_fallback_charge - is incremented if a file huge page cannot be charged and instead +shmem_fallback_charge + is incremented if a shmem huge page cannot be charged and instead falls back to using small pages even though the allocation was successful. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d57038919bbb..7996878b42aa 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -288,9 +288,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_FILE_ALLOC, - MTHP_STAT_FILE_FALLBACK, - MTHP_STAT_FILE_FALLBACK_CHARGE, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 87b671daf71b..031cafbda11c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -637,9 +637,9 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); -DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); -DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK); -DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); +DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); +DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); @@ -650,9 +650,9 @@ static struct attribute *stats_attrs[] = { &anon_fault_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, - &file_alloc_attr.attr, - &file_fallback_attr.attr, - &file_fallback_charge_attr.attr, + &shmem_alloc_attr.attr, + &shmem_fallback_attr.attr, + &shmem_fallback_charge_attr.attr, &split_attr.attr, &split_failed_attr.attr, &split_deferred_attr.attr, diff --git a/mm/shmem.c b/mm/shmem.c index 040dc58ed91b..b235c01bfeb4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1810,7 +1810,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); #endif order = next_order(&suitable_orders, order); } @@ -1837,8 +1837,8 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, count_vm_event(THP_FILE_FALLBACK_CHARGE); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK); - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK_CHARGE); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); #endif } goto unlock; @@ -2345,7 +2345,7 @@ static int shmem_get_folio_gfp(struct 
inode *inode, pgoff_t index, if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); #endif goto alloced; } -- Gitee From bcba6ee2cd13457b52b0ff1a97418046589dac8e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 25 Apr 2024 05:00:55 +0100 Subject: [PATCH 306/484] mm: simplify thp_vma_allowable_order commit e0ffb29bc54d86b9ab10ebafc66eb1b7229e0cd7 upstream Conflicts: none Backport-reason: mTHP flags that will be used Combine the three boolean arguments into one flags argument for readability. Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- fs/proc/task_mmu.c | 4 ++-- include/linux/huge_mm.h | 29 +++++++++++++++-------------- mm/huge_memory.c | 7 +++++-- mm/khugepaged.c | 16 +++++++--------- mm/memory.c | 10 ++++++---- 5 files changed, 35 insertions(+), 31 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bf032a97e2de..0ce6efb430e9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -881,8 +881,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, - true, THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7996878b42aa..27958b782d6b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -90,8 +90,12 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; */ #define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) -#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order))) +#define TVA_SMAPS (1 << 0) /* Will be used for procfs */ +#define TVA_IN_PF (1 << 1) /* Page fault handler */ +#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ + +#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT @@ -228,17 +232,15 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags - * @smaps: whether answer will be used for smaps file - * @in_pf: whether answer will be used by page fault handler - * @enforce_sysfs: whether sysfs config should be taken into account + * @tva_flags: Which TVA flags to honour * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed @@ -251,12 +253,12 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, */ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, 
+ unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { /* Optimization to check if required orders are enabled early. */ - if (enforce_sysfs && vma_is_anonymous(vma)) { + if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) @@ -270,8 +272,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, - enforce_sysfs, orders); + return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } struct thpsize { @@ -557,8 +558,8 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, } static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 031cafbda11c..f88fc439e686 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -81,10 +81,13 @@ unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { + bool smaps = tva_flags & TVA_SMAPS; + bool in_pf = tva_flags & TVA_IN_PF; + bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; /* Check the intersection of requested and supported orders. */ orders &= vma_is_anonymous(vma) ? THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 70de487c6400..a7cb540c0897 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -483,7 +483,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, false, false, true, + if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } @@ -941,6 +941,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; + unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; @@ -951,8 +952,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - cc->is_khugepaged, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1541,8 +1541,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. 
*/ - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2419,8 +2418,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - true, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, + TVA_ENFORCE_SYSFS, PMD_ORDER)) { skip: progress++; continue; @@ -2960,8 +2959,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index d4b1a8c2ec1f..ec0bdb2a2da1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4566,8 +4566,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, - BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -5738,7 +5738,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5772,7 +5773,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- Gitee From cc36d9368bda94085dd1c74570d0b76752558359 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 16 Sep 2025 20:46:11 +0800 Subject: [PATCH 307/484] SUQSH: Fix build with HUGEPTEXT Signed-off-by: Kairui Song --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a7cb540c0897..0fa355bc9cae 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2574,7 +2574,7 @@ static unsigned int khugepaged_scan_exec_mm_slot(unsigned int pages, int *result } vma = find_vma(mm, slot->exec_vma[i]); - if (!vma && !thp_vma_allowable_order(vma, vma->vm_flags, false, false, true, PMD_ORDER)) { + if (!vma && !thp_vma_allowable_order(vma, vma->vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) { skip: progress++; continue; -- Gitee From bd641884bc781d5c0801044e2afcb644d31d98e3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 27 Mar 2024 11:23:20 -0400 Subject: [PATCH 308/484] mm/Kconfig: CONFIG_PGTABLE_HAS_HUGE_LEAVES commit ac3830c3b2666d08063f633a95cb838b7f8db90e upstream Conflicts: none Backport-reason: mTHP statistic: we need the cleanup and reduce conflict with some macros Patch series "mm/gup: Unify hugetlb, part 2", v4. 
The series removes the hugetlb slow gup path after a previous refactor work [1], so that slow gup now uses the exact same path to process all kinds of memory including hugetlb. For the long term, we may want to remove most, if not all, call sites of huge_pte_offset(). It'll be ideal if that API can be completely dropped from arch hugetlb API. This series is one small step towards merging hugetlb specific codes into generic mm paths. From that POV, this series removes one reference to huge_pte_offset() out of many others. One goal of such a route is that we can reconsider merging hugetlb features like High Granularity Mapping (HGM). It was not accepted in the past because it may add lots of hugetlb specific codes and make the mm code even harder to maintain. With a merged codeset, features like HGM can hopefully share some code with THP, legacy (PMD+) or modern (continuous PTEs). To make it work, the generic slow gup code will need to at least understand hugepd, which is already done like so in fast-gup. Due to the specialty of hugepd to be software-only solution (no hardware recognizes the hugepd format, so it's purely artificial structures), there's chance we can merge some or all hugepd formats with cont_pte in the future. That question is yet unsettled from Power side to have an acknowledgement. As of now for this series, I kept the hugepd handling because we may still need to do so before getting a clearer picture of the future of hugepd. The other reason is simply that we did it already for fast-gup and most codes are still around to be reused. It'll make more sense to keep slow/fast gup behave the same before a decision is made to remove hugepd. There's one major difference for slow-gup on cont_pte / cont_pmd handling, currently supported on three architectures (aarch64, riscv, ppc). Before the series, slow gup will be able to recognize e.g. cont_pte entries with the help of huge_pte_offset() when hstate is around. Now it's gone but still working, by looking up pgtable entries one by one. It's not ideal, but hopefully this change should not affect yet on major workloads. There's some more information in the commit message of the last patch. If this would be a concern, we can consider teaching slow gup to recognize cont pte/pmd entries, and that should recover the lost performance. But I doubt its necessity for now, so I kept it as simple as it can be. Patch layout ============= Patch 1-8: Preparation works, or cleanups in relevant code paths Patch 9-11: Teach slow gup with all kinds of huge entries (pXd, hugepd) Patch 12: Drop hugetlb_follow_page_mask() More information can be found in the commit messages of each patch. [1] https://lore.kernel.org/all/20230628215310.73782-1-peterx@redhat.com [2] https://lore.kernel.org/r/20240321215047.678172-1-peterx@redhat.com Introduce a config option that will be selected as long as huge leaves are involved in pgtable (thp or hugetlbfs). It would be useful to mark any code with this new config that can process either hugetlb or thp pages in any level that is higher than pte level. Link: https://lkml.kernel.org/r/20240327152332.950956-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20240327152332.950956-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Tested-by: Ryan Roberts Cc: Andrea Arcangeli Cc: Andrew Jones Cc: Aneesh Kumar K.V (IBM) Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index d388868d5afe..3768728e6bd6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -894,6 +894,12 @@ config READ_ONLY_THP_FOR_FS endif # TRANSPARENT_HUGEPAGE +# +# The architecture supports pgtable leaves that is larger than PAGE_SIZE +# +config PGTABLE_HAS_HUGE_LEAVES + def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE + # # UP and nommu archs use km based percpu allocator # -- Gitee From 86882afaa4d3abfe52baa3451b6060bcbd2bf2f2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 27 Mar 2024 11:23:21 -0400 Subject: [PATCH 309/484] mm/hugetlb: declare hugetlbfs_pagecache_present() non-static commit 24334e78e8e3ca6c8e51dad67d779b1ed40988a6 upstream Conflicts: none Backport-reason: mTHP statistic: we need the cleanup and reduce conflict with some macros It will be used outside hugetlb.c soon. Link: https://lkml.kernel.org/r/20240327152332.950956-3-peterx@redhat.com Signed-off-by: Peter Xu Tested-by: Ryan Roberts Cc: Andrea Arcangeli Cc: Andrew Jones Cc: Aneesh Kumar K.V (IBM) Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/hugetlb.h | 9 +++++++++ mm/hugetlb.c | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 74d96dba8656..d8bf19c1adb1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -174,6 +174,9 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud); +bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address); struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); @@ -1238,6 +1241,12 @@ static inline void hugetlb_register_node(struct node *node) static inline void hugetlb_unregister_node(struct node *node) { } + +static inline bool hugetlbfs_pagecache_present( + struct hstate *h, struct vm_area_struct *vma, unsigned long address) +{ + return false; +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0109ab23447d..b28687726089 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5805,8 +5805,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* * Return whether there is a pagecache page to back given address within VMA. 
*/ -static bool hugetlbfs_pagecache_present(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); -- Gitee From 8c5f458143cd23e09677c1908d9028f1aa32d182 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 27 Mar 2024 11:23:22 -0400 Subject: [PATCH 310/484] mm: make HPAGE_PXD_* macros even if !THP commit b979db1611a63b207dbc47706de989ac113ea898 upstream Conflicts: none Backport-reason: mTHP statistic: we need the cleanup and reduce conflict with some macros These macros can be helpful when we plan to merge hugetlb code into generic code. Move them out and define them as long as PGTABLE_HAS_HUGE_LEAVES is selected, because there are systems that only define HUGETLB_PAGE not THP. One note here is HPAGE_PMD_SHIFT must be defined even if PMD_SHIFT is not defined (e.g. !CONFIG_MMU case); it (or in other forms, like HPAGE_PMD_NR) is already used in lots of common codes without ifdef guards. Use the old trick to let complations work. Here we only need to differenciate HPAGE_PXD_SHIFT definitions. All the rest macros will be defined based on it. When at it, move HPAGE_PMD_NR / HPAGE_PMD_ORDER over together. Link: https://lkml.kernel.org/r/20240327152332.950956-4-peterx@redhat.com Signed-off-by: Peter Xu Tested-by: Ryan Roberts Cc: Andrea Arcangeli Cc: Andrew Jones Cc: Aneesh Kumar K.V (IBM) Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Rik van Riel Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 27958b782d6b..ab2a76de0ebf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -70,9 +70,6 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj, extern struct kobj_attribute shmem_enabled_attr; extern struct kobj_attribute thpsize_shmem_enabled_attr; -#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) -#define HPAGE_PMD_NR (1< Date: Thu, 8 Aug 2024 12:18:46 +0100 Subject: [PATCH 311/484] mm: cleanup count_mthp_stat() definition commit 246d3aa3e53151fa150f10257ddd8a4facd31a6a upstream Conflicts: none Backport-reason: mTHP statistic Patch series "Shmem mTHP controls and stats improvements", v3. This is a small series to tidy up the way the shmem controls and stats are exposed. These patches were previously part of the series at [2], but I decided to split them out since they can go in independently. This patch (of 2): Let's move count_mthp_stat() so that it's always defined, even when THP is disabled. Previously uses of the function in files such as shmem.c, which are compiled even when THP is disabled, required ugly THP ifdeferry. With this cleanup, we can remove those ifdefs and the function resolves to a nop when THP is disabled. I shortly plan to call count_mthp_stat() from more THP-invariant source files. 
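To illustrate the shape of that cleanup outside the kernel tree, here is a minimal, standalone sketch of the "always-defined stub" pattern it relies on. CONFIG_FOO_STATS, foo_count_event() and foo_events are invented names for the example, not the real mTHP symbols:

    /* sketch.c -- illustrative only; builds with or without -DCONFIG_FOO_STATS */
    #define FOO_MAX_ORDER 9

    #ifdef CONFIG_FOO_STATS
    static unsigned long foo_events[FOO_MAX_ORDER + 1];

    static inline void foo_count_event(int order)
    {
            if (order > 0 && order <= FOO_MAX_ORDER)
                    foo_events[order]++;     /* real accounting */
    }
    #else
    static inline void foo_count_event(int order)
    {
            (void)order;                     /* resolves to a nop when disabled */
    }
    #endif

    int main(void)
    {
            /* call sites never need an #ifdef around the call */
            foo_count_event(4);
            return 0;
    }

This is the same shape count_mthp_stat() takes after the move: callers such as shmem.c and memory.c can invoke it unconditionally, and a !THP build gets an empty inline.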
Link: https://lkml.kernel.org/r/20240808111849.651867-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240808111849.651867-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Barry Song Reviewed-by: Baolin Wang Reviewed-by: Lance Yang Acked-by: David Hildenbrand Cc: Gavin Shan Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 70 ++++++++++++++++++++--------------------- mm/memory.c | 2 -- mm/shmem.c | 6 ---- 3 files changed, 35 insertions(+), 43 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ab2a76de0ebf..9058bcc0f2fd 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -112,6 +112,41 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) +enum mthp_stat_item { + MTHP_STAT_ANON_FAULT_ALLOC, + MTHP_STAT_ANON_FAULT_FALLBACK, + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_SWPOUT, + MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, + MTHP_STAT_SPLIT, + MTHP_STAT_SPLIT_FAILED, + MTHP_STAT_SPLIT_DEFERRED, + __MTHP_STAT_COUNT +}; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) +struct mthp_stat { + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; +}; + +DECLARE_PER_CPU(struct mthp_stat, mthp_stats); + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + if (order <= 0 || order > PMD_ORDER) + return; + + this_cpu_inc(mthp_stats.stats[order][item]); +} +#else +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long transparent_hugepage_flags; @@ -291,41 +326,6 @@ struct thpsize { #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) -enum mthp_stat_item { - MTHP_STAT_ANON_FAULT_ALLOC, - MTHP_STAT_ANON_FAULT_FALLBACK, - MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, - MTHP_STAT_SWPOUT, - MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_SHMEM_ALLOC, - MTHP_STAT_SHMEM_FALLBACK, - MTHP_STAT_SHMEM_FALLBACK_CHARGE, - MTHP_STAT_SPLIT, - MTHP_STAT_SPLIT_FAILED, - MTHP_STAT_SPLIT_DEFERRED, - __MTHP_STAT_COUNT -}; - -struct mthp_stat { - unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; -}; - -#ifdef CONFIG_SYSFS -DECLARE_PER_CPU(struct mthp_stat, mthp_stats); - -static inline void count_mthp_stat(int order, enum mthp_stat_item item) -{ - if (order <= 0 || order > PMD_ORDER) - return; - - this_cpu_inc(mthp_stats.stats[order][item]); -} -#else -static inline void count_mthp_stat(int order, enum mthp_stat_item item) -{ -} -#endif - #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<vm_mm, MM_ANONPAGES, nr_pages); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); -#endif folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: diff --git a/mm/shmem.c b/mm/shmem.c index b235c01bfeb4..7999d40d475b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1809,9 +1809,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); -#endif order = next_order(&suitable_orders, order); } } else { @@ -1836,10 +1834,8 @@ static struct folio *shmem_alloc_and_add_folio(struct 
vm_fault *vmf, count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); -#endif } goto unlock; } @@ -2344,9 +2340,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); -#endif goto alloced; } if (PTR_ERR(folio) == -EEXIST) -- Gitee From 43fd1b42c13554ec7be195f385bc191055a5885f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 8 Aug 2024 12:18:47 +0100 Subject: [PATCH 312/484] mm: tidy up shmem mTHP controls and stats commit 70e59a75283bdcc49c8ee104c2d49a22e4912305 upstream Conflicts: none Backport-reason: mTHP statistic Previously we had a situation where shmem mTHP controls and stats were not exposed for some supported sizes and were exposed for some unsupported sizes. So let's clean that up. Anon mTHP can support all large orders [2, PMD_ORDER]. But shmem can support all large orders [1, MAX_PAGECACHE_ORDER]. However, per-size shmem controls and stats were previously being exposed for all the anon mTHP orders, meaning order-1 was not present, and for arm64 64K base pages, orders 12 and 13 were exposed but were not supported internally. Tidy this all up by defining ctrl and stats attribute groups for anon and file separately. Anon ctrl and stats groups are populated for all orders in THP_ORDERS_ALL_ANON and file ctrl and stats groups are populated for all orders in THP_ORDERS_ALL_FILE_DEFAULT. Additionally, create "any" ctrl and stats attribute groups which are populated for all orders in (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT). swpout stats use this since they apply to anon and shmem. The side-effect of all this is that different hugepage-*kB directories contain different sets of controls and stats, depending on which memory types support that size. This approach is preferred over the alternative, which is to populate dummy controls and stats for memory types that do not support a given size. 
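A rough, standalone illustration of how per-order masks decide which groups a given hugepages-*kB directory ends up with; the masks below are example values chosen for the sketch, not the kernel's THP_ORDERS_* definitions:

    /* illustrative only */
    #include <stdio.h>

    #define BIT(n) (1UL << (n))
    /* pretend anon THP supports orders 2..9 and file/shmem orders 1..9 */
    #define ORDERS_ANON ((BIT(10) - 1) & ~(BIT(0) | BIT(1)))
    #define ORDERS_FILE ((BIT(10) - 1) & ~BIT(0))

    int main(void)
    {
            for (int order = 1; order <= 9; order++) {
                    printf("hugepages(order %d): any", order);
                    if (BIT(order) & ORDERS_ANON)
                            printf(" + anon");
                    if (BIT(order) & ORDERS_FILE)
                            printf(" + file");
                    printf("\n");
            }
            return 0;
    }

With these example masks the order-1 directory carries only the "any" and file groups, while PMD-sized directories carry both sets, matching the idea that controls and stats are only exposed where the size is actually supported.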
[ryan.roberts@arm.com: file pages and shmem can also be split] Link: https://lkml.kernel.org/r/f7ced14c-8bc5-405f-bee7-94f63980f525@arm.comLink: https://lkml.kernel.org/r/20240808111849.651867-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Barry Song Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Gavin Shan Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 140 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 112 insertions(+), 28 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f88fc439e686..69705dd0b63f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -534,8 +534,8 @@ static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); static LIST_HEAD(thpsize_list); -static ssize_t thpsize_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t anon_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; @@ -552,9 +552,9 @@ static ssize_t thpsize_enabled_show(struct kobject *kobj, return sysfs_emit(buf, "%s\n", output); } -static ssize_t thpsize_enabled_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t anon_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; @@ -589,19 +589,35 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj, return ret; } -static struct kobj_attribute thpsize_enabled_attr = - __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store); +static struct kobj_attribute anon_enabled_attr = + __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); -static struct attribute *thpsize_attrs[] = { - &thpsize_enabled_attr.attr, +static struct attribute *anon_ctrl_attrs[] = { + &anon_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group anon_ctrl_attr_grp = { + .attrs = anon_ctrl_attrs, +}; + +static struct attribute *file_ctrl_attrs[] = { #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif NULL, }; -static const struct attribute_group thpsize_attr_group = { - .attrs = thpsize_attrs, +static const struct attribute_group file_ctrl_attr_grp = { + .attrs = file_ctrl_attrs, +}; + +static struct attribute *any_ctrl_attrs[] = { + NULL, +}; + +static const struct attribute_group any_ctrl_attr_grp = { + .attrs = any_ctrl_attrs, }; static const struct kobj_type thpsize_ktype = { @@ -640,64 +656,132 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); +#ifdef CONFIG_SHMEM DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); +#endif DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); -static struct attribute *stats_attrs[] = { +static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, +#ifndef CONFIG_SHMEM 
&swpout_attr.attr, &swpout_fallback_attr.attr, +#endif + &split_deferred_attr.attr, + NULL, +}; + +static struct attribute_group anon_stats_attr_grp = { + .name = "stats", + .attrs = anon_stats_attrs, +}; + +static struct attribute *file_stats_attrs[] = { +#ifdef CONFIG_SHMEM &shmem_alloc_attr.attr, &shmem_fallback_attr.attr, &shmem_fallback_charge_attr.attr, +#endif + NULL, +}; + +static struct attribute_group file_stats_attr_grp = { + .name = "stats", + .attrs = file_stats_attrs, +}; + +static struct attribute *any_stats_attrs[] = { +#ifdef CONFIG_SHMEM + &swpout_attr.attr, + &swpout_fallback_attr.attr, +#endif &split_attr.attr, &split_failed_attr.attr, - &split_deferred_attr.attr, NULL, }; -static struct attribute_group stats_attr_group = { +static struct attribute_group any_stats_attr_grp = { .name = "stats", - .attrs = stats_attrs, + .attrs = any_stats_attrs, }; +static int sysfs_add_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + int ret = -ENOENT; + + /* + * If the group is named, try to merge first, assuming the subdirectory + * was already created. This avoids the warning emitted by + * sysfs_create_group() if the directory already exists. + */ + if (grp->name) + ret = sysfs_merge_group(kobj, grp); + if (ret) + ret = sysfs_create_group(kobj, grp); + + return ret; +} + static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; struct thpsize *thpsize; - int ret; + int ret = -ENOMEM; thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); if (!thpsize) - return ERR_PTR(-ENOMEM); + goto err; + + thpsize->order = order; ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, "hugepages-%lukB", size); if (ret) { kfree(thpsize); - return ERR_PTR(ret); + goto err; } - ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group); - if (ret) { - kobject_put(&thpsize->kobj); - return ERR_PTR(ret); + + ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp); + if (ret) + goto err_put; + + if (BIT(order) & THP_ORDERS_ALL_ANON) { + ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp); + if (ret) + goto err_put; } - ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); - if (ret) { - kobject_put(&thpsize->kobj); - return ERR_PTR(ret); + if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) { + ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp); + if (ret) + goto err_put; } - thpsize->order = order; return thpsize; +err_put: + kobject_put(&thpsize->kobj); +err: + return ERR_PTR(ret); } static void thpsize_release(struct kobject *kobj) @@ -737,7 +821,7 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) goto remove_hp_group; } - orders = THP_ORDERS_ALL_ANON; + orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT; order = highest_order(orders); while (orders) { thpsize = thpsize_create(order, *hugepage_kobj); -- Gitee From aab278c5bd62a9924f113099f5849a5d1ddff16e Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:22 -0700 Subject: [PATCH 313/484] mm: swap: count successful large folio zswap stores in hugepage zswpout stats commit 0c560dd86040556a9e55d88229d9295672428c78 upstream Conflicts: none Backport-reason: ZSWAP mTHP support Added a new MTHP_STAT_ZSWPOUT entry 
to the sysfs transparent_hugepage stats so that successful large folio zswap stores can be accounted under the per-order sysfs "zswpout" stats: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout Other non-zswap swap device swap-out events will be counted under the existing sysfs "swpout" stats: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/swpout Also, added documentation for the newly added sysfs per-order hugepage "zswpout" stats. The documentation clarifies that only non-zswap swapouts will be accounted in the existing "swpout" stats. Link: https://lkml.kernel.org/r/20241001053222.6944-8-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: Yosry Ahmed Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 8 ++++++-- include/linux/huge_mm.h | 1 + mm/huge_memory.c | 3 +++ mm/page_io.c | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index ee0e1b545007..59e77e363bfc 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -490,10 +490,14 @@ anon_fault_fallback_charge instead falls back to using huge pages with lower orders or small pages even though the allocation was successful. -swpout - is incremented every time a huge page is swapped out in one +zswpout + is incremented every time a huge page is swapped out to zswap in one piece without splitting. +swpout + is incremented every time a huge page is swapped out to a non-zswap + swap device in one piece without splitting. + swpout_fallback is incremented if a huge page has to be split before swapout. 
Usually because failed to allocate some continuous swap space diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9058bcc0f2fd..86189aca4046 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -116,6 +116,7 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 69705dd0b63f..5e778c87d05e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -654,6 +654,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -670,6 +671,7 @@ static struct attribute *anon_stats_attrs[] = { &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, #ifndef CONFIG_SHMEM + &zswpout_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -698,6 +700,7 @@ static struct attribute_group file_stats_attr_grp = { static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM + &zswpout_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif diff --git a/mm/page_io.c b/mm/page_io.c index 7c1d5f182626..914b218f9d47 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -276,6 +276,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) swap_zeromap_folio_clear(folio); } if (zswap_store(folio)) { + count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); folio_unlock(folio); return 0; } -- Gitee From b36ce906adefc39559c5b16ee9ced4655a6b5972 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Wed, 2 Oct 2024 10:33:29 -0700 Subject: [PATCH 314/484] mm: zswap: zswap_store_page() will initialize entry after adding to xarray. commit ed882add6ded66ece28eed8714aa18acdfb90b0c upstream Conflicts: none Backport-reason: ZSWAP mTHP support This incorporates Yosry's suggestions in [1] for further simplifying zswap_store_page(). If the page is successfully compressed and added to the xarray, we get the pool/objcg refs, and initialize all the entry's members. Only after this, we add it to the zswap LRU. In the time between the entry's addition to the xarray and it's member initialization, we are protected against concurrent stores/loads/swapoff through the folio lock, and are protected against writeback because the entry is not on the LRU yet. This way, we don't have to drop the pool/objcg refs, now that the entry initialization is centralized to the successful page store code path. zswap_compress() is modified to take a zswap_pool parameter in keeping with this simplification (as against obtaining this from entry->pool). 
[1]: https://lore.kernel.org/all/CAJD7tkZh6ufHQef5HjXf_F5b5LC1EATexgseD=4WvrO+a6Ni6w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20241002173329.213722-1-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Cc: Chengming Zhou Cc: Huang Ying Cc: Johannes Weiner Cc: Nhat Pham Cc: Ryan Roberts Cc: Wajdi Feghali Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 56 +++++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index ecf4d67d1659..43b2a38f659d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -879,7 +879,8 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -static bool zswap_compress(struct page *page, struct zswap_entry *entry) +static bool zswap_compress(struct page *page, struct zswap_entry *entry, + struct zswap_pool *pool) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; @@ -891,7 +892,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry) gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); @@ -924,7 +925,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry) if (comp_ret) goto unlock; - zpool = entry->pool->zpool; + zpool = pool->zpool; gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; @@ -1409,32 +1410,21 @@ static ssize_t zswap_store_page(struct page *page, struct obj_cgroup *objcg, struct zswap_pool *pool) { + swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; - goto reject; + return -EINVAL; } - /* zswap_store() already holds a ref on 'objcg' and 'pool' */ - if (objcg) - obj_cgroup_get(objcg); - zswap_pool_get(pool); - - /* if entry is successfully added, it keeps the reference */ - entry->pool = pool; + if (!zswap_compress(page, entry, pool)) + goto compress_failed; - if (!zswap_compress(page, entry)) - goto put_pool_objcg; - - entry->swpentry = page_swap_entry(page); - entry->objcg = objcg; - entry->referenced = true; - - old = xa_store(swap_zswap_tree(entry->swpentry), - swp_offset(entry->swpentry), + old = xa_store(swap_zswap_tree(page_swpentry), + swp_offset(page_swpentry), entry, GFP_KERNEL); if (xa_is_err(old)) { int err = xa_err(old); @@ -1452,6 +1442,16 @@ static ssize_t zswap_store_page(struct page *page, if (old) zswap_entry_free(old); + /* + * The entry is successfully compressed and stored in the tree, there is + * no further possibility of failure. Grab refs to the pool and objcg. + * These refs will be dropped by zswap_entry_free() when the entry is + * removed from the tree. + */ + zswap_pool_get(pool); + if (objcg) + obj_cgroup_get(objcg); + /* * We finish initializing the entry while it's already in xarray. * This is safe because: @@ -1462,25 +1462,21 @@ static ssize_t zswap_store_page(struct page *page, * The publishing order matters to prevent writeback from seeing * an incoherent entry. */ + entry->pool = pool; + entry->swpentry = page_swpentry; + entry->objcg = objcg; + entry->referenced = true; if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&zswap_list_lru, entry); } - /* - * We shouldn't have any possibility of failure after the entry is - * added in the xarray. 
The pool/objcg refs obtained here will only - * be dropped if/when zswap_entry_free() gets called. - */ return entry->length; store_failed: - zpool_free(entry->pool->zpool, entry->handle); -put_pool_objcg: - zswap_pool_put(pool); - obj_cgroup_put(objcg); + zpool_free(pool->zpool, entry->handle); +compress_failed: zswap_entry_cache_free(entry); -reject: return -EINVAL; } -- Gitee From 2cd900039f1d0a5606ecbde131daf7f53c013c5d Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 7 Mar 2024 13:18:53 -0500 Subject: [PATCH 315/484] mm/huge_memory: check new folio order when split a folio commit 1412ecb3d256e5d23c6bd150e110cef6e3736844 upstream Conflicts: none Backport-reason: mTHP splitting A folio can only be split into lower orders. Since there are no new_order checks in debugfs, any new_order can be passed via debugfs into split_huge_page_to_list_to_order(). Check new_order to make sure it is smaller than input folio order. Link: https://lkml.kernel.org/r/20240307181854.138928-1-zi.yan@sent.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Zi Yan Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-mm/7dda9283-b437-4cf8-ab0d-83c330deb9c0@moroto.mountain/ Cc: David Hildenbrand Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5e778c87d05e..ed4645823fdd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3176,6 +3176,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (new_order >= folio_order(folio)) + return -EINVAL; + /* Cannot split anonymous THP to order-1 */ if (new_order == 1 && folio_test_anon(folio)) { VM_WARN_ONCE(1, "Cannot split to order-1 folio"); -- Gitee From 0e708668c16ef9cbcecf244213e8035bc6dfa352 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Fri, 7 Jun 2024 17:40:48 +0800 Subject: [PATCH 316/484] mm: huge_memory: fix misused mapping_large_folio_support() for anon folios commit 6a50c9b512f7734bc356f4bd47885a6f7c98491a upstream Conflicts: none Backport-reason: mTHP splitting When I did a large folios split test, a WARNING "[ 5059.122759][ T166] Cannot split file folio to non-0 order" was triggered. But the test cases are only for anonmous folios. while mapping_large_folio_support() is only reasonable for page cache folios. In split_huge_page_to_list_to_order(), the folio passed to mapping_large_folio_support() maybe anonmous folio. The folio_test_anon() check is missing. So the split of the anonmous THP is failed. This is also the same for shmem_mapping(). We'd better add a check for both. But the shmem_mapping() in __split_huge_page() is not involved, as for anonmous folios, the end parameter is set to -1, so (head[i].index >= end) is always false. shmem_mapping() is not called. Also add a VM_WARN_ON_ONCE() in mapping_large_folio_support() for anon mapping, So we can detect the wrong use more easily. THP folios maybe exist in the pagecache even the file system doesn't support large folio, it is because when CONFIG_TRANSPARENT_HUGEPAGE is enabled, khugepaged will try to collapse read-only file-backed pages to THP. But the mapping does not actually support multi order large folios properly. 
Using /sys/kernel/debug/split_huge_pages to verify this, with this patch, large anon THP is successfully split and the warning is ceased. Link: https://lkml.kernel.org/r/202406071740485174hcFl7jRxncsHDtI-Pz-o@zte.com.cn Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Reviewed-by: Barry Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand Signed-off-by: Ran Xiaokai Cc: Michal Hocko Cc: xu xin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pagemap.h | 4 ++++ mm/huge_memory.c | 28 +++++++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9c169c9d09cb..07280f01eaab 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -379,6 +379,10 @@ static inline void mapping_set_large_folios(struct address_space *mapping) */ static inline bool mapping_large_folio_support(struct address_space *mapping) { + /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ed4645823fdd..05317df1594a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3179,30 +3179,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (new_order >= folio_order(folio)) return -EINVAL; - /* Cannot split anonymous THP to order-1 */ - if (new_order == 1 && folio_test_anon(folio)) { - VM_WARN_ONCE(1, "Cannot split to order-1 folio"); - return -EINVAL; - } - - if (new_order) { - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) + if (folio_test_anon(folio)) { + /* order-1 is not supported for anonymous THP. */ + if (new_order == 1) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); return -EINVAL; + } + } else if (new_order) { /* Split shmem folio to non-zero order not supported */ if (shmem_mapping(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split shmem folio to non-0 order"); return -EINVAL; } - /* No split if the file system does not support large folio */ - if (!mapping_large_folio_support(folio->mapping)) { + /* + * No split if the file system does not support large folio. + * Note that we might still have THPs in such mappings due to + * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping + * does not actually support large folios properly. + */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split file folio to non-0 order"); return -EINVAL; } } + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio) && new_order) + return -EINVAL; is_hzp = is_huge_zero_page(&folio->page); if (is_hzp) { -- Gitee From ef602217f5d03a285cac6e83c54629bdbea4e26c Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 8 Nov 2023 16:49:20 +0000 Subject: [PATCH 317/484] mm/page_alloc: dedupe some memcg uncharging logic commit 17b46e7beb8fe4e4807f70aaa615cf50a5ba9d3a upstream Conflicts: minor, due to already backported 66edc3a5894c, solve the conflict to be align with upstream. Backport-reason: mTHP statistic: clean up and prepare The duplication makes it seem like some work is required before uncharging in the !PageHWPoison case. But it isn't, so we can simplify the code a little. 
Note the PageMemcgKmem check is redundant, but I've left it in as it avoids an unnecessary function call. Link: https://lkml.kernel.org/r/20231108164920.3401565-1-jackmanb@google.com Signed-off-by: Brendan Jackman Reviewed-by: Yosry Ahmed Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4703c1d6d1e1..d7d4664b7e87 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1098,6 +1098,9 @@ __always_inline bool free_pages_prepare(struct page *page, trace_mm_page_free(page, order); kmsan_free_page(page, order); + if (memcg_kmem_online() && PageMemcgKmem(page)) + __memcg_kmem_uncharge_page(page, order); + /* * In rare cases, when truncation or holepunching raced with * munlock after VM_LOCKED was cleared, Mlocked may still be @@ -1113,12 +1116,7 @@ __always_inline bool free_pages_prepare(struct page *page, } if (unlikely(PageHWPoison(page)) && !order) { - /* - * Do not let hwpoison pages hit pcplists/buddy - * Untie memcg state and reset page's owner - */ - if (memcg_kmem_online() && PageMemcgKmem(page)) - __memcg_kmem_uncharge_page(page, order); + /* Do not let hwpoison pages hit pcplists/buddy */ reset_page_owner(page, order); page_table_check_free(page, order); return false; @@ -1150,8 +1148,6 @@ __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_online() && PageMemcgKmem(page)) - __memcg_kmem_uncharge_page(page, order); if (is_check_pages_enabled()) { if (free_page_is_bad(page)) bad++; -- Gitee From ec23fdb899eaab07742f81e395329bb4d084fb77 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 24 Aug 2024 13:04:40 +1200 Subject: [PATCH 318/484] mm: count the number of anonymous THPs per size commit 5d65c8d758f2596c008009e39bb2614deed2c730 upstream Conflicts: none Backport-reason: mTHP statistic Patch series "mm: count the number of anonymous THPs per size", v4. Knowing the number of transparent anon THPs in the system is crucial for performance analysis. It helps in understanding the ratio and distribution of THPs versus small folios throughout the system. Additionally, partial unmapping by userspace can lead to significant waste of THPs over time and increase memory reclamation pressure. We need this information for comprehensive system tuning. This patch (of 2): Let's track for each anonymous THP size, how many of them are currently allocated. We'll track the complete lifespan of an anon THP, starting when it becomes an anon THP ("large anon folio") (->mapping gets set), until it gets freed (->mapping gets cleared). Introduce a new "nr_anon" counter per THP size and adjust the corresponding counter in the following cases: * We allocate a new THP and call folio_add_new_anon_rmap() to map it the first time and turn it into an anon THP. * We split an anon THP into multiple smaller ones. * We migrate an anon THP, when we prepare the destination. * We free an anon THP back to the buddy. Note that AnonPages in /proc/meminfo currently tracks the total number of *mapped* anonymous *pages*, and therefore has slightly different semantics. In the future, we might also want to track "nr_anon_mapped" for each THP size, which might be helpful when comparing it to the number of allocated anon THPs (long-term pinning, stuck in swapcache, memory leaks, ...). 
Further note that for now, we only track anon THPs after they got their ->mapping set, for example via folio_add_new_anon_rmap(). If we would allocate some in the swapcache, they will only show up in the statistics for now after they have been mapped to user space the first time, where we call folio_add_new_anon_rmap(). [akpm@linux-foundation.org: documentation fixups, per David] Link: https://lkml.kernel.org/r/3e8add35-e26b-443b-8a04-1078f4bc78f6@redhat.com Link: https://lkml.kernel.org/r/20240824010441.21308-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240824010441.21308-2-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: Chuanhua Han Cc: Kairui Song Cc: Kalesh Singh Cc: Lance Yang Cc: Ryan Roberts Cc: Shuai Yuan Cc: Usama Arif Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 5 +++++ include/linux/huge_mm.h | 15 +++++++++++++-- mm/huge_memory.c | 13 ++++++++++--- mm/migrate.c | 4 ++++ mm/page_alloc.c | 5 ++++- mm/rmap.c | 1 + 6 files changed, 37 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 59e77e363bfc..6a3f3b6f1ee1 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -531,6 +531,11 @@ split_deferred it would free up some memory. Pages on split queue are going to be split under memory pressure, if splitting is possible. +nr_anon + the number of anonymous THP we have in the whole system. These THPs + might be currently entirely mapped or have partially unmapped/unused + subpages. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. 
There are some counters in ``/proc/vmstat`` to help diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 86189aca4046..a1bdbe3e21ea 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -125,6 +125,7 @@ enum mthp_stat_item { MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, + MTHP_STAT_NR_ANON, __MTHP_STAT_COUNT }; @@ -135,14 +136,24 @@ struct mthp_stat { DECLARE_PER_CPU(struct mthp_stat, mthp_stats); -static inline void count_mthp_stat(int order, enum mthp_stat_item item) +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) { if (order <= 0 || order > PMD_ORDER) return; - this_cpu_inc(mthp_stats.stats[order][item]); + this_cpu_add(mthp_stats.stats[order][item], delta); +} + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + mod_mthp_stat(order, item, 1); } + #else +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) +{ +} + static inline void count_mthp_stat(int order, enum mthp_stat_item item) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 05317df1594a..bcfcb4dee7ac 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -665,6 +665,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); +DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -676,6 +677,7 @@ static struct attribute *anon_stats_attrs[] = { &swpout_fallback_attr.attr, #endif &split_deferred_attr.attr, + &nr_anon_attr.attr, NULL, }; @@ -3166,8 +3168,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, struct deferred_split *ds_queue = get_deferred_split_queue(folio); /* reset xarray order to new order after split */ XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); - struct anon_vma *anon_vma = NULL; + bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; + struct anon_vma *anon_vma = NULL; int order = folio_order(folio); int extra_pins, ret; pgoff_t end; @@ -3179,7 +3182,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (new_order >= folio_order(folio)) return -EINVAL; - if (folio_test_anon(folio)) { + if (is_anon) { /* order-1 is not supported for anonymous THP. 
*/ if (new_order == 1) { VM_WARN_ONCE(1, "Cannot split to order-1 folio"); @@ -3219,7 +3222,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (folio_test_writeback(folio)) return -EBUSY; - if (folio_test_anon(folio)) { + if (is_anon) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -3332,6 +3335,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, } } + if (is_anon) { + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); + mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order)); + } __split_huge_page(page, list, end, new_order); ret = 0; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 00633c8aa332..0376faa4de35 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -415,6 +415,8 @@ int folio_migrate_mapping(struct address_space *mapping, /* No turning back from here */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; + if (folio_test_anon(folio) && folio_test_large(folio)) + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); @@ -436,6 +438,8 @@ int folio_migrate_mapping(struct address_space *mapping, */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; + if (folio_test_anon(folio) && folio_test_large(folio)) + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d7d4664b7e87..e30af69350eb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1146,8 +1146,11 @@ __always_inline bool free_pages_prepare(struct page *page, (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageMappingFlags(page)) + if (PageMappingFlags(page)) { + if (PageAnon(page)) + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); page->mapping = NULL; + } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) bad++; diff --git a/mm/rmap.c b/mm/rmap.c index be410371e3d3..43f570549d3a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1428,6 +1428,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } __folio_mod_stat(folio, nr, nr_pmdmapped); + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); } static __always_inline void __folio_add_file_rmap(struct folio *folio, -- Gitee From 1bd8e584f49a99eec280b0771b3ead7199892355 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 24 Aug 2024 13:04:41 +1200 Subject: [PATCH 319/484] mm: count the number of partially mapped anonymous THPs per size commit 8175ebfd302abe6fbdca9037f763ecbfdb8db572 upstream Conflicts: none Backport-reason: mTHP statistic When a THP is added to the deferred_list due to partially mapped, its partial pages are unused, leading to wasted memory and potentially increasing memory reclamation pressure. Detailing the specifics of how unmapping occurs is quite difficult and not that useful, so we adopt a simple approach: each time a THP enters the deferred_list, we increment the count by 1; whenever it leaves for any reason, we decrement the count by 1. 
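A standalone sketch of the bookkeeping that this counter and the nr_anon counter from the previous patch follow; it uses plain arrays instead of the real per-CPU mthp_stats and invented helper names, only to show that each event adjusts the matching order by a paired delta:

    /* illustrative only -- not the kernel implementation */
    #include <stdio.h>

    #define MAX_ORDER 9
    static long nr_anon[MAX_ORDER + 1];
    static long nr_anon_partial[MAX_ORDER + 1];

    static void mod_stat(long *stat, int order, long delta)
    {
            if (order <= 0 || order > MAX_ORDER)
                    return;                     /* only large folios are tracked */
            stat[order] += delta;
    }

    int main(void)
    {
            mod_stat(nr_anon, 4, 1);            /* new order-4 anon THP mapped     */
            mod_stat(nr_anon_partial, 4, 1);    /* partially unmapped, queued      */
            mod_stat(nr_anon_partial, 4, -1);   /* leaves the deferred list ...    */
            mod_stat(nr_anon, 4, -1);           /* ... because it is split:        */
            mod_stat(nr_anon, 2, 1 << (4 - 2)); /* one order-4 -> four order-2     */
            mod_stat(nr_anon, 2, -4);           /* the pieces are freed            */

            printf("nr_anon[2]=%ld partially_mapped[4]=%ld\n",
                   nr_anon[2], nr_anon_partial[4]);
            return 0;
    }

The real counters live in per-CPU buckets and are read back through the per-size sysfs stats files; the point of the sketch is only that every increment has a matching decrement over a folio's lifetime.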
Link: https://lkml.kernel.org/r/20240824010441.21308-3-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: Chuanhua Han Cc: Kairui Song Cc: Kalesh Singh Cc: Lance Yang Cc: Ryan Roberts Cc: Shuai Yuan Cc: Usama Arif Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 7 +++++++ include/linux/huge_mm.h | 1 + mm/huge_memory.c | 6 ++++++ 3 files changed, 14 insertions(+) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 6a3f3b6f1ee1..161e52b93757 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -536,6 +536,13 @@ nr_anon might be currently entirely mapped or have partially unmapped/unused subpages. +nr_anon_partially_mapped + the number of anonymous THP which are likely partially mapped, possibly + wasting memory, and have been queued for deferred memory reclamation. + Note that in corner some cases (e.g., failed migration), we might detect + an anonymous THP as "partially mapped" and count it here, even though it + is not actually partially mapped anymore. + As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. There are some counters in ``/proc/vmstat`` to help diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a1bdbe3e21ea..c18ddbf38d20 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -126,6 +126,7 @@ enum mthp_stat_item { MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, MTHP_STAT_NR_ANON, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bcfcb4dee7ac..d6a8639fc6c3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -666,6 +666,7 @@ DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); +DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, @@ -678,6 +679,7 @@ static struct attribute *anon_stats_attrs[] = { #endif &split_deferred_attr.attr, &nr_anon_attr.attr, + &nr_anon_partially_mapped_attr.attr, NULL, }; @@ -3309,6 +3311,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); /* * Reinitialize page_deferred_list after removing the * page from the split_queue, otherwise a subsequent @@ -3386,6 +3389,7 @@ void folio_undo_large_rmappable(struct folio *folio) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); list_del_init(&folio->_deferred_list); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); @@ -3427,6 +3431,7 @@ void deferred_split_folio(struct folio *folio) if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); list_add_tail(&folio->_deferred_list, 
&ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG @@ -3474,6 +3479,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, list_move(&folio->_deferred_list, &list); } else { /* We lost race with folio_put() */ + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } -- Gitee From 226e724522c9aec5ecaf69de4c99dea88363553c Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 26 Oct 2024 21:24:23 +1300 Subject: [PATCH 320/484] mm: add per-order mTHP swpin counters commit aaf2914aec0fa67395574f6fa6b726168b049e60 upstream Conflicts: none Backport-reason: mTHP swapin This helps profile the sizes of folios being swapped in. Currently, only mTHP swap-out is being counted. The new interface can be found at: /sys/kernel/mm/transparent_hugepage/hugepages-/stats swpin For example, cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpin 12809 cat /sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpin 4763 [v-songbaohua@oppo.com: add a blank line in doc] Link: https://lkml.kernel.org/r/20241030233423.80759-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20241026082423.26298-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Chris Li Cc: Yosry Ahmed Cc: "Huang, Ying" Cc: Kairui Song Cc: Ryan Roberts Cc: Kanchana P Sridhar Cc: Usama Arif Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 4 ++++ include/linux/huge_mm.h | 1 + mm/huge_memory.c | 3 +++ mm/page_io.c | 3 +++ 4 files changed, 11 insertions(+) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 161e52b93757..f526be5b4647 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -494,6 +494,10 @@ zswpout is incremented every time a huge page is swapped out to zswap in one piece without splitting. +swpin + is incremented every time a huge page is swapped in from a non-zswap + swap device in one piece. + swpout is incremented every time a huge page is swapped out to a non-zswap swap device in one piece without splitting. 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c18ddbf38d20..47e46e8d3dec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,6 +117,7 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, + MTHP_STAT_SWPIN, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d6a8639fc6c3..07639a74af49 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -655,6 +655,7 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); +DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -674,6 +675,7 @@ static struct attribute *anon_stats_attrs[] = { &anon_fault_fallback_charge_attr.attr, #ifndef CONFIG_SHMEM &zswpout_attr.attr, + &swpin_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -705,6 +707,7 @@ static struct attribute_group file_stats_attr_grp = { static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM &zswpout_attr.attr, + &swpin_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif diff --git a/mm/page_io.c b/mm/page_io.c index 914b218f9d47..c95d570f26d9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -505,6 +505,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); folio_mark_uptodate(folio); folio_unlock(folio); @@ -599,6 +600,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio, * attempt to access it in the page fault retry time check. */ get_task_struct(current); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); @@ -615,6 +617,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); -- Gitee From 06d3ea522b1489e2cda96f3db357e2764b2808e1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 2 Jul 2024 00:40:55 -0700 Subject: [PATCH 321/484] mm: fix crashes from deferred split racing folio migration commit be9581ea8c058d81154251cb0695987098996cad upstream Conflicts: none Backport-reason: mTHP fixes Even on 6.10-rc6, I've been seeing elusive "Bad page state"s (often on flags when freeing, yet the flags shown are not bad: PG_locked had been set and cleared??), and VM_BUG_ON_PAGE(page_ref_count(page) == 0)s from deferred_split_scan()'s folio_put(), and a variety of other BUG and WARN symptoms implying double free by deferred split and large folio migration. 
6.7 commit 9bcef5973e31 ("mm: memcg: fix split queue list crash when large folio migration") was right to fix the memcg-dependent locking broken in 85ce2c517ade ("memcontrol: only transfer the memcg data for migration"), but missed a subtlety of deferred_split_scan(): it moves folios to its own local list to work on them without split_queue_lock, during which time folio->_deferred_list is not empty, but even the "right" lock does nothing to secure the folio and the list it is on. Fortunately, deferred_split_scan() is careful to use folio_try_get(): so folio_migrate_mapping() can avoid the race by folio_undo_large_rmappable() while the old folio's reference count is temporarily frozen to 0 - adding such a freeze in the !mapping case too (originally, folio lock and unmapping and no swap cache left an anon folio unreachable, so no freezing was needed there: but the deferred split queue offers a way to reach it). Link: https://lkml.kernel.org/r/29c83d1a-11ca-b6c9-f92e-6ccb322af510@google.com Fixes: 9bcef5973e31 ("mm: memcg: fix split queue list crash when large folio migration") Signed-off-by: Hugh Dickins Reviewed-by: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memcontrol.c | 11 ----------- mm/migrate.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 73bc3ca42fa1..545df3c61379 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -9624,17 +9624,6 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) /* Transfer the charge and the css ref */ commit_charge(new, memcg); - /* - * If the old folio is a large folio and is in the split queue, it needs - * to be removed from the split queue now, in case getting an incorrect - * split queue in destroy_large_folio() after the memcg of the old folio - * is cleared. - * - * In addition, the old folio is about to be freed after migration, so - * removing from the split queue a bit earlier seems reasonable. - */ - if (folio_test_large(old) && folio_test_large_rmappable(old)) - folio_undo_large_rmappable(old); old->memcg_data = 0; } diff --git a/mm/migrate.c b/mm/migrate.c index 0376faa4de35..77147c390fa2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -412,6 +412,15 @@ int folio_migrate_mapping(struct address_space *mapping, if (folio_ref_count(folio) != expected_count) return -EAGAIN; + /* Take off deferred split queue while frozen and memcg set */ + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) { + if (!folio_ref_freeze(folio, expected_count)) + return -EAGAIN; + folio_undo_large_rmappable(folio); + folio_ref_unfreeze(folio, expected_count); + } + /* No turning back from here */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; @@ -432,6 +441,10 @@ int folio_migrate_mapping(struct address_space *mapping, return -EAGAIN; } + /* Take off deferred split queue while frozen and memcg set */ + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + /* * Now we know that no one else is looking at the folio: * no turning back from here. 
-- Gitee From 88ab1ba78a826c7eeb6f309c453bd8a38e204527 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 6 Jul 2024 14:29:00 -0700 Subject: [PATCH 322/484] mm: simplify folio_migrate_mapping() commit a5ea521250afdf3d70c72970660f44aebf56ea19 upstream Conflicts: none Backport-reason: mTHP updates Now that folio_undo_large_rmappable() is an inline function checking order and large_rmappable for itself (and __folio_undo_large_rmappable() is now declared even when CONFIG_TRANASPARENT_HUGEPAGE is off) there is no need for folio_migrate_mapping() to check large and large_rmappable first (in the mapping case when it has had to freeze anyway). Link: https://lkml.kernel.org/r/68feee73-050e-8e98-7a3a-abf78738d92c@google.com Signed-off-by: Hugh Dickins Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/migrate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 77147c390fa2..fec29fcc4811 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -442,8 +442,7 @@ int folio_migrate_mapping(struct address_space *mapping, } /* Take off deferred split queue while frozen and memcg set */ - if (folio_test_large(folio) && folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); + folio_undo_large_rmappable(folio); /* * Now we know that no one else is looking at the folio: -- Gitee From 7051f6091b243aebcf83ce8ff61133c450c8bfc3 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 15 Jul 2024 10:04:23 +1000 Subject: [PATCH 323/484] mm/huge_memory: avoid PMD-size page cache if needed commit d659b715e94ac039803d7601505d3473393fc0be upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction xarray can't support arbitrary page cache size. the largest and supported page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71 ("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray"). However, it's possible to have 512MB page cache in the huge memory's collapsing path on ARM64 system whose base page size is 64KB. 512MB page cache is breaking the limitation and a warning is raised when the xarray entry is split as shown in the following example. [root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize KernelPageSize: 64 kB [root@dhcp-10-26-1-207 ~]# cat /tmp/test.c : int main(int argc, char **argv) { const char *filename = TEST_XFS_FILENAME; int fd = 0; void *buf = (void *)-1, *p; int pgsize = getpagesize(); int ret = 0; if (pgsize != 0x10000) { fprintf(stdout, "System with 64KB base page size is required!\n"); return -EPERM; } system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb"); system("echo 1 > /proc/sys/vm/drop_caches"); /* Open the xfs file */ fd = open(filename, O_RDONLY); assert(fd > 0); /* Create VMA */ buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0); assert(buf != (void *)-1); fprintf(stdout, "mapped buffer at 0x%p\n", buf); /* Populate VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE); assert(ret == 0); ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ); assert(ret == 0); /* Collapse VMA */ ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE); assert(ret == 0); ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE); if (ret) { fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno); goto out; } /* Split xarray entry. 
Write permission is needed */ munmap(buf, TEST_MEM_SIZE); buf = (void *)-1; close(fd); fd = open(filename, O_RDWR); assert(fd > 0); fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, TEST_MEM_SIZE - pgsize, pgsize); out: if (buf != (void *)-1) munmap(buf, TEST_MEM_SIZE); if (fd > 0) close(fd); return ret; } [root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test [root@dhcp-10-26-1-207 ~]# /tmp/test ------------[ cut here ]------------ WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \ nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \ nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \ ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse \ xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net \ sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : xas_split_alloc+0xf8/0x128 lr : split_huge_page_to_list_to_order+0x1c4/0x780 sp : ffff8000ac32f660 x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0 x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000 x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000 x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8 x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000 Call trace: xas_split_alloc+0xf8/0x128 split_huge_page_to_list_to_order+0x1c4/0x780 truncate_inode_partial_folio+0xdc/0x160 truncate_inode_pages_range+0x1b4/0x4a8 truncate_pagecache_range+0x84/0xa0 xfs_flush_unmap_range+0x70/0x90 [xfs] xfs_file_fallocate+0xfc/0x4d8 [xfs] vfs_fallocate+0x124/0x2f0 ksys_fallocate+0x4c/0xa0 __arm64_sys_fallocate+0x24/0x38 invoke_syscall.constprop.0+0x7c/0xd8 do_el0_svc+0xb4/0xd0 el0_svc+0x44/0x1d8 el0t_64_sync_handler+0x134/0x150 el0t_64_sync+0x17c/0x180 Fix it by correcting the supported page cache orders, different sets for DAX and other files. With it corrected, 512MB page cache becomes disallowed on all non-DAX files on ARM64 system where the base page size is 64KB. After this patch is applied, the test program fails with error -EINVAL returned from __thp_vma_allowable_orders() and the madvise() system call to collapse the page caches. 
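For reference, the 512MB figure follows directly from the 64KB base page size: with 8-byte page table entries a PMD covers 2^13 PTEs, so a PMD-sized folio is 2^13 * 64KB = 512MB. A rough userspace sketch of that arithmetic (illustrative only, not part of the kernel change):

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 16;              /* 64KB base pages */
	unsigned long pmd_order = page_shift - 3;   /* 8-byte PTEs: 2^13 entries per PMD */
	unsigned long pmd_bytes = (1UL << pmd_order) << page_shift;

	/* Prints "PMD order 13 -> PMD-sized folio 512 MB" */
	printf("PMD order %lu -> PMD-sized folio %lu MB\n",
	       pmd_order, pmd_bytes >> 20);
	return 0;
}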
Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Reviewed-by: Ryan Roberts Acked-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: Don Dutile Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Ryan Roberts Cc: William Kucharski Cc: [5.17+] Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 12 +++++++++--- mm/huge_memory.c | 12 ++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 47e46e8d3dec..34056b5165e8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -78,14 +78,20 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1))) /* - * Mask of all large folio orders supported for file THP. + * Mask of all large folio orders supported for file THP. Folios in a DAX + * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to + * it. */ -#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER)) +#define THP_ORDERS_ALL_FILE_DAX \ + (BIT(PMD_ORDER) | BIT(PUD_ORDER)) +#define THP_ORDERS_ALL_FILE_DEFAULT \ + ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) /* * Mask of all large folio orders supported for THP. */ -#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE) +#define THP_ORDERS_ALL \ + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ #define TVA_IN_PF (1 << 1) /* Page fault handler */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 07639a74af49..34f1e756bce5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, bool smaps = tva_flags & TVA_SMAPS; bool in_pf = tva_flags & TVA_IN_PF; bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + unsigned long supported_orders; + /* Check the intersection of requested and supported orders. */ - orders &= vma_is_anonymous(vma) ? - THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; + if (vma_is_anonymous(vma)) + supported_orders = THP_ORDERS_ALL_ANON; + else if (vma_is_dax(vma)) + supported_orders = THP_ORDERS_ALL_FILE_DAX; + else + supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; + + orders &= supported_orders; if (!orders) return 0; -- Gitee From 80e10b5e130a9a763ff705b589e68c28ede308d2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:06 -0400 Subject: [PATCH 324/484] mm: page_alloc: remove pcppage migratetype caching commit 17edeb5d3f761c20fd28f6002f5a9faa53c0a0d8 upstream Conflicts: minor, due to 7c43f196e114, no need to check isolated block now. Backport-reason: mTHP: optimize allocation and compaction Patch series "mm: page_alloc: freelist migratetype hygiene", v4. The page allocator's mobility grouping is intended to keep unmovable pages separate from reclaimable/compactable ones to allow on-demand defragmentation for higher-order allocations and huge pages. Currently, there are several places where accidental type mixing occurs: an allocation asks for a page of a certain migratetype and receives another. This ruins pageblocks for compaction, which in turn makes allocating huge pages more expensive and less reliable. The series addresses those causes. The last patch adds type checks on all freelist movements to prevent new violations being introduced. 
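Conceptually, such a check compares the migratetype recorded for the page's pageblock against the freelist the page is being moved to, and warns on a mismatch. A much simplified standalone sketch of that idea (not the actual kernel code, which operates on struct page and the pageblock bitmap):

#include <stdio.h>

enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE };

/* Stand-in for a page whose pageblock migratetype has been looked up. */
struct mock_page {
	int block_mt;
};

/* Warn when a page is about to land on a freelist of the wrong type. */
static void check_freelist_move(const struct mock_page *page, int list_mt)
{
	if (page->block_mt != list_mt)
		fprintf(stderr, "freelist type mismatch: block=%d list=%d\n",
			page->block_mt, list_mt);
}

int main(void)
{
	struct mock_page p = { .block_mt = MIGRATE_UNMOVABLE };

	check_freelist_move(&p, MIGRATE_UNMOVABLE);  /* silent: types agree */
	check_freelist_move(&p, MIGRATE_MOVABLE);    /* reports a mismatch */
	return 0;
}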
The benefits can be seen in a mixed workload that stresses the machine with a memcache-type workload and a kernel build job while periodically attempting to allocate batches of THP. The following data is aggregated over 50 consecutive defconfig builds: VANILLA PATCHED Hugealloc Time mean 165843.93 ( +0.00%) 113025.88 ( -31.85%) Hugealloc Time stddev 158957.35 ( +0.00%) 114716.07 ( -27.83%) Kbuild Real time 310.24 ( +0.00%) 300.73 ( -3.06%) Kbuild User time 1271.13 ( +0.00%) 1259.42 ( -0.92%) Kbuild System time 582.02 ( +0.00%) 559.79 ( -3.81%) THP fault alloc 30585.14 ( +0.00%) 40853.62 ( +33.57%) THP fault fallback 36626.46 ( +0.00%) 26357.62 ( -28.04%) THP fault fail rate % 54.49 ( +0.00%) 39.22 ( -27.53%) Pagealloc fallback 1328.00 ( +0.00%) 1.00 ( -99.85%) Pagealloc type mismatch 181009.50 ( +0.00%) 0.00 ( -100.00%) Direct compact stall 434.56 ( +0.00%) 257.66 ( -40.61%) Direct compact fail 421.70 ( +0.00%) 249.94 ( -40.63%) Direct compact success 12.86 ( +0.00%) 7.72 ( -37.09%) Direct compact success rate % 2.86 ( +0.00%) 2.82 ( -0.96%) Compact daemon scanned migrate 3370059.62 ( +0.00%) 3612054.76 ( +7.18%) Compact daemon scanned free 7718439.20 ( +0.00%) 5386385.02 ( -30.21%) Compact direct scanned migrate 309248.62 ( +0.00%) 176721.04 ( -42.85%) Compact direct scanned free 433582.84 ( +0.00%) 315727.66 ( -27.18%) Compact migrate scanned daemon % 91.20 ( +0.00%) 94.48 ( +3.56%) Compact free scanned daemon % 94.58 ( +0.00%) 94.42 ( -0.16%) Compact total migrate scanned 3679308.24 ( +0.00%) 3788775.80 ( +2.98%) Compact total free scanned 8152022.04 ( +0.00%) 5702112.68 ( -30.05%) Alloc stall 872.04 ( +0.00%) 5156.12 ( +490.71%) Pages kswapd scanned 510645.86 ( +0.00%) 3394.94 ( -99.33%) Pages kswapd reclaimed 134811.62 ( +0.00%) 2701.26 ( -98.00%) Pages direct scanned 99546.06 ( +0.00%) 376407.52 ( +278.12%) Pages direct reclaimed 62123.40 ( +0.00%) 289535.70 ( +366.06%) Pages total scanned 610191.92 ( +0.00%) 379802.46 ( -37.76%) Pages scanned kswapd % 76.36 ( +0.00%) 0.10 ( -98.58%) Swap out 12057.54 ( +0.00%) 15022.98 ( +24.59%) Swap in 209.16 ( +0.00%) 256.48 ( +22.52%) File refaults 17701.64 ( +0.00%) 11765.40 ( -33.53%) Huge page success rate is higher, allocation latencies are shorter and more predictable. Stealing (fallback) rate is drastically reduced. Notably, while the vanilla kernel keeps doing fallbacks on an ongoing basis, the patched kernel enters a steady state once the distribution of block types is adequate for the workload. Steals over 50 runs: VANILLA PATCHED 1504.0 227.0 1557.0 6.0 1391.0 13.0 1080.0 26.0 1057.0 40.0 1156.0 6.0 805.0 46.0 736.0 20.0 1747.0 2.0 1699.0 34.0 1269.0 13.0 1858.0 12.0 907.0 4.0 727.0 2.0 563.0 2.0 3094.0 2.0 10211.0 3.0 2621.0 1.0 5508.0 2.0 1060.0 2.0 538.0 3.0 5773.0 2.0 2199.0 0.0 3781.0 2.0 1387.0 1.0 4977.0 0.0 2865.0 1.0 1814.0 1.0 3739.0 1.0 6857.0 0.0 382.0 0.0 407.0 1.0 3784.0 0.0 297.0 0.0 298.0 0.0 6636.0 0.0 4188.0 0.0 242.0 0.0 9960.0 0.0 5816.0 0.0 354.0 0.0 287.0 0.0 261.0 0.0 140.0 1.0 2065.0 0.0 312.0 0.0 331.0 0.0 164.0 0.0 465.0 1.0 219.0 0.0 Type mismatches are down too. Those count every time an allocation request asks for one migratetype and gets another. 
This can still occur minimally in the patched kernel due to non-stealing fallbacks, but it's quite rare and follows the pattern of overall fallbacks - once the block type distribution settles, mismatches cease as well: VANILLA: PATCHED: 182602.0 268.0 135794.0 20.0 88619.0 19.0 95973.0 0.0 129590.0 0.0 129298.0 0.0 147134.0 0.0 230854.0 0.0 239709.0 0.0 137670.0 0.0 132430.0 0.0 65712.0 0.0 57901.0 0.0 67506.0 0.0 63565.0 4.0 34806.0 0.0 42962.0 0.0 32406.0 0.0 38668.0 0.0 61356.0 0.0 57800.0 0.0 41435.0 0.0 83456.0 0.0 65048.0 0.0 28955.0 0.0 47597.0 0.0 75117.0 0.0 55564.0 0.0 38280.0 0.0 52404.0 0.0 26264.0 0.0 37538.0 0.0 19671.0 0.0 30936.0 0.0 26933.0 0.0 16962.0 0.0 44554.0 0.0 46352.0 0.0 24995.0 0.0 35152.0 0.0 12823.0 0.0 21583.0 0.0 18129.0 0.0 31693.0 0.0 28745.0 0.0 33308.0 0.0 31114.0 0.0 35034.0 0.0 12111.0 0.0 24885.0 0.0 Compaction work is markedly reduced despite much better THP rates. In the vanilla kernel, reclaim seems to have been driven primarily by watermark boosting that happens as a result of fallbacks. With those all but eliminated, watermarks average lower and kswapd does less work. The uptick in direct reclaim is because THP requests have to fend for themselves more often - which is intended policy right now. Aggregate reclaim activity is lowered significantly, though. This patch (of 10): The idea behind the cache is to save get_pageblock_migratetype() lookups during bulk freeing. A microbenchmark suggests this isn't helping, though. The pcp migratetype can get stale, which means that bulk freeing has an extra branch to check if the pageblock was isolated while on the pcp. While the variance overlaps, the cache write and the branch seem to make this a net negative. The following test allocates and frees batches of 10,000 pages (~3x the pcp high marks to trigger flushing): Before: 8,668.48 msec task-clock # 99.735 CPUs utilized ( +- 2.90% ) 19 context-switches # 4.341 /sec ( +- 3.24% ) 0 cpu-migrations # 0.000 /sec 17,440 page-faults # 3.984 K/sec ( +- 2.90% ) 41,758,692,473 cycles # 9.541 GHz ( +- 2.90% ) 126,201,294,231 instructions # 5.98 insn per cycle ( +- 2.90% ) 25,348,098,335 branches # 5.791 G/sec ( +- 2.90% ) 33,436,921 branch-misses # 0.26% of all branches ( +- 2.90% ) 0.0869148 +- 0.0000302 seconds time elapsed ( +- 0.03% ) After: 8,444.81 msec task-clock # 99.726 CPUs utilized ( +- 2.90% ) 22 context-switches # 5.160 /sec ( +- 3.23% ) 0 cpu-migrations # 0.000 /sec 17,443 page-faults # 4.091 K/sec ( +- 2.90% ) 40,616,738,355 cycles # 9.527 GHz ( +- 2.90% ) 126,383,351,792 instructions # 6.16 insn per cycle ( +- 2.90% ) 25,224,985,153 branches # 5.917 G/sec ( +- 2.90% ) 32,236,793 branch-misses # 0.25% of all branches ( +- 2.90% ) 0.0846799 +- 0.0000412 seconds time elapsed ( +- 0.05% ) A side effect is that this also ensures that pages whose pageblock gets stolen while on the pcplist end up on the right freelist and we don't perform potentially type-incompatible buddy merges (or skip merges when we shouldn't), which is likely beneficial to long-term fragmentation management, although the effects would be harder to measure. Settle for simpler and faster code as justification here. 
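In outline, the removed cache stashed the block's migratetype in the page (via page->index) when it went onto the pcp list and trusted that copy at flush time; anything that converts the pageblock in between leaves the cached value stale. A tiny standalone illustration of that pattern (simplified, not the kernel data structures):

#include <stdio.h>

/* Old scheme: remember the block type in the page when it joins the pcp. */
struct mock_page {
	int cached_mt;
};

static int pageblock_mt;	/* authoritative type of the page's block */

int main(void)
{
	struct mock_page page;

	pageblock_mt = 1;		/* say, MIGRATE_MOVABLE */
	page.cached_mt = pageblock_mt;	/* cached when added to the pcp list */

	pageblock_mt = 0;		/* block stolen/converted meanwhile */

	/* At flush time the cached copy no longer matches the block. */
	printf("cached=%d actual=%d -> page would go on the wrong freelist\n",
	       page.cached_mt, pageblock_mt);
	return 0;
}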
Link: https://lkml.kernel.org/r/20240320180429.678181-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20240320180429.678181-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Acked-by: Mel Gorman Tested-by: "Huang, Ying" Tested-by: Baolin Wang Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 67 +++++++++++-------------------------------------- 1 file changed, 14 insertions(+), 53 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e30af69350eb..dceaab7b5862 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -212,24 +212,6 @@ EXPORT_SYMBOL(node_states); gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -/* - * A cached value of the page's pageblock's migratetype, used when the page is - * put on a pcplist. Used to avoid the pageblock migratetype lookup when - * freeing from pcplists in most cases, at the cost of possibly becoming stale. - * Also the migratetype set in the page does not necessarily match the pcplist - * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any - * other index - this ensures that it will be put on the correct CMA freelist. - */ -static inline int get_pcppage_migratetype(struct page *page) -{ - return page->index; -} - -static inline void set_pcppage_migratetype(struct page *page, int migratetype) -{ - page->index = migratetype; -} - #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif @@ -1213,7 +1195,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, { unsigned long flags; unsigned int order; - bool isolated_pageblocks; struct page *page; int batch; @@ -1227,7 +1208,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, pindex = pindex - 1; spin_lock_irqsave(&zone->lock, flags); - isolated_pageblocks = has_isolate_pageblock(zone); batch = 0; while (count > 0) { @@ -1244,23 +1224,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, order = pindex_to_order(pindex); nr_pages = 1 << order; do { + unsigned long pfn; int mt; page = list_last_entry(list, struct page, pcp_list); - mt = get_pcppage_migratetype(page); + pfn = page_to_pfn(page); + mt = get_pfnblock_migratetype(page, pfn); /* must delete to avoid corrupting pcp list */ list_del(&page->pcp_list); count -= nr_pages; pcp->count -= nr_pages; - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); - - __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + __free_one_page(page, pfn, zone, order, mt, FPI_NONE); trace_mm_page_pcpu_drain(page, order, mt); if (++batch > READ_ONCE(pcp->batch)) { @@ -1269,7 +1245,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, continue; spin_unlock_irqrestore(&zone->lock, flags); spin_lock_irqsave(&zone->lock, flags); - isolated_pageblocks = has_isolate_pageblock(zone); } } while (count > 0 && !list_empty(list)); } @@ -1608,7 +1583,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, continue; del_page_from_free_list(page, zone, current_order); expand(zone, page, order, current_order, migratetype); - set_pcppage_migratetype(page, migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && migratetype < MIGRATE_PCPTYPES); @@ -2175,7 +2149,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int 
order, * pages are ordered properly. */ list_add_tail(&page->pcp_list, list); - if (is_migrate_cma(get_pcppage_migratetype(page))) + if (is_migrate_cma(get_pageblock_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -2429,19 +2403,6 @@ void drain_all_pages(struct zone *zone) __drain_all_pages(zone, false); } -static bool free_unref_page_prepare(struct page *page, unsigned long pfn, - unsigned int order) -{ - int migratetype; - - if (!free_pages_prepare(page, order)) - return false; - - migratetype = get_pfnblock_migratetype(page, pfn); - set_pcppage_migratetype(page, migratetype); - return true; -} - static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) { int min_nr_free, max_nr_free; @@ -2575,7 +2536,7 @@ void free_unref_page(struct page *page, unsigned int order) unsigned long pfn = page_to_pfn(page); int migratetype, pcpmigratetype; - if (!free_unref_page_prepare(page, pfn, order)) + if (!free_pages_prepare(page, order)) return; /* @@ -2585,7 +2546,7 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ - migratetype = pcpmigratetype = get_pcppage_migratetype(page); + migratetype = pcpmigratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); @@ -2624,14 +2585,14 @@ void free_unref_folios(struct folio_batch *folios) if (order > 0 && folio_test_large_rmappable(folio)) folio_undo_large_rmappable(folio); - if (!free_unref_page_prepare(&folio->page, pfn, order)) + if (!free_pages_prepare(&folio->page, order)) continue; /* * Free isolated folios and orders not handled on the PCP * directly to the allocator, see comment in free_unref_page. 
*/ - migratetype = get_pcppage_migratetype(&folio->page); + migratetype = get_pfnblock_migratetype(&folio->page, pfn); if (!pcp_allowed_order(order) || is_migrate_isolate(migratetype)) { free_one_page(folio_zone(folio), &folio->page, pfn, @@ -2648,10 +2609,11 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); + unsigned long pfn = folio_pfn(folio); unsigned int order = (unsigned long)folio->private; folio->private = NULL; - migratetype = get_pcppage_migratetype(&folio->page); + migratetype = get_pfnblock_migratetype(&folio->page, pfn); /* Different zone requires a different pcp lock */ if (zone != locked_zone) { @@ -2668,9 +2630,8 @@ void free_unref_folios(struct folio_batch *folios) pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, &folio->page, - folio_pfn(folio), order, - migratetype, FPI_NONE); + free_one_page(zone, &folio->page, pfn, + order, migratetype, FPI_NONE); locked_zone = NULL; continue; } @@ -2839,7 +2800,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, } } __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); + get_pageblock_migratetype(page)); spin_unlock_irqrestore(&zone->lock, flags); } while (check_new_pages(page, order)); -- Gitee From 88f1e5ef3e8468829eff64eb4027b6a69fb36da0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:07 -0400 Subject: [PATCH 325/484] mm: page_alloc: optimize free_unref_folios() commit 9cbe97bad5cd75b5b493734bd2695febb8e95281 upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction Move direct freeing of isolated pages to the lock-breaking block in the second loop. This saves an unnecessary migratetype reassessment. Minor comment and local variable scoping cleanups. Link: https://lkml.kernel.org/r/20240320180429.678181-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Vlastimil Babka Tested-by: "Huang, Ying" Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dceaab7b5862..074de4e0ddd2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2575,7 +2575,7 @@ void free_unref_folios(struct folio_batch *folios) unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int i, j, migratetype; + int i, j; /* Prepare folios for freeing */ for (i = 0, j = 0; i < folios->nr; i++) { @@ -2587,14 +2587,15 @@ void free_unref_folios(struct folio_batch *folios) folio_undo_large_rmappable(folio); if (!free_pages_prepare(&folio->page, order)) continue; - /* - * Free isolated folios and orders not handled on the PCP - * directly to the allocator, see comment in free_unref_page. + * Free orders not handled on the PCP directly to the + * allocator. 
*/ - migratetype = get_pfnblock_migratetype(&folio->page, pfn); - if (!pcp_allowed_order(order) || - is_migrate_isolate(migratetype)) { + if (!pcp_allowed_order(order)) { + int migratetype; + + migratetype = get_pfnblock_migratetype(&folio->page, + pfn); free_one_page(folio_zone(folio), &folio->page, pfn, order, migratetype, FPI_NONE); continue; @@ -2611,15 +2612,29 @@ void free_unref_folios(struct folio_batch *folios) struct zone *zone = folio_zone(folio); unsigned long pfn = folio_pfn(folio); unsigned int order = (unsigned long)folio->private; + int migratetype; folio->private = NULL; migratetype = get_pfnblock_migratetype(&folio->page, pfn); /* Different zone requires a different pcp lock */ - if (zone != locked_zone) { + if (zone != locked_zone || + is_migrate_isolate(migratetype)) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); + locked_zone = NULL; + pcp = NULL; + } + + /* + * Free isolated pages directly to the + * allocator, see comment in free_unref_page. + */ + if (is_migrate_isolate(migratetype)) { + free_one_page(zone, &folio->page, pfn, + order, migratetype, FPI_NONE); + continue; } /* @@ -2632,7 +2647,6 @@ void free_unref_folios(struct folio_batch *folios) pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, pfn, order, migratetype, FPI_NONE); - locked_zone = NULL; continue; } locked_zone = zone; -- Gitee From aa0f72679d6d0225d9c58d04e63d2a7e605da8fb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:08 -0400 Subject: [PATCH 326/484] mm: page_alloc: fix up block types when merging compatible blocks commit e6cf9e1c4cde8a53385423ecb8ca581097f42e02 upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction The buddy allocator coalesces compatible blocks during freeing, but it doesn't update the types of the subblocks to match. When an allocation later breaks the chunk down again, its pieces will be put on freelists of the wrong type. This encourages incompatible page mixing (ask for one type, get another), and thus long-term fragmentation. Update the subblocks when merging a larger chunk, such that a later expand() will maintain freelist type hygiene. Link: https://lkml.kernel.org/r/20240320180429.678181-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Acked-by: Mel Gorman Tested-by: "Huang, Ying" Tested-by: Baolin Wang Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 074de4e0ddd2..1045f9123cc8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -795,10 +795,17 @@ static inline void __free_one_page(struct page *page, */ int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); - if (migratetype != buddy_mt - && (!migratetype_is_mergeable(migratetype) || - !migratetype_is_mergeable(buddy_mt))) - goto done_merging; + if (migratetype != buddy_mt) { + if (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt)) + goto done_merging; + /* + * Match buddy type. This ensures that + * an expand() down the line puts the + * sub-blocks on the right freelists. 
+ */ + set_pageblock_migratetype(buddy, migratetype); + } } /* -- Gitee From 4875c4c65270269d3f6def96ae9005c8b89a7b80 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:09 -0400 Subject: [PATCH 327/484] mm: page_alloc: move free pages when converting block during isolation commit b54ccd3c6bacbc571f7e61797fb5ff9fe3861413 upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction When claiming a block during compaction isolation, move any remaining free pages to the correct freelists as well, instead of stranding them on the wrong list. Otherwise, this encourages incompatible page mixing down the line, and thus long-term fragmentation. Link: https://lkml.kernel.org/r/20240320180429.678181-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Acked-by: Mel Gorman Tested-by: "Huang, Ying" Tested-by: Baolin Wang Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1045f9123cc8..d3db5f5cba7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2734,9 +2734,12 @@ int __isolate_free_page(struct page *page, unsigned int order) * Only change normal pageblocks (i.e., they can merge * with others) */ - if (migratetype_is_mergeable(mt)) + if (migratetype_is_mergeable(mt)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); + move_freepages_block(zone, page, + MIGRATE_MOVABLE, NULL); + } } } -- Gitee From 6b38442aea56aa922a26a11206937be1e54316e0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:10 -0400 Subject: [PATCH 328/484] mm: page_alloc: fix move_freepages_block() range error commit 2dd482ba627de15d67f0c0ed445133c8ae9b201b upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction When a block is partially outside the zone of the cursor page, the function cuts the range to the pivot page instead of the zone start. This can leave large parts of the block behind, which encourages incompatible page mixing down the line (ask for one type, get another), and thus long-term fragmentation. This triggers reliably on the first block in the DMA zone, whose start_pfn is 1. The block is stolen, but everything before the pivot page (which was often hundreds of pages) is left on the old list. Link: https://lkml.kernel.org/r/20240320180429.678181-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Mel Gorman Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d3db5f5cba7d..b783ad99a641 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1677,9 +1677,15 @@ int move_freepages_block(struct zone *zone, struct page *page, start_pfn = pageblock_start_pfn(pfn); end_pfn = pageblock_end_pfn(pfn) - 1; - /* Do not cross zone boundaries */ + /* + * The caller only has the lock for @zone, don't touch ranges + * that straddle into other zones. While we could move part of + * the range that's inside the zone, this call is usually + * accompanied by other operations such as migratetype updates + * which also should be locked. 
+ */ if (!zone_spans_pfn(zone, start_pfn)) - start_pfn = pfn; + return 0; if (!zone_spans_pfn(zone, end_pfn)) return 0; -- Gitee From f9e2fbd2c3130a6747bea1fb1af26b3d5f8b8d68 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:11 -0400 Subject: [PATCH 329/484] mm: page_alloc: fix freelist movement during block conversion commit c0cd6f557b9090525d288806cccbc73440ac235a upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction Currently, page block type conversion during fallbacks, atomic reservations and isolation can strand various amounts of free pages on incorrect freelists. For example, fallback stealing moves free pages in the block to the new type's freelists, but then may not actually claim the block for that type if there aren't enough compatible pages already allocated. In all cases, free page moving might fail if the block straddles more than one zone, in which case no free pages are moved at all, but the block type is changed anyway. This is detrimental to type hygiene on the freelists. It encourages incompatible page mixing down the line (ask for one type, get another) and thus contributes to long-term fragmentation. Split the process into a proper transaction: check first if conversion will happen, then try to move the free pages, and only if that was successful convert the block to the new type. [baolin.wang@linux.alibaba.com: fix allocation failures with CONFIG_CMA] Link: https://lkml.kernel.org/r/a97697e0-45b0-4f71-b087-fdc7a1d43c0e@linux.alibaba.com Link: https://lkml.kernel.org/r/20240320180429.678181-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Baolin Wang Tested-by: "Huang, Ying" Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/page-isolation.h | 3 +- mm/page_alloc.c | 174 ++++++++++++++++++++------------- mm/page_isolation.c | 22 +++-- 3 files changed, 121 insertions(+), 78 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 4ac34392823a..8550b3c91480 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -34,8 +34,7 @@ static inline bool is_migrate_isolate(int migratetype) #define REPORT_FAILURE 0x2 void set_pageblock_migratetype(struct page *page, int migratetype); -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype, int *num_movable); +int move_freepages_block(struct zone *zone, struct page *page, int migratetype); int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype, int flags, gfp_t gfp_flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b783ad99a641..0f6cc39c4b8f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1628,9 +1628,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. 
If alignment is required, use move_freepages_block() */ -static int move_freepages(struct zone *zone, - unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int *num_movable) +static int move_freepages(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn, int migratetype) { struct page *page; unsigned long pfn; @@ -1640,14 +1639,6 @@ static int move_freepages(struct zone *zone, for (pfn = start_pfn; pfn <= end_pfn;) { page = pfn_to_page(pfn); if (!PageBuddy(page)) { - /* - * We assume that pages that could be isolated for - * migration are movable. But we don't actually try - * isolating, as that would be expensive. - */ - if (num_movable && - (PageLRU(page) || __PageMovable(page))) - (*num_movable)++; pfn++; continue; } @@ -1665,17 +1656,16 @@ static int move_freepages(struct zone *zone, return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype, int *num_movable) +static bool prep_move_freepages_block(struct zone *zone, struct page *page, + unsigned long *start_pfn, + unsigned long *end_pfn, + int *num_free, int *num_movable) { - unsigned long start_pfn, end_pfn, pfn; - - if (num_movable) - *num_movable = 0; + unsigned long pfn, start, end; pfn = page_to_pfn(page); - start_pfn = pageblock_start_pfn(pfn); - end_pfn = pageblock_end_pfn(pfn) - 1; + start = pageblock_start_pfn(pfn); + end = pageblock_end_pfn(pfn) - 1; /* * The caller only has the lock for @zone, don't touch ranges @@ -1684,13 +1674,50 @@ int move_freepages_block(struct zone *zone, struct page *page, * accompanied by other operations such as migratetype updates * which also should be locked. */ - if (!zone_spans_pfn(zone, start_pfn)) - return 0; - if (!zone_spans_pfn(zone, end_pfn)) - return 0; + if (!zone_spans_pfn(zone, start)) + return false; + if (!zone_spans_pfn(zone, end)) + return false; + + *start_pfn = start; + *end_pfn = end; - return move_freepages(zone, start_pfn, end_pfn, migratetype, - num_movable); + if (num_free) { + *num_free = 0; + *num_movable = 0; + for (pfn = start; pfn <= end;) { + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + int nr = 1 << buddy_order(page); + + *num_free += nr; + pfn += nr; + continue; + } + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (PageLRU(page) || __PageMovable(page)) + (*num_movable)++; + pfn++; + } + } + + return true; +} + +int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) +{ + unsigned long start_pfn, end_pfn; + + if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, + NULL, NULL)) + return -1; + + return move_freepages(zone, start_pfn, end_pfn, migratetype); } static void change_pageblock_range(struct page *pageblock_page, @@ -1775,33 +1802,37 @@ static inline bool boost_watermark(struct zone *zone) } /* - * This function implements actual steal behaviour. If order is large enough, - * we can steal whole pageblock. If not, we first move freepages in this - * pageblock to our migratetype and determine how many already-allocated pages - * are there in the pageblock with a compatible migratetype. If at least half - * of pages are free or compatible, we can change migratetype of the pageblock - * itself, so pages freed in the future will be put on the correct free list. + * This function implements actual steal behaviour. If order is large enough, we + * can claim the whole pageblock for the requested migratetype. 
If not, we check + * the pageblock for constituent pages; if at least half of the pages are free + * or compatible, we can still claim the whole block, so pages freed in the + * future will be put on the correct free list. Otherwise, we isolate exactly + * the order we need from the fallback block and leave its migratetype alone. */ -static void steal_suitable_fallback(struct zone *zone, struct page *page, - unsigned int alloc_flags, int start_type, bool whole_block) +static struct page * +steal_suitable_fallback(struct zone *zone, struct page *page, + int current_order, int order, int start_type, + unsigned int alloc_flags, bool whole_block) { - unsigned int current_order = buddy_order(page); int free_pages, movable_pages, alike_pages; - int old_block_type; + unsigned long start_pfn, end_pfn; + int block_type; - old_block_type = get_pageblock_migratetype(page); + block_type = get_pageblock_migratetype(page); /* * This can happen due to races and we want to prevent broken * highatomic accounting. */ - if (is_migrate_highatomic(old_block_type)) + if (is_migrate_highatomic(block_type)) goto single_page; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { + del_page_from_free_list(page, zone, current_order); change_pageblock_range(page, current_order, start_type); - goto single_page; + expand(zone, page, order, current_order, start_type); + return page; } /* @@ -1816,10 +1847,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, if (!whole_block) goto single_page; - free_pages = move_freepages_block(zone, page, start_type, - &movable_pages); /* moving whole block can fail due to zone boundary conditions */ - if (!free_pages) + if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, + &free_pages, &movable_pages)) goto single_page; /* @@ -1837,7 +1867,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * vice versa, be conservative since we can't distinguish the * exact migratetype of non-movable pages. */ - if (old_block_type == MIGRATE_MOVABLE) + if (block_type == MIGRATE_MOVABLE) alike_pages = pageblock_nr_pages - (free_pages + movable_pages); else @@ -1848,13 +1878,16 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * compatible migratability as our allocation, claim the whole block. 
*/ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) + page_group_by_mobility_disabled) { + move_freepages(zone, start_pfn, end_pfn, start_type); set_pageblock_migratetype(page, start_type); - - return; + return __rmqueue_smallest(zone, order, start_type); + } single_page: - move_to_free_list(page, zone, current_order, start_type); + del_page_from_free_list(page, zone, current_order); + expand(zone, page, order, current_order, block_type); + return page; } /* @@ -1918,9 +1951,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ if (migratetype_is_mergeable(mt)) { - zone->nr_reserved_highatomic += pageblock_nr_pages; - set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); + if (move_freepages_block(zone, page, MIGRATE_HIGHATOMIC) != -1) { + set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); + zone->nr_reserved_highatomic += pageblock_nr_pages; + } } out_unlock: @@ -1945,7 +1979,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, struct zone *zone; struct page *page; int order; - bool ret; + int ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { @@ -1994,10 +2028,14 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * of pageblocks that cannot be completely freed * may increase. */ + ret = move_freepages_block(zone, page, ac->migratetype); + /* + * Reserving this block already succeeded, so this should + * not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype, - NULL); - if (ret) { + if (ret > 0) { spin_unlock_irqrestore(&zone->lock, flags); return ret; } @@ -2018,7 +2056,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * deviation from the rest of this file, to make the for loop * condition simpler. 
*/ -static __always_inline bool +static __always_inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { @@ -2065,7 +2103,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, goto do_steal; } - return false; + return NULL; find_smallest: for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { @@ -2085,14 +2123,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, do_steal: page = get_page_from_free_area(area, fallback_mt); - steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, - can_steal); + /* take off list, maybe claim block, expand remainder */ + page = steal_suitable_fallback(zone, page, current_order, order, + start_migratetype, alloc_flags, can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return true; - + return page; } /* @@ -2119,15 +2157,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, return page; } } -retry: + page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (alloc_flags & ALLOC_CMA) page = __rmqueue_cma_fallback(zone, order); - if (!page && __rmqueue_fallback(zone, order, migratetype, - alloc_flags)) - goto retry; + if (!page) + page = __rmqueue_fallback(zone, order, migratetype, + alloc_flags); } return page; } @@ -2740,12 +2778,10 @@ int __isolate_free_page(struct page *page, unsigned int order) * Only change normal pageblocks (i.e., they can merge * with others) */ - if (migratetype_is_mergeable(mt)) { - set_pageblock_migratetype(page, - MIGRATE_MOVABLE); - move_freepages_block(zone, page, - MIGRATE_MOVABLE, NULL); - } + if (migratetype_is_mergeable(mt) && + move_freepages_block(zone, page, + MIGRATE_MOVABLE) != -1) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); } } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5fe736982c2d..616d96a1ea25 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -178,15 +178,18 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, migratetype, isol_flags); if (!unmovable) { - unsigned long nr_pages; + int nr_pages; int mt = get_pageblock_migratetype(page); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + /* Block spans zone boundaries? */ + if (nr_pages == -1) { + spin_unlock_irqrestore(&zone->lock, flags); + return -EBUSY; + } + __mod_zone_freepage_state(zone, -nr_pages, mt); set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, - NULL); - - __mod_zone_freepage_state(zone, -nr_pages, mt); spin_unlock_irqrestore(&zone->lock, flags); return 0; } @@ -206,7 +209,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ static void unset_migratetype_isolate(struct page *page, int migratetype) { struct zone *zone; - unsigned long flags, nr_pages; + unsigned long flags; bool isolated_page = false; unsigned int order; struct page *buddy; @@ -252,7 +255,12 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) * allocation. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype, NULL); + int nr_pages = move_freepages_block(zone, page, migratetype); + /* + * Isolating this block already succeeded, so this + * should not fail on zone boundaries. 
+ */ + WARN_ON_ONCE(nr_pages == -1); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); -- Gitee From 76b292a1959ebf4732cebf0289f75a209cf8ca9c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:12 -0400 Subject: [PATCH 330/484] mm: page_alloc: close migratetype race between freeing and stealing commit 55612e80e722ac554cc5e80df05555b4f8d40c37 upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction There are three freeing paths that read the page's migratetype optimistically before grabbing the zone lock. When this races with block stealing, those pages go on the wrong freelist. The paths in question are: - when freeing >costly orders that aren't THP - when freeing pages to the buddy upon pcp lock contention - when freeing pages that are isolated - when freeing pages initially during boot - when freeing the remainder in alloc_pages_exact() - when "accepting" unaccepted VM host memory before first use - when freeing pages during unpoisoning None of these are so hot that they would need this optimization at the cost of hampering defrag efforts. Especially when contrasted with the fact that the most common buddy freeing path - free_pcppages_bulk - is checking the migratetype under the zone->lock just fine. In addition, isolated pages need to look up the migratetype under the lock anyway, which adds branches to the locked section, and results in a double lookup when the pages are in fact isolated. Move the lookups into the lock. Link: https://lkml.kernel.org/r/20240320180429.678181-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Mel Gorman Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 52 ++++++++++++++++++------------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f6cc39c4b8f..a4f1186e2f6a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1259,18 +1259,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock_irqrestore(&zone->lock, flags); } -static void free_one_page(struct zone *zone, - struct page *page, unsigned long pfn, - unsigned int order, - int migratetype, fpi_t fpi_flags) +static void free_one_page(struct zone *zone, struct page *page, + unsigned long pfn, unsigned int order, + fpi_t fpi_flags) { unsigned long flags; + int migratetype; spin_lock_irqsave(&zone->lock, flags); - if (unlikely(has_isolate_pageblock(zone) || - is_migrate_isolate(migratetype))) { - migratetype = get_pfnblock_migratetype(page, pfn); - } + migratetype = get_pfnblock_migratetype(page, pfn); __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); } @@ -1278,21 +1275,13 @@ static void free_one_page(struct zone *zone, static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags) { - int migratetype; unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); if (!free_pages_prepare(page, order)) return; - /* - * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here - * is used to avoid calling get_pfnblock_migratetype() under the lock. - * This will reduce the lock holding time. 
- */ - migratetype = get_pfnblock_migratetype(page, pfn); - - free_one_page(zone, page, pfn, order, migratetype, fpi_flags); + free_one_page(zone, page, pfn, order, fpi_flags); __count_vm_events(PGFREE, 1 << order); } @@ -2585,7 +2574,7 @@ void free_unref_page(struct page *page, unsigned int order) struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); - int migratetype, pcpmigratetype; + int migratetype; if (!free_pages_prepare(page, order)) return; @@ -2597,23 +2586,23 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ - migratetype = pcpmigratetype = get_pfnblock_migratetype(page, pfn); + migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, FPI_NONE); return; } - pcpmigratetype = MIGRATE_MOVABLE; + migratetype = MIGRATE_MOVABLE; } zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); + free_unref_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); } pcp_trylock_finish(UP_flags); } @@ -2643,12 +2632,8 @@ void free_unref_folios(struct folio_batch *folios) * allocator. */ if (!pcp_allowed_order(order)) { - int migratetype; - - migratetype = get_pfnblock_migratetype(&folio->page, - pfn); - free_one_page(folio_zone(folio), &folio->page, pfn, - order, migratetype, FPI_NONE); + free_one_page(folio_zone(folio), &folio->page, + pfn, order, FPI_NONE); continue; } folio->private = (void *)(unsigned long)order; @@ -2684,7 +2669,7 @@ void free_unref_folios(struct folio_batch *folios) */ if (is_migrate_isolate(migratetype)) { free_one_page(zone, &folio->page, pfn, - order, migratetype, FPI_NONE); + order, FPI_NONE); continue; } @@ -2697,7 +2682,7 @@ void free_unref_folios(struct folio_batch *folios) if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, pfn, - order, migratetype, FPI_NONE); + order, FPI_NONE); continue; } locked_zone = zone; @@ -7012,13 +6997,14 @@ bool take_page_off_buddy(struct page *page) bool put_page_back_buddy(struct page *page) { struct zone *zone = page_zone(page); - unsigned long pfn = page_to_pfn(page); unsigned long flags; - int migratetype = get_pfnblock_migratetype(page, pfn); bool ret = false; spin_lock_irqsave(&zone->lock, flags); if (put_page_testzero(page)) { + unsigned long pfn = page_to_pfn(page); + int migratetype = get_pfnblock_migratetype(page, pfn); + ClearPageHWPoisonTakenOff(page); __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); if (TestClearPageHWPoison(page)) { -- Gitee From 75310c0c4501a45cf0e44e8cbf2363ae6d042e98 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 20 Mar 2024 14:02:13 -0400 Subject: [PATCH 331/484] mm: page_alloc: set migratetype inside move_freepages() commit f37c0f6876a8eabe1477c87860460bc181f6cdbb upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction This avoids changing migratetype after move_freepages() or move_freepages_block(), which is error prone. 
It also prepares for upcoming changes to fix move_freepages() not moving free pages partially in the range. Link: https://lkml.kernel.org/r/20240320180429.678181-9-hannes@cmpxchg.org Signed-off-by: Zi Yan Signed-off-by: Johannes Weiner Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 27 +++++++++++++-------------- mm/page_isolation.c | 7 +++---- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a4f1186e2f6a..cfce2cb5c682 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1613,9 +1613,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, #endif /* - * Move the free pages in a range to the freelist tail of the requested type. - * Note that start_page and end_pages are not aligned on a pageblock - * boundary. If alignment is required, use move_freepages_block() + * Change the type of a block and move all its free pages to that + * type's freelist. */ static int move_freepages(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn, int migratetype) @@ -1625,6 +1624,9 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, unsigned int order; int pages_moved = 0; + VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); + VM_WARN_ON(start_pfn + pageblock_nr_pages - 1 != end_pfn); + for (pfn = start_pfn; pfn <= end_pfn;) { page = pfn_to_page(pfn); if (!PageBuddy(page)) { @@ -1642,6 +1644,8 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, pages_moved += 1 << order; } + set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype); + return pages_moved; } @@ -1869,7 +1873,6 @@ steal_suitable_fallback(struct zone *zone, struct page *page, if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) { move_freepages(zone, start_pfn, end_pfn, start_type); - set_pageblock_migratetype(page, start_type); return __rmqueue_smallest(zone, order, start_type); } @@ -1939,12 +1942,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) /* Yoink! */ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (migratetype_is_mergeable(mt)) { - if (move_freepages_block(zone, page, MIGRATE_HIGHATOMIC) != -1) { - set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); + if (migratetype_is_mergeable(mt)) + if (move_freepages_block(zone, page, + MIGRATE_HIGHATOMIC) != -1) zone->nr_reserved_highatomic += pageblock_nr_pages; - } - } out_unlock: spin_unlock_irqrestore(&zone->lock, flags); @@ -2023,7 +2024,6 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * not fail on zone boundaries. 
*/ WARN_ON_ONCE(ret == -1); - set_pageblock_migratetype(page, ac->migratetype); if (ret > 0) { spin_unlock_irqrestore(&zone->lock, flags); return ret; @@ -2763,10 +2763,9 @@ int __isolate_free_page(struct page *page, unsigned int order) * Only change normal pageblocks (i.e., they can merge * with others) */ - if (migratetype_is_mergeable(mt) && - move_freepages_block(zone, page, - MIGRATE_MOVABLE) != -1) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + if (migratetype_is_mergeable(mt)) + move_freepages_block(zone, page, + MIGRATE_MOVABLE); } } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 616d96a1ea25..1b7d1b466d0f 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -188,7 +188,6 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ return -EBUSY; } __mod_zone_freepage_state(zone, -nr_pages, mt); - set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; spin_unlock_irqrestore(&zone->lock, flags); return 0; @@ -262,10 +261,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) */ WARN_ON_ONCE(nr_pages == -1); __mod_zone_freepage_state(zone, nr_pages, migratetype); - } - set_pageblock_migratetype(page, migratetype); - if (isolated_page) + } else { + set_pageblock_migratetype(page, migratetype); __putback_isolated_page(page, order, migratetype); + } zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); -- Gitee From bda35426a67c03a4f435f3a2d83f225db335ea54 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:14 -0400 Subject: [PATCH 332/484] mm: page_isolation: prepare for hygienic freelists commit fd919a85cd55be5d00a6a7372071f44c8eafb825 upstream Conflicts: minor, due to missing c8b360031218 Backport-reason: mTHP: optimize allocation and compaction Page isolation currently sets MIGRATE_ISOLATE on a block, then drops zone->lock and scans the block for straddling buddies to split up. Because this happens non-atomically wrt the page allocator, it's possible for allocations to get a buddy whose first block is a regular pcp migratetype but whose tail is isolated. This means that in certain cases memory can still be allocated after isolation. It will also trigger the freelist type hygiene warnings in subsequent patches. start_isolate_page_range() isolate_single_pageblock() set_migratetype_isolate(tail) lock zone->lock move_freepages_block(tail) // nop set_pageblock_migratetype(tail) unlock zone->lock __rmqueue_smallest() del_page_from_freelist(head) expand(head, head_mt) WARN(head_mt != tail_mt) start_pfn = ALIGN_DOWN(MAX_ORDER_NR_PAGES) for (pfn = start_pfn, pfn < end_pfn) if (PageBuddy()) split_free_page(head) Introduce a variant of move_freepages_block() provided by the allocator specifically for page isolation; it moves free pages, converts the block, and handles the splitting of straddling buddies while holding zone->lock. The allocator knows that pageblocks and buddies are always naturally aligned, which means that buddies can only straddle blocks if they're actually >pageblock_order. This means the search-and-split part can be simplified compared to what page isolation used to do. Also tighten up the page isolation code around the expectations of which pages can be large, and how they are freed. Based on extensive discussions with and invaluable input from Zi Yan. 
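As a concrete illustration of the straddling case the new helper handles (numbers assume pageblock_order = 9 and a naturally aligned order-10 free buddy, as on x86-64 defaults; a sketch only, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long pageblock_order = 9;	/* 512 pages per pageblock */
	unsigned long buddy_order = 10;		/* free buddy larger than a pageblock */
	unsigned long buddy_pfn = 0x1000;	/* naturally aligned to its order */
	unsigned long block_pages = 1UL << pageblock_order;
	unsigned long end_pfn = buddy_pfn + (1UL << buddy_order);
	unsigned long pfn;

	/* One free buddy covers two pageblocks ... */
	for (pfn = buddy_pfn; pfn < end_pfn; pfn += block_pages)
		printf("pageblock %#lx - %#lx\n", pfn, pfn + block_pages - 1);

	/*
	 * ... so isolating only one of them means the buddy has to be split
	 * into pageblock-order pieces first, under zone->lock, which is what
	 * move_freepages_block_isolate() does for both the head and tail case.
	 */
	return 0;
}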
[hannes@cmpxchg.org: work around older gcc warning] Link: https://lkml.kernel.org/r/20240321142426.GB777580@cmpxchg.org Link: https://lkml.kernel.org/r/20240320180429.678181-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Mel Gorman Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/page-isolation.h | 4 +- mm/internal.h | 4 - mm/page_alloc.c | 204 +++++++++++++++++++-------------- mm/page_isolation.c | 106 ++++++----------- 4 files changed, 155 insertions(+), 163 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 8550b3c91480..c16db0067090 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -34,7 +34,9 @@ static inline bool is_migrate_isolate(int migratetype) #define REPORT_FAILURE 0x2 void set_pageblock_migratetype(struct page *page, int migratetype); -int move_freepages_block(struct zone *zone, struct page *page, int migratetype); + +bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype); int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype, int flags, gfp_t gfp_flags); diff --git a/mm/internal.h b/mm/internal.h index 93c78e0e66bf..743fc91ffdc3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -715,10 +715,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); - -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset); - #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfce2cb5c682..c188b4f11fb3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -842,64 +842,6 @@ static inline void __free_one_page(struct page *page, page_reporting_notify_free(order); } -/** - * split_free_page() -- split a free page at split_pfn_offset - * @free_page: the original free page - * @order: the order of the page - * @split_pfn_offset: split offset within the page - * - * Return -ENOENT if the free page is changed, otherwise 0 - * - * It is used when the free page crosses two pageblocks with different migratetypes - * at split_pfn_offset within the page. The split free page will be put into - * separate migratetype lists afterwards. Otherwise, the function achieves - * nothing. - */ -int split_free_page(struct page *free_page, - unsigned int order, unsigned long split_pfn_offset) -{ - struct zone *zone = page_zone(free_page); - unsigned long free_page_pfn = page_to_pfn(free_page); - unsigned long pfn; - unsigned long flags; - int free_page_order; - int mt; - int ret = 0; - - if (split_pfn_offset == 0) - return ret; - - spin_lock_irqsave(&zone->lock, flags); - - if (!PageBuddy(free_page) || buddy_order(free_page) != order) { - ret = -ENOENT; - goto out; - } - - mt = get_pfnblock_migratetype(free_page, free_page_pfn); - if (likely(!is_migrate_isolate(mt))) - __mod_zone_freepage_state(zone, -(1UL << order), mt); - - del_page_from_free_list(free_page, zone, order); - for (pfn = free_page_pfn; - pfn < free_page_pfn + (1UL << order);) { - int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); - - free_page_order = min_t(unsigned int, - pfn ? 
__ffs(pfn) : order, - __fls(split_pfn_offset)); - __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, - mt, FPI_NONE); - pfn += 1UL << free_page_order; - split_pfn_offset -= (1UL << free_page_order); - /* we have done the first part, now switch to second part */ - if (split_pfn_offset == 0) - split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); - } -out: - spin_unlock_irqrestore(&zone->lock, flags); - return ret; -} /* * A bad page could be due to a number of fields. Instead of multiple branches, * try and check multiple fields with one check. The caller must do a detailed @@ -1701,8 +1643,8 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, return true; } -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) +static int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) { unsigned long start_pfn, end_pfn; @@ -1713,6 +1655,123 @@ int move_freepages_block(struct zone *zone, struct page *page, return move_freepages(zone, start_pfn, end_pfn, migratetype); } +#ifdef CONFIG_MEMORY_ISOLATION +/* Look for a buddy that straddles start_pfn */ +static unsigned long find_large_buddy(unsigned long start_pfn) +{ + int order = 0; + struct page *page; + unsigned long pfn = start_pfn; + + while (!PageBuddy(page = pfn_to_page(pfn))) { + /* Nothing found */ + if (++order > MAX_PAGE_ORDER) + return start_pfn; + pfn &= ~0UL << order; + } + + /* + * Found a preceding buddy, but does it straddle? + */ + if (pfn + (1 << buddy_order(page)) > start_pfn) + return pfn; + + /* Nothing found */ + return start_pfn; +} + +/* Split a multi-block free page into its individual pageblocks */ +static void split_large_buddy(struct zone *zone, struct page *page, + unsigned long pfn, int order) +{ + unsigned long end_pfn = pfn + (1 << order); + + VM_WARN_ON_ONCE(order <= pageblock_order); + VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); + + /* Caller removed page from freelist, buddy info cleared! */ + VM_WARN_ON_ONCE(PageBuddy(page)); + + while (pfn != end_pfn) { + int mt = get_pfnblock_migratetype(page, pfn); + + __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); + pfn += pageblock_nr_pages; + page = pfn_to_page(pfn); + } +} + +/** + * move_freepages_block_isolate - move free pages in block for page isolation + * @zone: the zone + * @page: the pageblock page + * @migratetype: migratetype to set on the pageblock + * + * This is similar to move_freepages_block(), but handles the special + * case encountered in page isolation, where the block of interest + * might be part of a larger buddy spanning multiple pageblocks. + * + * Unlike the regular page allocator path, which moves pages while + * stealing buddies off the freelist, page isolation is interested in + * arbitrary pfn ranges that may have overlapping buddies on both ends. + * + * This function handles that. Straddling buddies are split into + * individual pageblocks. Only the block of interest is moved. + * + * Returns %true if pages could be moved, %false otherwise. 
+ */ +bool move_freepages_block_isolate(struct zone *zone, struct page *page, + int migratetype) +{ + unsigned long start_pfn, end_pfn, pfn; + int nr_moved, mt; + + if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, + NULL, NULL)) + return false; + + /* No splits needed if buddies can't span multiple blocks */ + if (pageblock_order == MAX_PAGE_ORDER) + goto move; + + /* We're a tail block in a larger buddy */ + pfn = find_large_buddy(start_pfn); + if (pfn != start_pfn) { + struct page *buddy = pfn_to_page(pfn); + int order = buddy_order(buddy); + int mt = get_pfnblock_migratetype(buddy, pfn); + + if (!is_migrate_isolate(mt)) + __mod_zone_freepage_state(zone, -(1UL << order), mt); + del_page_from_free_list(buddy, zone, order); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, buddy, pfn, order); + return true; + } + + /* We're the starting block of a larger buddy */ + if (PageBuddy(page) && buddy_order(page) > pageblock_order) { + int mt = get_pfnblock_migratetype(page, pfn); + int order = buddy_order(page); + + if (!is_migrate_isolate(mt)) + __mod_zone_freepage_state(zone, -(1UL << order), mt); + del_page_from_free_list(page, zone, order); + set_pageblock_migratetype(page, migratetype); + split_large_buddy(zone, page, pfn, order); + return true; + } +move: + mt = get_pfnblock_migratetype(page, start_pfn); + nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype); + if (!is_migrate_isolate(mt)) + __mod_zone_freepage_state(zone, -nr_moved, mt); + else if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, nr_moved, migratetype); + return true; +} +#endif /* CONFIG_MEMORY_ISOLATION */ + static void change_pageblock_range(struct page *pageblock_page, int start_order, int migratetype) { @@ -6569,7 +6628,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; - int order; int ret = 0; struct compact_control cc = { @@ -6652,29 +6710,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * We don't have to hold zone->lock here because the pages are * isolated thus they won't get removed from buddy. */ - - order = 0; - outer_start = start; - while (!PageBuddy(pfn_to_page(outer_start))) { - if (++order > MAX_PAGE_ORDER) { - outer_start = start; - break; - } - outer_start &= ~0UL << order; - } - - if (outer_start != start) { - order = buddy_order(pfn_to_page(outer_start)); - - /* - * outer_start page could be small order buddy page and - * it doesn't include start page. Adjust outer_start - * in this case to report failed page properly - * on tracepoint in test_pages_isolated() - */ - if (outer_start + (1UL << order) <= start) - outer_start = start; - } + outer_start = find_large_buddy(start); /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, 0)) { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 1b7d1b466d0f..8781bd3b8fbc 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -178,16 +178,10 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, migratetype, isol_flags); if (!unmovable) { - int nr_pages; - int mt = get_pageblock_migratetype(page); - - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); - /* Block spans zone boundaries? 
*/ - if (nr_pages == -1) { + if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) { spin_unlock_irqrestore(&zone->lock, flags); return -EBUSY; } - __mod_zone_freepage_state(zone, -nr_pages, mt); zone->nr_isolate_pageblock++; spin_unlock_irqrestore(&zone->lock, flags); return 0; @@ -254,13 +248,11 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) * allocation. */ if (!isolated_page) { - int nr_pages = move_freepages_block(zone, page, migratetype); /* * Isolating this block already succeeded, so this * should not fail on zone boundaries. */ - WARN_ON_ONCE(nr_pages == -1); - __mod_zone_freepage_state(zone, nr_pages, migratetype); + WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype)); } else { set_pageblock_migratetype(page, migratetype); __putback_isolated_page(page, order, migratetype); @@ -374,26 +366,29 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, VM_BUG_ON(!page); pfn = page_to_pfn(page); - /* - * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any - * free pages in [start_pfn, boundary_pfn), its head page will - * always be in the range. - */ + if (PageBuddy(page)) { int order = buddy_order(page); - if (pfn + (1UL << order) > boundary_pfn) { - /* free page changed before split, check it again */ - if (split_free_page(page, order, boundary_pfn - pfn)) - continue; - } + /* move_freepages_block_isolate() handled this */ + VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn); pfn += 1UL << order; continue; } + /* - * migrate compound pages then let the free page handling code - * above do the rest. If migration is not possible, just fail. + * If a compound page is straddling our block, attempt + * to migrate it out of the way. + * + * We don't have to worry about this creating a large + * free page that straddles into our block: gigantic + * pages are freed as order-0 chunks, and LRU pages + * (currently) do not exceed pageblock_order. + * + * The block of interest has already been marked + * MIGRATE_ISOLATE above, so when migration is done it + * will free its pages onto the correct freelists. */ if (PageCompound(page)) { struct page *head = compound_head(page); @@ -404,16 +399,10 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, pfn = head_pfn + nr_pages; continue; } + #if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* - * hugetlb, lru compound (THP), and movable compound pages - * can be migrated. Otherwise, fail the isolation. - */ - if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) { - int order; - unsigned long outer_pfn; + if (PageHuge(page)) { int page_mt = get_pageblock_migratetype(page); - bool isolate_page = !is_migrate_isolate_page(page); struct compact_control cc = { .nr_migratepages = 0, .order = -1, @@ -426,56 +415,25 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, }; INIT_LIST_HEAD(&cc.migratepages); - /* - * XXX: mark the page as MIGRATE_ISOLATE so that - * no one else can grab the freed page after migration. - * Ideally, the page should be freed as two separate - * pages to be added into separate migratetype free - * lists. - */ - if (isolate_page) { - ret = set_migratetype_isolate(page, page_mt, - flags, head_pfn, head_pfn + nr_pages); - if (ret) - goto failed; - } - ret = __alloc_contig_migrate_range(&cc, head_pfn, head_pfn + nr_pages); - - /* - * restore the page's migratetype so that it can - * be split into separate migratetype free lists - * later. 
- */ - if (isolate_page) - unset_migratetype_isolate(page, page_mt); - if (ret) goto failed; - /* - * reset pfn to the head of the free page, so - * that the free page handling code above can split - * the free page to the right migratetype list. - * - * head_pfn is not used here as a hugetlb page order - * can be bigger than MAX_PAGE_ORDER, but after it is - * freed, the free page order is not. Use pfn within - * the range to find the head of the free page. - */ - order = 0; - outer_pfn = pfn; - while (!PageBuddy(pfn_to_page(outer_pfn))) { - /* stop if we cannot find the free page */ - if (++order > MAX_PAGE_ORDER) - goto failed; - outer_pfn &= ~0UL << order; - } - pfn = outer_pfn; + pfn = head_pfn + nr_pages; continue; - } else + } + + /* + * These pages are movable too, but they're + * not expected to exceed pageblock_order. + * + * Let us know when they do, so we can add + * proper free and split handling for them. + */ + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); #endif - goto failed; + goto failed; } pfn++; -- Gitee From bd402c77445a9fef78ea95545efc0d870cce7d1b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Mar 2024 14:02:15 -0400 Subject: [PATCH 333/484] mm: page_alloc: consolidate free page accounting commit e0932b6c1f942fa747258e152cdce0d0b2b5be5c upstream Conflicts: none Backport-reason: mTHP: optimize allocation and compaction Free page accounting currently happens a bit too high up the call stack, where it has to deal with guard pages, compaction capturing, block stealing and even page isolation. This is subtle and fragile, and makes it difficult to hack on the code. Now that type violations on the freelists have been fixed, push the accounting down to where pages enter and leave the freelist. 
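The core of the change can be read as a small pattern, condensed here from the diff below purely for orientation (simplified, not a drop-in replacement): every freelist entry/exit helper pairs the list manipulation with one accounting call, so callers never touch NR_FREE_PAGES directly.

    static inline void account_freepages(struct zone *zone, int nr_pages,
                                         int migratetype)
    {
            /* isolated pageblocks are not counted as free */
            if (is_migrate_isolate(migratetype))
                    return;

            __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
            if (is_migrate_cma(migratetype))
                    __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
    }

    /* every path that takes a page off a freelist goes through a helper
     * that also does the accounting; the add side is symmetrical */
    static inline void del_page_from_free_list(struct page *page,
                                               struct zone *zone,
                                               unsigned int order,
                                               int migratetype)
    {
            __del_page_from_free_list(page, zone, order, migratetype);
            account_freepages(zone, -(1 << order), migratetype);
    }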
[hannes@cmpxchg.org: undo unrelated drive-by line wrap] Link: https://lkml.kernel.org/r/20240327185736.GA7597@cmpxchg.org [hannes@cmpxchg.org: remove unused page parameter from account_freepages()] Link: https://lkml.kernel.org/r/20240327185831.GB7597@cmpxchg.org [baolin.wang@linux.alibaba.com: fix free page accounting] Link: https://lkml.kernel.org/r/a2a48baca69f103aa431fd201f8a06e3b95e203d.1712648441.git.baolin.wang@linux.alibaba.com [andriy.shevchenko@linux.intel.com: avoid defining unused function] Link: https://lkml.kernel.org/r/20240423161506.2637177-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20240320180429.678181-11-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andy Shevchenko Signed-off-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Mel Gorman Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mm.h | 18 ++-- include/linux/vmstat.h | 8 -- mm/debug_page_alloc.c | 12 +-- mm/internal.h | 5 -- mm/page_alloc.c | 192 +++++++++++++++++++++++------------------ 5 files changed, 118 insertions(+), 117 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 98f7d0f2aa32..0682b1766e4e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3772,24 +3772,22 @@ static inline bool page_is_guard(struct page *page) return PageGuard(page); } -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype); +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return false; - return __set_page_guard(zone, page, order, migratetype); + return __set_page_guard(zone, page, order); } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype); +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return; - __clear_page_guard(zone, page, order, migratetype); + __clear_page_guard(zone, page, order); } #else /* CONFIG_DEBUG_PAGEALLOC */ @@ -3799,9 +3797,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) { return false; } + unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} + unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 777b5d53b84a..11d1b4a6a12e 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -489,14 +489,6 @@ static inline void node_stat_sub_folio(struct folio *folio, mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } -static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, - int migratetype) -{ - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); - if (is_migrate_cma(migratetype)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); 
-} - extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index 6755f0c9d4a3..d46acf989dde 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) } early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (order >= debug_guardpage_minorder()) return false; @@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, __SetPageGuard(page); INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype) +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { __ClearPageGuard(page); - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); } diff --git a/mm/internal.h b/mm/internal.h index 743fc91ffdc3..473065101aa9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1191,11 +1191,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype) return migratetype == MIGRATE_HIGHATOMIC; } -static inline bool is_migrate_highatomic_page(struct page *page) -{ - return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c188b4f11fb3..e4ed29d68943 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -652,23 +652,33 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ -/* Used for pages not on another list */ -static inline void add_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void account_freepages(struct zone *zone, int nr_pages, + int migratetype) { - struct free_area *area = &zone->free_area[order]; + if (is_migrate_isolate(migratetype)) + return; - list_add(&page->buddy_list, &area->free_list[migratetype]); - area->nr_free++; + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); + + if (is_migrate_cma(migratetype)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); } /* Used for pages not on another list */ -static inline void add_to_free_list_tail(struct page *page, struct zone *zone, - unsigned int order, int migratetype) +static inline void __add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) { struct free_area *area = &zone->free_area[order]; - list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + + if (tail) + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + else + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -678,16 +688,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, * allocation 
again (e.g., optimization for memory onlining). */ static inline void move_to_free_list(struct page *page, struct zone *zone, - unsigned int order, int migratetype) + unsigned int order, int old_mt, int new_mt) { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->buddy_list, &area->free_list[migratetype]); + /* Free page moving can fail, so it happens before the type update */ + VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), old_mt, 1 << order); + + list_move_tail(&page->buddy_list, &area->free_list[new_mt]); + + account_freepages(zone, -(1 << order), old_mt); + account_freepages(zone, 1 << order, new_mt); } -static inline void del_page_from_free_list(struct page *page, struct zone *zone, - unsigned int order) +static inline void __del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) { + VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype, + "page type is %lu, passed migratetype is %d (nr=%d)\n", + get_pageblock_migratetype(page), migratetype, 1 << order); + /* clear reported state and update reported page count */ if (page_reported(page)) __ClearPageReported(page); @@ -698,6 +720,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, zone->free_area[order].nr_free--; } +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + __del_page_from_free_list(page, zone, order, migratetype); + account_freepages(zone, -(1 << order), migratetype); +} + static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { @@ -769,16 +798,16 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (likely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); - VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); + account_freepages(zone, 1 << order, migratetype); + while (order < MAX_PAGE_ORDER) { + int buddy_mt = migratetype; + if (compaction_capture(capc, page, order, migratetype)) { - __mod_zone_freepage_state(zone, -(1 << order), - migratetype); + account_freepages(zone, -(1 << order), migratetype); return; } @@ -793,19 +822,12 @@ static inline void __free_one_page(struct page *page, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting. */ - int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); + buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); - if (migratetype != buddy_mt) { - if (!migratetype_is_mergeable(migratetype) || - !migratetype_is_mergeable(buddy_mt)) - goto done_merging; - /* - * Match buddy type. This ensures that - * an expand() down the line puts the - * sub-blocks on the right freelists. - */ - set_pageblock_migratetype(buddy, migratetype); - } + if (migratetype != buddy_mt && + (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; } /* @@ -813,9 +835,19 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. 
*/ if (page_is_guard(buddy)) - clear_page_guard(zone, buddy, order, migratetype); + clear_page_guard(zone, buddy, order); else - del_page_from_free_list(buddy, zone, order); + __del_page_from_free_list(buddy, zone, order, buddy_mt); + + if (unlikely(buddy_mt != migratetype)) { + /* + * Match buddy type. This ensures that an + * expand() down the line puts the sub-blocks + * on the right freelists. + */ + set_pageblock_migratetype(buddy, migratetype); + } + combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -832,10 +864,7 @@ static inline void __free_one_page(struct page *page, else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); - if (to_tail) - add_to_free_list_tail(page, zone, order, migratetype); - else - add_to_free_list(page, zone, order, migratetype); + __add_to_free_list(page, zone, order, migratetype, to_tail); /* Notify page reporting subsystem of freed page */ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) @@ -1346,10 +1375,10 @@ static inline void expand(struct zone *zone, struct page *page, * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space */ - if (set_page_guard(zone, &page[size], high, migratetype)) + if (set_page_guard(zone, &page[size], high)) continue; - add_to_free_list(&page[size], zone, high, migratetype); + add_to_free_list(&page[size], zone, high, migratetype, false); set_buddy_order(&page[size], high); } } @@ -1519,7 +1548,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order); + del_page_from_free_list(page, zone, current_order, migratetype); expand(zone, page, order, current_order, migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && @@ -1559,7 +1588,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * type's freelist. 
*/ static int move_freepages(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn, int migratetype) + unsigned long end_pfn, int old_mt, int new_mt) { struct page *page; unsigned long pfn; @@ -1581,12 +1610,14 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = buddy_order(page); - move_to_free_list(page, zone, order, migratetype); + + move_to_free_list(page, zone, order, old_mt, new_mt); + pfn += 1 << order; pages_moved += 1 << order; } - set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype); + set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt); return pages_moved; } @@ -1644,7 +1675,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, } static int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int old_mt, int new_mt) { unsigned long start_pfn, end_pfn; @@ -1652,7 +1683,7 @@ static int move_freepages_block(struct zone *zone, struct page *page, NULL, NULL)) return -1; - return move_freepages(zone, start_pfn, end_pfn, migratetype); + return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt); } #ifdef CONFIG_MEMORY_ISOLATION @@ -1724,7 +1755,6 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype) { unsigned long start_pfn, end_pfn, pfn; - int nr_moved, mt; if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, NULL, NULL)) @@ -1739,11 +1769,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, if (pfn != start_pfn) { struct page *buddy = pfn_to_page(pfn); int order = buddy_order(buddy); - int mt = get_pfnblock_migratetype(buddy, pfn); - if (!is_migrate_isolate(mt)) - __mod_zone_freepage_state(zone, -(1UL << order), mt); - del_page_from_free_list(buddy, zone, order); + del_page_from_free_list(buddy, zone, order, + get_pfnblock_migratetype(buddy, pfn)); set_pageblock_migratetype(page, migratetype); split_large_buddy(zone, buddy, pfn, order); return true; @@ -1751,23 +1779,17 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, /* We're the starting block of a larger buddy */ if (PageBuddy(page) && buddy_order(page) > pageblock_order) { - int mt = get_pfnblock_migratetype(page, pfn); int order = buddy_order(page); - if (!is_migrate_isolate(mt)) - __mod_zone_freepage_state(zone, -(1UL << order), mt); - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, + get_pfnblock_migratetype(page, pfn)); set_pageblock_migratetype(page, migratetype); split_large_buddy(zone, page, pfn, order); return true; } move: - mt = get_pfnblock_migratetype(page, start_pfn); - nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype); - if (!is_migrate_isolate(mt)) - __mod_zone_freepage_state(zone, -nr_moved, mt); - else if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, nr_moved, migratetype); + move_freepages(zone, start_pfn, end_pfn, + get_pfnblock_migratetype(page, start_pfn), migratetype); return true; } #endif /* CONFIG_MEMORY_ISOLATION */ @@ -1881,7 +1903,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { - del_page_from_free_list(page, zone, current_order); + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); expand(zone, page, order, current_order, start_type); return page; @@ -1931,12 +1953,12 @@ 
steal_suitable_fallback(struct zone *zone, struct page *page, */ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) { - move_freepages(zone, start_pfn, end_pfn, start_type); + move_freepages(zone, start_pfn, end_pfn, block_type, start_type); return __rmqueue_smallest(zone, order, start_type); } single_page: - del_page_from_free_list(page, zone, current_order); + del_page_from_free_list(page, zone, current_order, block_type); expand(zone, page, order, current_order, block_type); return page; } @@ -2002,7 +2024,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ if (migratetype_is_mergeable(mt)) - if (move_freepages_block(zone, page, + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) != -1) zone->nr_reserved_highatomic += pageblock_nr_pages; @@ -2043,11 +2065,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); + int mt; page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; + mt = get_pageblock_migratetype(page); /* * In page freeing path, migratetype change is racy so * we can counter several free pages in a pageblock @@ -2055,7 +2079,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. */ - if (is_migrate_highatomic_page(page)) { + if (is_migrate_highatomic(mt)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2077,7 +2101,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * of pageblocks that cannot be completely freed * may increase. */ - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, mt, + ac->migratetype); /* * Reserving this block already succeeded, so this should * not fail on zone boundaries. @@ -2248,12 +2273,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. 
*/ list_add_tail(&page->pcp_list, list); - if (is_migrate_cma(get_pageblock_migratetype(page))) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, - -(1 << order)); } - - __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock_irqrestore(&zone->lock, flags); return i; @@ -2804,11 +2824,9 @@ int __isolate_free_page(struct page *page, unsigned int order) watermark = zone->_watermark[WMARK_MIN] + (1UL << order); if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; - - __mod_zone_freepage_state(zone, -(1UL << order), mt); } - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, mt); /* * Set the pageblock if the isolated page is at least half of a @@ -2823,7 +2841,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * with others) */ if (migratetype_is_mergeable(mt)) - move_freepages_block(zone, page, + move_freepages_block(zone, page, mt, MIGRATE_MOVABLE); } } @@ -2908,8 +2926,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return NULL; } } - __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); spin_unlock_irqrestore(&zone->lock, flags); } while (check_new_pages(page, order)); @@ -6930,8 +6946,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); + VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); order = buddy_order(page); - del_page_from_free_list(page, zone, order); + del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); @@ -6959,6 +6976,14 @@ bool is_free_buddy_page(struct page *page) EXPORT_SYMBOL(is_free_buddy_page); #ifdef CONFIG_MEMORY_FAILURE +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype, + bool tail) +{ + __add_to_free_list(page, zone, order, migratetype, tail); + account_freepages(zone, 1 << order, migratetype); +} + /* * Break down a higher-order page in sub-pages, and keep our target out of * buddy allocator. 
@@ -6981,10 +7006,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, current_buddy = page + size; } - if (set_page_guard(zone, current_buddy, high, migratetype)) + if (set_page_guard(zone, current_buddy, high)) continue; - add_to_free_list(current_buddy, zone, high, migratetype); + add_to_free_list(current_buddy, zone, high, migratetype, false); set_buddy_order(current_buddy, high); } } @@ -7010,12 +7035,11 @@ bool take_page_off_buddy(struct page *page) int migratetype = get_pfnblock_migratetype(page_head, pfn_head); - del_page_from_free_list(page_head, zone, page_order); + del_page_from_free_list(page_head, zone, page_order, + migratetype); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); SetPageHWPoisonTakenOff(page); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -1, migratetype); ret = true; break; } @@ -7115,7 +7139,7 @@ static bool try_to_accept_memory_one(struct zone *zone) list_del(&page->lru); - __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); @@ -7170,7 +7194,7 @@ static bool __free_unaccepted(struct page *page) spin_lock_irqsave(&zone->lock, flags); list_add_tail(&page->lru, &zone->unaccepted_pages); - __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); -- Gitee From 2fcc0afa6231f82d1dbc1f039bb2e1a23def58d8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 27 Mar 2024 15:01:11 -0400 Subject: [PATCH 334/484] mm: page_alloc: batch vmstat updates in expand() commit 883dd161e9a83e188487debc562b1928917a4b39 upstream Conflicts: none Backport-reason: mTHP: backport fix, this actually fixes e0932b6c1f94 expand() currently updates vmstat for every subpage. This is unnecessary, since they're all of the same zone and migratetype. Count added pages locally, then do a single vmstat update. 
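Condensed from the diff below (illustration only, debug checks omitted), the resulting expand() accumulates the count of freed sub-blocks and issues a single vmstat update after the loop instead of one per iteration:

    static inline void expand(struct zone *zone, struct page *page, int low,
                              int high, int migratetype)
    {
            unsigned long size = 1 << high;
            unsigned long nr_added = 0;

            while (high > low) {
                    high--;
                    size >>= 1;

                    if (set_page_guard(zone, &page[size], high))
                            continue;

                    /* list manipulation only, no per-subpage vmstat update */
                    __add_to_free_list(&page[size], zone, high, migratetype,
                                       false);
                    set_buddy_order(&page[size], high);
                    nr_added += size;
            }
            /* one batched update for everything added above */
            account_freepages(zone, nr_added, migratetype);
    }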
Link: https://lkml.kernel.org/r/20240327190111.GC7597@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e4ed29d68943..14bcee0ffaf2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1363,6 +1363,7 @@ static inline void expand(struct zone *zone, struct page *page, int low, int high, int migratetype) { unsigned long size = 1 << high; + unsigned long nr_added = 0; while (high > low) { high--; @@ -1378,9 +1379,11 @@ static inline void expand(struct zone *zone, struct page *page, if (set_page_guard(zone, &page[size], high)) continue; - add_to_free_list(&page[size], zone, high, migratetype, false); + __add_to_free_list(&page[size], zone, high, migratetype, false); set_buddy_order(&page[size], high); + nr_added += size; } + account_freepages(zone, nr_added, migratetype); } static void check_new_page_bad(struct page *page) -- Gitee From c6b617b4b14b1dd7a47e4a9be17e79766e470b19 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:39 +0000 Subject: [PATCH 335/484] mm: always initialise folio->_deferred_list commit b7b098cf00a2b65d5654a86dc8edf82f125289c1 upstream Conflicts: Only update for hugetlb to use folio_put, this commit is already partially backported. Backport-reason: Page flag cleanup Patch series "Various significant MM patches". These patches all interact in annoying ways which make it tricky to send them out in any way other than a big batch, even though there's not really an overarching theme to connect them. The big effects of this patch series are: - folio_test_hugetlb() becomes reliable, even when called without a page reference - We free up PG_slab, and we could always use more page flags - We no longer need to check PageSlab before calling page_mapcount() This patch (of 9): For compound pages which are at least order-2 (and hence have a deferred_list), initialise it and then we can check at free that the page is not part of a deferred list. We recently found this useful to rule out a source of corruption. 
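A minimal sketch of the kind of check this enables (the helper name is hypothetical and the exact placement in the allocator may differ): once _deferred_list is initialised for all order >= 2 folios, a free path can cheaply assert that the folio is no longer queued for deferred split.

    /* Sketch only: assert at free time that a large folio has been taken
     * off the deferred split list. Relies on _deferred_list now always
     * being initialised for order >= 2 folios. */
    static inline void assert_not_on_deferred_list(struct folio *folio)
    {
            if (folio_order(folio) >= 2)
                    VM_WARN_ON_ONCE_FOLIO(!list_empty(&folio->_deferred_list),
                                          folio);
    }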
[peterx@redhat.com: always initialise folio->_deferred_list] Link: https://lkml.kernel.org/r/20240417211836.2742593-2-peterx@redhat.com Link: https://lkml.kernel.org/r/20240321142448.1645400-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240321142448.1645400-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b28687726089..08adf1c880c3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1793,7 +1793,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, free_gigantic_folio(folio, huge_page_order(h)); } else { INIT_LIST_HEAD(&folio->_deferred_list); - __free_pages(&folio->page, huge_page_order(h)); + folio_put(folio); } } -- Gitee From 21eef699b34614bbecac7a7a0715f13e6f0088fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 21 Mar 2024 14:24:41 +0000 Subject: [PATCH 336/484] mm: remove folio_prep_large_rmappable() commit 85edc15a4c606094a14c36ebf5bceea7f9a3e395 upstream Conflicts: none Backport-reason: Page flag cleanup Now that prep_compound_page() initialises folio->_deferred_list, folio_prep_large_rmappable()'s only purpose is to set the large_rmappable flag, so inline it into the two callers. Take the opportunity to convert the large_rmappable definition from PAGEFLAG to FOLIO_FLAG and remove the existance of PageTestLargeRmappable and friends. Link: https://lkml.kernel.org/r/20240321142448.1645400-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Miaohe Lin Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 3 --- include/linux/page-flags.h | 4 ++-- mm/huge_memory.c | 9 +-------- mm/internal.h | 3 ++- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 34056b5165e8..591f5da3618a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -395,7 +395,6 @@ extern unsigned long hugetext_pad_threshold; unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); @@ -586,8 +585,6 @@ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } -static inline void folio_prep_large_rmappable(struct folio *folio) {} - #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 70608a298099..ff8bc1f5e747 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -870,9 +870,9 @@ static inline void ClearPageCompound(struct page *page) BUG_ON(!PageHead(page)); ClearPageHead(page); } -PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND) +FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) #else -TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) +FOLIO_FLAG_FALSE(large_rmappable) #endif #define PG_head_mask ((1UL << PG_head)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 34f1e756bce5..fff5ac854033 100644 --- 
a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1073,13 +1073,6 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio) } #endif -void folio_prep_large_rmappable(struct folio *folio) -{ - if (!folio || !folio_test_large(folio)) - return; - folio_set_large_rmappable(folio); -} - static inline bool is_transparent_hugepage(struct folio *folio) { if (!folio_test_large(folio)) @@ -3005,7 +2998,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, clear_compound_head(page_tail); if (new_order) { prep_compound_page(page_tail, new_order); - folio_prep_large_rmappable(new_folio); + folio_set_large_rmappable(new_folio); } /* Finally unfreeze refcount. Additional reference from page cache. */ diff --git a/mm/internal.h b/mm/internal.h index 473065101aa9..eedc8c614644 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -667,7 +667,8 @@ static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; - folio_prep_large_rmappable(folio); + if (folio && folio_test_large(folio)) + folio_set_large_rmappable(folio); return folio; } -- Gitee From 0cf60f7fd5ff168337461616b48c307f40b8c02a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Apr 2024 16:32:24 +0100 Subject: [PATCH 337/484] mm: combine free_the_page() and free_unref_page() commit 5b8d75913a0ed9deb16140c0aa880c4d6db2dc62 upstream Conflicts: minor, due to missing cc92eba1c88b Backport-reason: mm: cleanup The pcp_allowed_order() check in free_the_page() was only being skipped by __folio_put_small() which is about to be rearranged. Link: https://lkml.kernel.org/r/20240405153228.2563754-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 14bcee0ffaf2..18103743ce33 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -550,14 +550,6 @@ static inline bool pcp_allowed_order(unsigned int order) return false; } -static inline void free_the_page(struct page *page, unsigned int order) -{ - if (pcp_allowed_order(order)) /* Via pcp? */ - free_unref_page(page, order); - else - __free_pages_ok(page, order, FPI_NONE); -} - /* * Higher-order pages are called "compound pages". 
They are structured thusly: * @@ -593,7 +585,7 @@ void destroy_large_folio(struct folio *folio) folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); - free_the_page(&folio->page, folio_order(folio)); + free_unref_page(&folio->page, folio_order(folio)); } static inline void set_buddy_order(struct page *page, unsigned int order) @@ -2658,6 +2650,11 @@ void free_unref_page(struct page *page, unsigned int order) unsigned long pfn = page_to_pfn(page); int migratetype; + if (!pcp_allowed_order(order)) { + __free_pages_ok(page, order, FPI_NONE); + return; + } + if (!free_pages_prepare(page, order)) return; @@ -4892,10 +4889,10 @@ void __free_pages(struct page *page, unsigned int order) int head = PageHead(page); if (put_page_testzero(page)) - free_the_page(page, order); + free_unref_page(page, order); else if (!head) while (order-- > 0) - free_the_page(page + (1 << order), order); + free_unref_page(page + (1 << order), order); } EXPORT_SYMBOL(__free_pages); @@ -4946,7 +4943,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -4987,7 +4984,7 @@ void *page_frag_alloc_align(struct page_frag_cache *nc, goto refill; if (unlikely(nc->pfmemalloc)) { - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); goto refill; } @@ -5031,7 +5028,7 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_the_page(page, compound_order(page)); + free_unref_page(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); -- Gitee From c597be33d7c7c7c353bf51d6ab76e905fba02bf1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Apr 2024 16:32:25 +0100 Subject: [PATCH 338/484] mm: inline destroy_large_folio() into __folio_put_large() commit 2542b1ac9a46ac58f9565de0048457956898d481 upstream Conflicts: none Backport-reason: mm: cleanup destroy_large_folio() has only one caller, move its contents there. Link: https://lkml.kernel.org/r/20240405153228.2563754-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mm.h | 2 -- mm/page_alloc.c | 14 -------------- mm/swap.c | 13 ++++++++++--- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0682b1766e4e..eb9dd46503a4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1300,8 +1300,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -void destroy_large_folio(struct folio *folio); - /* Returns the number of bytes in this potentially compound page. 
*/ static inline unsigned long page_size(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18103743ce33..221625b0bec1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -574,20 +574,6 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } -void destroy_large_folio(struct folio *folio) -{ - if (folio_test_hugetlb(folio)) { - free_huge_folio(folio); - return; - } - - if (folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); - - mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); -} - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); diff --git a/mm/swap.c b/mm/swap.c index 616760460f75..904a76bb82e9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -113,9 +113,16 @@ static void __folio_put_large(struct folio *folio) * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ - if (!folio_test_hugetlb(folio)) - page_cache_release(folio); - destroy_large_folio(folio); + if (folio_test_hugetlb(folio)) { + free_huge_folio(folio); + return; + } + + page_cache_release(folio); + if (folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, folio_order(folio)); } void __folio_put(struct folio *folio) -- Gitee From 8480498649120c982b6733bbf73beb24935c1afc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Apr 2024 16:32:26 +0100 Subject: [PATCH 339/484] mm: combine __folio_put_small, __folio_put_large and __folio_put commit 79a48287515848c18a49d75c1fdf176c82bb13cf upstream Conflicts: none Backport-reason: mm: cleanup It's now obvious that __folio_put_small() and __folio_put_large() do almost exactly the same thing. Inline them both into __folio_put(). Link: https://lkml.kernel.org/r/20240405153228.2563754-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 904a76bb82e9..38a1d537c1cb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -98,42 +98,22 @@ static void page_cache_release(struct folio *folio) unlock_page_lruvec_irqrestore(lruvec, flags); } -static void __folio_put_small(struct folio *folio) -{ - page_cache_release(folio); - mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, 0); -} - -static void __folio_put_large(struct folio *folio) +void __folio_put(struct folio *folio) { - /* - * __page_cache_release() is supposed to be called for thp, not for - * hugetlb. This is because hugetlb page does never have PageLRU set - * (it's never listed to any LRU lists) and no memcg routines should - * be called for hugetlb (it has a separate hugetlb_cgroup.) 
- */ - if (folio_test_hugetlb(folio)) { + if (unlikely(folio_is_zone_device(folio))) { + free_zone_device_page(&folio->page); + return; + } else if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); - if (folio_test_large_rmappable(folio)) + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, folio_order(folio)); } - -void __folio_put(struct folio *folio) -{ - if (unlikely(folio_is_zone_device(folio))) - free_zone_device_page(&folio->page); - else if (unlikely(folio_test_large(folio))) - __folio_put_large(folio); - else - __folio_put_small(folio); -} EXPORT_SYMBOL(__folio_put); /** -- Gitee From 52ab15ad94c852ca0a9f7be31ee985103dafd054 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Apr 2024 16:32:27 +0100 Subject: [PATCH 340/484] mm: convert free_zone_device_page to free_zone_device_folio commit 9f100e3b37590828ae23b0210ee634d14b28b8e8 upstream Conflicts: none Backport-reason: mm: cleanup Both callers already have a folio; pass it in and save a few calls to compound_head(). Link: https://lkml.kernel.org/r/20240405153228.2563754-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/internal.h | 2 +- mm/memremap.c | 30 ++++++++++++++++-------------- mm/swap.c | 4 ++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index eedc8c614644..92d498d7f9fa 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1237,7 +1237,7 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); -void free_zone_device_page(struct page *page); +void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_page(struct page *page); /* diff --git a/mm/memremap.c b/mm/memremap.c index 19ed6855f96f..8c5ebe5b0fc7 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -468,21 +468,23 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -void free_zone_device_page(struct page *page) +void free_zone_device_folio(struct folio *folio) { - if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) + if (WARN_ON_ONCE(!folio->page.pgmap->ops || + !folio->page.pgmap->ops->page_free)) return; - mem_cgroup_uncharge(page_folio(page)); + mem_cgroup_uncharge(folio); /* * Note: we don't expect anonymous compound pages yet. Once supported * and we could PTE-map them similar to THP, we'd have to clear * PG_anon_exclusive on all tail pages. */ - VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); + if (folio_test_anon(folio)) { + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + __ClearPageAnonExclusive(folio_page(folio, 0)); + } /* * When a device managed page is freed, the folio->mapping field @@ -493,20 +495,20 @@ void free_zone_device_page(struct page *page) * * For other types of ZONE_DEVICE pages, migration is either * handled differently or not done at all, so there is no need - * to clear page->mapping. + * to clear folio->mapping. 
*/ - page->mapping = NULL; - page->pgmap->ops->page_free(page); + folio->mapping = NULL; + folio->page.pgmap->ops->page_free(folio_page(folio, 0)); - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_COHERENT) + if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE && + folio->page.pgmap->type != MEMORY_DEVICE_COHERENT) /* - * Reset the page count to 1 to prepare for handing out the page + * Reset the refcount to 1 to prepare for handing out the page * again. */ - set_page_count(page, 1); + folio_set_count(folio, 1); else - put_dev_pagemap(page->pgmap); + put_dev_pagemap(folio->page.pgmap); } void zone_device_page_init(struct page *page) diff --git a/mm/swap.c b/mm/swap.c index 38a1d537c1cb..9cab24a17151 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -101,7 +101,7 @@ static void page_cache_release(struct folio *folio) void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { - free_zone_device_page(&folio->page); + free_zone_device_folio(folio); return; } else if (folio_test_hugetlb(folio)) { free_huge_folio(folio); @@ -1007,7 +1007,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (put_devmap_managed_page_refs(&folio->page, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) - free_zone_device_page(&folio->page); + free_zone_device_folio(folio); continue; } -- Gitee From 07ae36631ee104b354d54add2f58f2f85f2e80c2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 21 May 2024 21:03:15 +0800 Subject: [PATCH 341/484] mm: refactor folio_undo_large_rmappable() commit 593a10dabe08dcf93259fce2badd8dc2528859a8 upstream Conflicts: none Backport-reason: mm: reapply upstream fixes Folios of order <= 1 are not in deferred list, the check of order is added into folio_undo_large_rmappable() from commit 8897277acfef ("mm: support order-1 folios in the page cache"), but there is a repeated check for small folio (order 0) during each call of the folio_undo_large_rmappable(), so only keep folio_order() check inside the function. In addition, move all the checks into header file to save a function call for non-large-rmappable or empty deferred_list folio. Link: https://lkml.kernel.org/r/20240521130315.46072-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Reviewed-by: Vishal Moola (Oracle) Cc: Johannes Weiner Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 13 +------------ mm/internal.h | 17 ++++++++++++++++- mm/page_alloc.c | 3 +-- mm/swap.c | 8 ++------ mm/vmscan.c | 8 ++------ 5 files changed, 22 insertions(+), 27 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fff5ac854033..348db74de6ae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3373,22 +3373,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, return ret; } -void folio_undo_large_rmappable(struct folio *folio) +void __folio_undo_large_rmappable(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; - if (folio_order(folio) <= 1) - return; - - /* - * At this point, there is no one trying to add the folio to - * deferred_list. If folio is not in deferred_list, it's safe - * to check without acquiring the split_queue_lock. 
- */ - if (data_race(list_empty(&folio->_deferred_list))) - return; - ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { diff --git a/mm/internal.h b/mm/internal.h index 92d498d7f9fa..2e7bf47bacbb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -661,7 +661,22 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) #endif } -void folio_undo_large_rmappable(struct folio *folio); +void __folio_undo_large_rmappable(struct folio *folio); +static inline void folio_undo_large_rmappable(struct folio *folio) +{ + if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) + return; + + /* + * At this point, there is no one trying to add the folio to + * deferred_list. If folio is not in deferred_list, it's safe + * to check without acquiring the split_queue_lock. + */ + if (data_race(list_empty(&folio->_deferred_list))) + return; + + __folio_undo_large_rmappable(folio); +} static inline struct folio *page_rmappable_folio(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 221625b0bec1..236e0c5a452b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2688,8 +2688,7 @@ void free_unref_folios(struct folio_batch *folios) unsigned long pfn = folio_pfn(folio); unsigned int order = folio_order(folio); - if (order > 0 && folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); + folio_undo_large_rmappable(folio); if (!free_pages_prepare(&folio->page, order)) continue; /* diff --git a/mm/swap.c b/mm/swap.c index 9cab24a17151..2434426c9dcc 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -109,8 +109,7 @@ void __folio_put(struct folio *folio) } page_cache_release(folio); - if (folio_test_large(folio) && folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); + folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, folio_order(folio)); } @@ -1023,10 +1022,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) free_huge_folio(folio); continue; } - if (folio_test_large(folio) && - folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); - + folio_undo_large_rmappable(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) diff --git a/mm/vmscan.c b/mm/vmscan.c index 398929912c43..1918ad9fb2ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1569,9 +1569,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, */ nr_reclaimed += nr_pages; - if (folio_test_large(folio) && - folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); + folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); @@ -1959,9 +1957,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); - if (folio_test_large(folio) && - folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); + folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); -- Gitee From e9734e5617b49aeb294daf9d9d37adecd06198cb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 27 Oct 2024 13:02:13 -0700 Subject: [PATCH 342/484] mm/thp: fix deferred split unqueue naming and locking commit f8f931bba0f92052cf842b7e30917b1afcc77d5a upstream Conflicts: minor, due to memcontrol-v1 missing. 
Backport-reason: THP: fixing deferred list Recent changes are putting more pressure on THP deferred split queues: under load revealing long-standing races, causing list_del corruptions, "Bad page state"s and worse (I keep BUGs in both of those, so usually don't get to see how badly they end up without). The relevant recent changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin, improved swap allocation, and underused THP splitting. Before fixing locking: rename misleading folio_undo_large_rmappable(), which does not undo large_rmappable, to folio_unqueue_deferred_split(), which is what it does. But that and its out-of-line __callee are mm internals of very limited usability: add comment and WARN_ON_ONCEs to check usage; and return a bool to say if a deferred split was unqueued, which can then be used in WARN_ON_ONCEs around safety checks (sparing callers the arcane conditionals in __folio_unqueue_deferred_split()). Just omit the folio_unqueue_deferred_split() from free_unref_folios(), all of whose callers now call it beforehand (and if any forget then bad_page() will tell) - except for its caller put_pages_list(), which itself no longer has any callers (and will be deleted separately). Swapout: mem_cgroup_swapout() has been resetting folio->memcg_data 0 without checking and unqueueing a THP folio from deferred split list; which is unfortunate, since the split_queue_lock depends on the memcg (when memcg is enabled); so swapout has been unqueueing such THPs later, when freeing the folio, using the pgdat's lock instead: potentially corrupting the memcg's list. __remove_mapping() has frozen refcount to 0 here, so no problem with calling folio_unqueue_deferred_split() before resetting memcg_data. That goes back to 5.4 commit 87eaceb3faa5 ("mm: thp: make deferred split shrinker memcg aware"): which included a check on swapcache before adding to deferred queue, but no check on deferred queue before adding THP to swapcache. That worked fine with the usual sequence of events in reclaim (though there were a couple of rare ways in which a THP on deferred queue could have been swapped out), but 6.12 commit dafff3f4c850 ("mm: split underused THPs") avoids splitting underused THPs in reclaim, which makes swapcache THPs on deferred queue commonplace. Keep the check on swapcache before adding to deferred queue? Yes: it is no longer essential, but preserves the existing behaviour, and is likely to be a worthwhile optimization (vmstat showed much more traffic on the queue under swapping load if the check was removed); update its comment. Memcg-v1 move (deprecated): mem_cgroup_move_account() has been changing folio->memcg_data without checking and unqueueing a THP folio from the deferred list, sometimes corrupting "from" memcg's list, like swapout. Refcount is non-zero here, so folio_unqueue_deferred_split() can only be used in a WARN_ON_ONCE to validate the fix, which must be done earlier: mem_cgroup_move_charge_pte_range() first try to split the THP (splitting of course unqueues), or skip it if that fails. Not ideal, but moving charge has been requested, and khugepaged should repair the THP later: nobody wants new custom unqueueing code just for this deprecated case. 
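In the swapout case the resulting safe ordering is simple, and can be sketched as follows (illustration only, condensed from the mem_cgroup_swapout() hunk further down; the refcount is already frozen to zero by __remove_mapping(), and the split queue lock lives in the folio's old memcg):

	folio_unqueue_deferred_split(folio);	/* takes the old memcg's split_queue_lock */
	folio->memcg_data = 0;			/* only afterwards clear or repoint the memcg */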
The 87eaceb3faa5 commit did have the code to move from one deferred list to another (but was not conscious of its unsafety while refcount non-0); but that was removed by 5.6 commit fac0516b5534 ("mm: thp: don't need care deferred split queue in memcg charge move path"), which argued that the existence of a PMD mapping guarantees that the THP cannot be on a deferred list. As above, false in rare cases, and now commonly false. Backport to 6.11 should be straightforward. Earlier backports must take care that other _deferred_list fixes and dependencies are included. There is not a strong case for backports, but they can fix cornercases. Link: https://lkml.kernel.org/r/8dc111ae-f6db-2da7-b25c-7a20b1effe3b@google.com Fixes: 87eaceb3faa5 ("mm: thp: make deferred split shrinker memcg aware") Fixes: dafff3f4c850 ("mm: split underused THPs") Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: Johannes Weiner Cc: Kefeng Wang Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wei Yang Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 35 ++++++++++++++++++++++++++--------- mm/internal.h | 10 +++++----- mm/memcontrol.c | 35 ++++++++++++++++++++++++++++++++--- mm/migrate.c | 4 ++-- mm/page_alloc.c | 1 - mm/swap.c | 4 ++-- mm/vmscan.c | 4 ++-- 7 files changed, 69 insertions(+), 24 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 348db74de6ae..083292f3e730 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3373,10 +3373,27 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, return ret; } -void __folio_undo_large_rmappable(struct folio *folio) +/* + * __folio_unqueue_deferred_split() is not to be called directly: + * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h + * limits its calls to those folios which may have a _deferred_list for + * queueing THP splits, and that list is (racily observed to be) non-empty. + * + * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is + * zero: because even when split_queue_lock is held, a non-empty _deferred_list + * might be in use on deferred_split_scan()'s unlocked on-stack list. + * + * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is + * therefore important to unqueue deferred split before changing folio memcg. + */ +bool __folio_unqueue_deferred_split(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; + bool unqueued = false; + + WARN_ON_ONCE(folio_ref_count(folio)); + WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -3384,8 +3401,11 @@ void __folio_undo_large_rmappable(struct folio *folio) ds_queue->split_queue_len--; mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); list_del_init(&folio->_deferred_list); + unqueued = true; } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + + return unqueued; /* useful for debug warnings */ } void deferred_split_folio(struct folio *folio) @@ -3404,14 +3424,11 @@ void deferred_split_folio(struct folio *folio) return; /* - * The try_to_unmap() in page reclaim path might reach here too, - * this may cause a race condition to corrupt deferred split queue. 
- * And, if page reclaim is already handling the same folio, it is - * unnecessary to handle it again in shrinker. - * - * Check the swapcache flag to determine if the folio is being - * handled by page reclaim since THP swap would add the folio into - * swap cache before calling try_to_unmap(). + * Exclude swapcache: originally to avoid a corrupt deferred split + * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); + * but if page reclaim is already handling the same folio, it is + * unnecessary to handle it again in the shrinker, so excluding + * swapcache here may still be a useful optimization. */ if (folio_test_swapcache(folio)) return; diff --git a/mm/internal.h b/mm/internal.h index 2e7bf47bacbb..9badc16c69b8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -661,11 +661,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) #endif } -void __folio_undo_large_rmappable(struct folio *folio); -static inline void folio_undo_large_rmappable(struct folio *folio) +bool __folio_unqueue_deferred_split(struct folio *folio); +static inline bool folio_unqueue_deferred_split(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) - return; + return false; /* * At this point, there is no one trying to add the folio to @@ -673,9 +673,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio) * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) - return; + return false; - __folio_undo_large_rmappable(folio); + return __folio_unqueue_deferred_split(folio); } static inline struct folio *page_rmappable_folio(struct page *page) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 545df3c61379..2a50e7d10d1e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7928,6 +7928,8 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); + /* Warning should never happen, so don't worry about refcount non-0 */ + WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = (unsigned long)to; __folio_memcg_unlock(from); @@ -8299,7 +8301,10 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, enum mc_target_type target_type; union mc_target target; struct page *page; + struct folio *folio; + bool tried_split_before = false; +retry_pmd: ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { @@ -8309,6 +8314,28 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { page = target.page; + folio = page_folio(page); + /* + * Deferred split queue locking depends on memcg, + * and unqueue is unsafe unless folio refcount is 0: + * split or skip if on the queue? first try to split. + */ + if (!list_empty(&folio->_deferred_list)) { + spin_unlock(ptl); + if (!tried_split_before) + split_folio(folio); + folio_unlock(folio); + folio_put(folio); + if (tried_split_before) + return 0; + tried_split_before = true; + goto retry_pmd; + } + /* + * So long as that pmd lock is held, the folio cannot + * be racily added to the _deferred_list, because + * page_remove_rmap() will find it still pmdmapped. 
+ */ if (isolate_lru_page(page)) { if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { @@ -9460,9 +9487,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - VM_BUG_ON_FOLIO(folio_order(folio) > 1 && - !folio_test_hugetlb(folio) && - !list_empty(&folio->_deferred_list), folio); /* * Nobody should be changing or seriously looking at @@ -9509,6 +9533,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) ug->nr_memory += nr_pages; ug->pgpgout++; + WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = 0; } @@ -9624,6 +9649,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) /* Transfer the charge and the css ref */ commit_charge(new, memcg); + + /* Warning should never happen, so don't worry about refcount non-0 */ + WARN_ON_ONCE(folio_unqueue_deferred_split(old)); old->memcg_data = 0; } @@ -9859,6 +9887,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); + folio_unqueue_deferred_split(folio); folio->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) diff --git a/mm/migrate.c b/mm/migrate.c index fec29fcc4811..4da1249a68e7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -417,7 +417,7 @@ int folio_migrate_mapping(struct address_space *mapping, folio_test_large_rmappable(folio)) { if (!folio_ref_freeze(folio, expected_count)) return -EAGAIN; - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); folio_ref_unfreeze(folio, expected_count); } @@ -442,7 +442,7 @@ int folio_migrate_mapping(struct address_space *mapping, } /* Take off deferred split queue while frozen and memcg set */ - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); /* * Now we know that no one else is looking at the folio: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 236e0c5a452b..fc88d932e6f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2688,7 +2688,6 @@ void free_unref_folios(struct folio_batch *folios) unsigned long pfn = folio_pfn(folio); unsigned int order = folio_order(folio); - folio_undo_large_rmappable(folio); if (!free_pages_prepare(&folio->page, order)) continue; /* diff --git a/mm/swap.c b/mm/swap.c index 2434426c9dcc..bb598d9922c0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -109,7 +109,7 @@ void __folio_put(struct folio *folio) } page_cache_release(folio); - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, folio_order(folio)); } @@ -1022,7 +1022,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) free_huge_folio(folio); continue; } - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1918ad9fb2ee..f0ff3362b345 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1569,7 +1569,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, */ nr_reclaimed += nr_pages; - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); @@ -1957,7 +1957,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); - folio_undo_large_rmappable(folio); + 
folio_unqueue_deferred_split(folio); if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); -- Gitee From bb176e26701448c1c38283adc304cdd65b29f2b2 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 29 Jan 2025 19:08:44 +0900 Subject: [PATCH 343/484] mm/zswap: fix inconsistency when zswap_store_page() fails commit 63895d20d63b446f5049a963983489319c2ea3e2 upstream Conflicts: none Backport-reason: ZSWAP mTHP support Commit b7c0ccdfbafd ("mm: zswap: support large folios in zswap_store()") skips charging any zswap entries when it failed to zswap the entire folio. However, when some base pages are zswapped but it failed to zswap the entire folio, the zswap operation is rolled back. When freeing zswap entries for those pages, zswap_entry_free() uncharges the zswap entries that were not previously charged, causing zswap charging to become inconsistent. This inconsistency triggers two warnings with following steps: # On a machine with 64GiB of RAM and 36GiB of zswap $ stress-ng --bigheap 2 # wait until the OOM-killer kills stress-ng $ sudo reboot The two warnings are: in mm/memcontrol.c:163, function obj_cgroup_release(): WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); in mm/page_counter.c:60, function page_counter_cancel(): if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) zswap_stored_pages also becomes inconsistent in the same way. As suggested by Kanchana, increment zswap_stored_pages and charge zswap entries within zswap_store_page() when it succeeds. This way, zswap_entry_free() will decrement the counter and uncharge the entries when it failed to zswap the entire folio. While this could potentially be optimized by batching objcg charging and incrementing the counter, let's focus on fixing the bug this time and leave the optimization for later after some evaluation. After resolving the inconsistency, the warnings disappear. 
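For example, storing a 16-page folio that fails on its tenth page rolls back the nine entries already in the tree; before this fix, zswap_entry_free() would uncharge nine entries' worth of bytes that had never been charged, producing the obj_cgroup_release() and page_counter_cancel() warnings quoted above. A minimal sketch of the per-page accounting the fix moves into zswap_store_page() (condensed from the hunk below):

	/*
	 * Once the entry is in the tree there is no further failure path, so
	 * charge and count it here, per page; zswap_entry_free() undoes
	 * exactly this when the entry is removed again.
	 */
	zswap_pool_get(pool);
	if (objcg) {
		obj_cgroup_get(objcg);
		obj_cgroup_charge_zswap(objcg, entry->length);
	}
	atomic_long_inc(&zswap_stored_pages);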
[42.hyeyoo@gmail.com: refactor zswap_store_page()] Link: https://lkml.kernel.org/r/20250131082037.2426-1-42.hyeyoo@gmail.com Link: https://lkml.kernel.org/r/20250129100844.2935-1-42.hyeyoo@gmail.com Fixes: b7c0ccdfbafd ("mm: zswap: support large folios in zswap_store()") Co-developed-by: Kanchana P Sridhar Signed-off-by: Kanchana P Sridhar Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Yosry Ahmed Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 43b2a38f659d..cc3e8b97e3b8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1406,9 +1406,9 @@ static void shrink_worker(struct work_struct *w) * main API **********************************/ -static ssize_t zswap_store_page(struct page *page, - struct obj_cgroup *objcg, - struct zswap_pool *pool) +static bool zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; @@ -1417,7 +1417,7 @@ static ssize_t zswap_store_page(struct page *page, entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; - return -EINVAL; + return false; } if (!zswap_compress(page, entry, pool)) @@ -1444,13 +1444,17 @@ static ssize_t zswap_store_page(struct page *page, /* * The entry is successfully compressed and stored in the tree, there is - * no further possibility of failure. Grab refs to the pool and objcg. - * These refs will be dropped by zswap_entry_free() when the entry is - * removed from the tree. + * no further possibility of failure. Grab refs to the pool and objcg, + * charge zswap memory, and increment zswap_stored_pages. + * The opposite actions will be performed by zswap_entry_free() + * when the entry is removed from the tree. */ zswap_pool_get(pool); - if (objcg) + if (objcg) { obj_cgroup_get(objcg); + obj_cgroup_charge_zswap(objcg, entry->length); + } + atomic_long_inc(&zswap_stored_pages); /* * We finish initializing the entry while it's already in xarray. 
@@ -1471,13 +1475,13 @@ static ssize_t zswap_store_page(struct page *page, zswap_lru_add(&zswap_list_lru, entry); } - return entry->length; + return true; store_failed: zpool_free(pool->zpool, entry->handle); compress_failed: zswap_entry_cache_free(entry); - return -EINVAL; + return false; } bool zswap_store(struct folio *folio) @@ -1487,7 +1491,6 @@ bool zswap_store(struct folio *folio) struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - size_t compressed_bytes = 0; bool ret = false; long index; @@ -1525,20 +1528,14 @@ bool zswap_store(struct folio *folio) for (index = 0; index < nr_pages; ++index) { struct page *page = folio_page(folio, index); - ssize_t bytes; - bytes = zswap_store_page(page, objcg, pool); - if (bytes < 0) + if (!zswap_store_page(page, objcg, pool)) goto put_pool; - compressed_bytes += bytes; } - if (objcg) { - obj_cgroup_charge_zswap(objcg, compressed_bytes); + if (objcg) count_objcg_events(objcg, ZSWPOUT, nr_pages); - } - atomic_long_add(nr_pages, &zswap_stored_pages); count_vm_events(ZSWPOUT, nr_pages); ret = true; -- Gitee From 60d87e5140790b9ad3fd8453a0121f9c19560541 Mon Sep 17 00:00:00 2001 From: gao xu Date: Wed, 19 Feb 2025 01:56:28 +0000 Subject: [PATCH 344/484] mm: fix possible NULL pointer dereference in __swap_duplicate commit c3e998398de48a7528842f05858a3a6bb21002e6 upstream Conflicts: none Backport-reason: SWAP Fixes Add a NULL check on the return value of swp_swap_info in __swap_duplicate to prevent crashes caused by NULL pointer dereference. The reason why swp_swap_info() returns NULL is unclear; it may be due to CPU cache issues or DDR bit flips. The probability of this issue is very small - it has been observed to occur approximately 1 in 500,000 times per week. The stack info we encountered is as follows: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 [RB/E]rb_sreason_str_set: sreason_str set null_pointer Mem abort info: ESR = 0x0000000096000005 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault Data abort info: ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 39-bit VAs, pgdp=00000008a80e5000 [0000000000000058] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP Skip md ftrace buffer dump for: 0x1609e0 ... 
pc : swap_duplicate+0x44/0x164 lr : copy_page_range+0x508/0x1e78 sp : ffffffc0f2a699e0 x29: ffffffc0f2a699e0 x28: ffffff8a5b28d388 x27: ffffff8b06603388 x26: ffffffdf7291fe70 x25: 0000000000000006 x24: 0000000000100073 x23: 00000000002d2d2f x22: 0000000000000008 x21: 0000000000000000 x20: 00000000002d2d2f x19: 18000000002d2d2f x18: ffffffdf726faec0 x17: 0000000000000000 x16: 0010000000000001 x15: 0040000000000001 x14: 0400000000000001 x13: ff7ffffffffffb7f x12: ffeffffffffffbff x11: ffffff8a5c7e1898 x10: 0000000000000018 x9 : 0000000000000006 x8 : 1800000000000000 x7 : 0000000000000000 x6 : ffffff8057c01f10 x5 : 000000000000a318 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000006daf200000 x1 : 0000000000000001 x0 : 18000000002d2d2f Call trace: swap_duplicate+0x44/0x164 copy_page_range+0x508/0x1e78 copy_process+0x1278/0x21cc kernel_clone+0x90/0x438 __arm64_sys_clone+0x5c/0x8c invoke_syscall+0x58/0x110 do_el0_svc+0x8c/0xe0 el0_svc+0x38/0x9c el0t_64_sync_handler+0x44/0xec el0t_64_sync+0x1a8/0x1ac Code: 9139c35a 71006f3f 54000568 f8797b55 (f9402ea8) ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Oops: Fatal exception SMP: stopping secondary CPUs The patch seems to only provide a workaround, but there are no more effective software solutions to handle the bit flips problem. This path will change the issue from a system crash to a process exception, thereby reducing the impact on the entire machine. akpm: this is probably a kernel bug, but this patch keeps the system running and doesn't reduce that bug's debuggability. Link: https://lkml.kernel.org/r/e223b0e6ba2f4924984b1917cc717bd5@honor.com Signed-off-by: gao xu Reviewed-by: Barry Song Cc: Suren Baghdasaryan Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 21e43e7e2a6a..a577ea8aecdc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3549,6 +3549,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) int err, i; si = swp_swap_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; + } offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); -- Gitee From a2ad1b2610b06f9a30c7efb583d04831b581618d Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Wed, 26 Feb 2025 23:32:43 +0800 Subject: [PATCH 345/484] mm: zswap: use ATOMIC_LONG_INIT to initialize zswap_stored_pages commit ea6de4f8f8f32e54662118a97c441a6ad7b24345 upstream Conflicts: none Backport-reason: ZSWAP Fixes This is currently the only atomic_long_t variable initialized by ATOMIC_INIT macro found in the kernel by using `grep -r atomic_long_t | grep ATOMIC_INIT` This was introduced in 6e1fa555ec77, in which we modified the type of zswap_stored_pages to atomic_long_t, but didn't change the initialization. 
Link: https://lkml.kernel.org/r/20250226153253.19179-1-sunk67188@gmail.com Fixes: 6e1fa555ec77 ("mm: zswap: modify zswap_stored_pages to be atomic_long_t") Signed-off-by: Sun YangKai Acked-by: Yosry Ahmed Acked-by: David Hildenbrand Cc: Chengming Zhou Cc: Johannes Weiner Cc: Kanchana P Sridhar Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index cc3e8b97e3b8..88659c8a2288 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,7 +43,7 @@ * statistics **********************************/ /* The number of compressed pages currently stored in zswap */ -atomic_long_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for -- Gitee From cb1e5219aa1c54d894f2f9683ff9588898f8972c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 19 Dec 2024 21:24:37 +0000 Subject: [PATCH 346/484] mm: zswap: fix race between [de]compression and CPU hotunplug commit eaebeb93922ca6ab0dd92027b73d0112701706ef upstream Conflicts: none Backport-reason: ZSWAP Fixes In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the current CPU at the beginning of the operation is retrieved and used throughout. However, since neither preemption nor migration are disabled, it is possible that the operation continues on a different CPU. If the original CPU is hotunplugged while the acomp_ctx is still in use, we run into a UAF bug as the resources attached to the acomp_ctx are freed during hotunplug in zswap_cpu_comp_dead(). The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") when the switch to the crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was retrieved using get_cpu_ptr() which disables preemption and makes sure the CPU cannot go away from under us. Preemption cannot be disabled with the crypto_acomp API as a sleepable context is needed. Commit 8ba2f844f050 ("mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx") increased the UAF surface area by making the per-CPU buffers dynamic, adding yet another resource that can be freed from under zswap compression/decompression by CPU hotunplug. There are a few ways to fix this: (a) Add a refcount for acomp_ctx. (b) Disable migration while using the per-CPU acomp_ctx. (c) Disable CPU hotunplug while using the per-CPU acomp_ctx by holding the CPUs read lock. Implement (c) since it's simpler than (a), and (b) involves using migrate_disable() which is apparently undesired (see huge comment in include/linux/preempt.h). 
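To make the window concrete, here is a simplified sketch of the pre-fix path (hypothetical function name, error handling omitted):

  static void zswap_compress_sketch(struct zswap_pool *pool)
  {
  	struct crypto_acomp_ctx *acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);

  	/*
  	 * Nothing pins the CPU between the lookup above and the use below:
  	 * the task may migrate, and if the original CPU is then offlined,
  	 * zswap_cpu_comp_dead() frees acomp_ctx->buffer and acomp_ctx->req
  	 * while they are still in use here.
  	 */
  	mutex_lock(&acomp_ctx->mutex);
  	/* ... use acomp_ctx->buffer and acomp_ctx->req ... */
  	mutex_unlock(&acomp_ctx->mutex);
  }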
Link: https://lkml.kernel.org/r/20241219212437.2714151-1-yosryahmed@google.com Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") Signed-off-by: Yosry Ahmed Reported-by: Johannes Weiner Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/ Reported-by: Sam Sun Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tPg6OaQ@mail.gmail.com/ Reviewed-by: Chengming Zhou Acked-by: Barry Song Reviewed-by: Nhat Pham Cc: Vitaly Wool Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 88659c8a2288..58303dc1071e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -879,6 +879,18 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } +/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */ +static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx) +{ + cpus_read_lock(); + return raw_cpu_ptr(acomp_ctx); +} + +static void acomp_ctx_put_cpu(void) +{ + cpus_read_unlock(); +} + static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -892,8 +904,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - + acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; @@ -949,6 +960,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, zswap_reject_alloc_fail++; mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_cpu(); return comp_ret == 0 && alloc_ret == 0; } @@ -959,7 +971,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; u8 *src; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); @@ -979,6 +991,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); + acomp_ctx_put_cpu(); } /********************************* -- Gitee From aa6e7a4eea6bfc38d96f8f6c78174771ac41e863 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 7 Jan 2025 22:22:34 +0000 Subject: [PATCH 347/484] Revert "mm: zswap: fix race between [de]compression and CPU hotunplug" commit 4dff389c9f1dd787e8058930b3fbd3248a6238c5 upstream Conflicts: none Backport-reason: ZSWAP Fixes This reverts commit eaebeb93922ca6ab0dd92027b73d0112701706ef. Commit eaebeb93922c ("mm: zswap: fix race between [de]compression and CPU hotunplug") used the CPU hotplug lock in zswap compress/decompress operations to protect against a race with CPU hotunplug making some per-CPU resources go away. 
However, zswap compress/decompress can be reached through reclaim while the lock is held, resulting in a potential deadlock as reported by syzbot: ====================================================== WARNING: possible circular locking dependency detected 6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0 Not tainted ------------------------------------------------------ kswapd0/89 is trying to acquire lock: ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: acomp_ctx_get_cpu mm/zswap.c:886 [inline] ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_compress mm/zswap.c:908 [inline] ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store_page mm/zswap.c:1439 [inline] ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store+0xa74/0x1ba0 mm/zswap.c:1546 but task is already holding lock: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline] ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 __fs_reclaim_acquire mm/page_alloc.c:3853 [inline] fs_reclaim_acquire+0x88/0x130 mm/page_alloc.c:3867 might_alloc include/linux/sched/mm.h:318 [inline] slab_pre_alloc_hook mm/slub.c:4070 [inline] slab_alloc_node mm/slub.c:4148 [inline] __kmalloc_cache_node_noprof+0x40/0x3a0 mm/slub.c:4337 kmalloc_node_noprof include/linux/slab.h:924 [inline] alloc_worker kernel/workqueue.c:2638 [inline] create_worker+0x11b/0x720 kernel/workqueue.c:2781 workqueue_prepare_cpu+0xe3/0x170 kernel/workqueue.c:6628 cpuhp_invoke_callback+0x48d/0x830 kernel/cpu.c:194 __cpuhp_invoke_callback_range kernel/cpu.c:965 [inline] cpuhp_invoke_callback_range kernel/cpu.c:989 [inline] cpuhp_up_callbacks kernel/cpu.c:1020 [inline] _cpu_up+0x2b3/0x580 kernel/cpu.c:1690 cpu_up+0x184/0x230 kernel/cpu.c:1722 cpuhp_bringup_mask+0xdf/0x260 kernel/cpu.c:1788 cpuhp_bringup_cpus_parallel+0xf9/0x160 kernel/cpu.c:1878 bringup_nonboot_cpus+0x2b/0x50 kernel/cpu.c:1892 smp_init+0x34/0x150 kernel/smp.c:1009 kernel_init_freeable+0x417/0x5d0 init/main.c:1569 kernel_init+0x1d/0x2b0 init/main.c:1466 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 -> #0 (cpu_hotplug_lock){++++}-{0:0}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 percpu_down_read include/linux/percpu-rwsem.h:51 [inline] cpus_read_lock+0x42/0x150 kernel/cpu.c:490 acomp_ctx_get_cpu mm/zswap.c:886 [inline] zswap_compress mm/zswap.c:908 [inline] zswap_store_page mm/zswap.c:1439 [inline] zswap_store+0xa74/0x1ba0 mm/zswap.c:1546 swap_writepage+0x647/0xce0 mm/page_io.c:279 shmem_writepage+0x1248/0x1610 mm/shmem.c:1579 pageout mm/vmscan.c:696 [inline] shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374 shrink_inactive_list mm/vmscan.c:1967 [inline] shrink_list mm/vmscan.c:2205 [inline] shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734 mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575 mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline] memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362 balance_pgdat mm/vmscan.c:6975 [inline] kswapd+0x17b3/0x2f30 mm/vmscan.c:7253 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 
arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(cpu_hotplug_lock); lock(fs_reclaim); rlock(cpu_hotplug_lock); *** DEADLOCK *** 1 lock held by kswapd0/89: #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline] #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253 stack backtrace: CPU: 0 UID: 0 PID: 89 Comm: kswapd0 Not tainted 6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206 check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 percpu_down_read include/linux/percpu-rwsem.h:51 [inline] cpus_read_lock+0x42/0x150 kernel/cpu.c:490 acomp_ctx_get_cpu mm/zswap.c:886 [inline] zswap_compress mm/zswap.c:908 [inline] zswap_store_page mm/zswap.c:1439 [inline] zswap_store+0xa74/0x1ba0 mm/zswap.c:1546 swap_writepage+0x647/0xce0 mm/page_io.c:279 shmem_writepage+0x1248/0x1610 mm/shmem.c:1579 pageout mm/vmscan.c:696 [inline] shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374 shrink_inactive_list mm/vmscan.c:1967 [inline] shrink_list mm/vmscan.c:2205 [inline] shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734 mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575 mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline] memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362 balance_pgdat mm/vmscan.c:6975 [inline] kswapd+0x17b3/0x2f30 mm/vmscan.c:7253 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Revert the change. A different fix for the race with CPU hotunplug will follow. 
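Summarizing the lockdep report above, the two chains that close the cycle are:

	CPU bringup:	cpu_hotplug_lock (write) -> kmalloc() in hotplug callback -> fs_reclaim
	kswapd:		fs_reclaim -> zswap_store() -> cpus_read_lock() (cpu_hotplug_lock, read)

so taking cpus_read_lock() anywhere under reclaim can deadlock against an in-flight CPU bringup.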
Link: https://lkml.kernel.org/r/20250107222236.2715883-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reported-by: syzbot Cc: Barry Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Kanchana P Sridhar Cc: Nhat Pham Cc: Sam Sun Cc: Vitaly Wool Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 58303dc1071e..88659c8a2288 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -879,18 +879,6 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */ -static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx) -{ - cpus_read_lock(); - return raw_cpu_ptr(acomp_ctx); -} - -static void acomp_ctx_put_cpu(void) -{ - cpus_read_unlock(); -} - static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -904,7 +892,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, gfp_t gfp; u8 *dst; - acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); dst = acomp_ctx->buffer; @@ -960,7 +949,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, zswap_reject_alloc_fail++; mutex_unlock(&acomp_ctx->mutex); - acomp_ctx_put_cpu(); return comp_ret == 0 && alloc_ret == 0; } @@ -971,7 +959,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; u8 *src; - acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx); + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); @@ -991,7 +979,6 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); - acomp_ctx_put_cpu(); } /********************************* -- Gitee From 339f1891874b0d3c0f85961287f5cda306980511 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 8 Jan 2025 22:24:41 +0000 Subject: [PATCH 348/484] mm: zswap: properly synchronize freeing resources during CPU hotunplug commit 12dcb0ef540629a281533f9dedc1b6b8e14cfb65 upstream Conflicts: none Backport-reason: ZSWAP Fixes In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the current CPU at the beginning of the operation is retrieved and used throughout. However, since neither preemption nor migration are disabled, it is possible that the operation continues on a different CPU. If the original CPU is hotunplugged while the acomp_ctx is still in use, we run into a UAF bug as some of the resources attached to the acomp_ctx are freed during hotunplug in zswap_cpu_comp_dead() (i.e. acomp_ctx.buffer, acomp_ctx.req, or acomp_ctx.acomp). The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") when the switch to the crypto_acomp API was made. Prior to that, the per-CPU crypto_comp was retrieved using get_cpu_ptr() which disables preemption and makes sure the CPU cannot go away from under us. Preemption cannot be disabled with the crypto_acomp API as a sleepable context is needed. Use the acomp_ctx.mutex to synchronize CPU hotplug callbacks allocating and freeing resources with compression/decompression paths. Make sure that acomp_ctx.req is NULL when the resources are freed. 
In the compression/decompression paths, check if acomp_ctx.req is NULL after acquiring the mutex (meaning the CPU was offlined) and retry on the new CPU. The initialization of acomp_ctx.mutex is moved from the CPU hotplug callback to the pool initialization where it belongs (where the mutex is allocated). In addition to adding clarity, this makes sure that CPU hotplug cannot reinitialize a mutex that is already locked by compression/decompression. Previously a fix was attempted by holding cpus_read_lock() [1]. This would have caused a potential deadlock as it is possible for code already holding the lock to fall into reclaim and enter zswap (causing a deadlock). A fix was also attempted using SRCU for synchronization, but Johannes pointed out that synchronize_srcu() cannot be used in CPU hotplug notifiers [2]. Alternative fixes that were considered/attempted and could have worked: - Refcounting the per-CPU acomp_ctx. This involves complexity in handling the race between the refcount dropping to zero in zswap_[de]compress() and the refcount being re-initialized when the CPU is onlined. - Disabling migration before getting the per-CPU acomp_ctx [3], but that's discouraged and is a much bigger hammer than needed, and could result in subtle performance issues. [1]https://lkml.kernel.org/20241219212437.2714151-1-yosryahmed@google.com/ [2]https://lkml.kernel.org/20250107074724.1756696-2-yosryahmed@google.com/ [3]https://lkml.kernel.org/20250107222236.2715883-2-yosryahmed@google.com/ [yosryahmed@google.com: remove comment] Link: https://lkml.kernel.org/r/CAJD7tkaxS1wjn+swugt8QCvQ-rVF5RZnjxwPGX17k8x9zSManA@mail.gmail.com Link: https://lkml.kernel.org/r/20250108222441.3622031-1-yosryahmed@google.com Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration") Signed-off-by: Yosry Ahmed Reported-by: Johannes Weiner Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/ Reported-by: Sam Sun Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tPg6OaQ@mail.gmail.com/ Cc: Barry Song Cc: Chengming Zhou Cc: Kanchana P Sridhar Cc: Nhat Pham Cc: Vitaly Wool Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 58 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 88659c8a2288..f5b3aa0edf89 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -250,7 +250,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; + int ret, cpu; if (!zswap_has_pool) { /* if either are unset, pool initialization failed, and we @@ -284,6 +284,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) goto error; } + for_each_possible_cpu(cpu) + mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex); + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); if (ret) @@ -821,11 +824,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) struct acomp_req *req; int ret; - mutex_init(&acomp_ctx->mutex); - + mutex_lock(&acomp_ctx->mutex); acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; + if (!acomp_ctx->buffer) { + ret = -ENOMEM; + goto buffer_fail; + } acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if 
(IS_ERR(acomp)) { @@ -854,12 +858,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); + mutex_unlock(&acomp_ctx->mutex); return 0; req_fail: crypto_free_acomp(acomp_ctx->acomp); acomp_fail: kfree(acomp_ctx->buffer); +buffer_fail: + mutex_unlock(&acomp_ctx->mutex); return ret; } @@ -868,17 +875,45 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + mutex_lock(&acomp_ctx->mutex); if (!IS_ERR_OR_NULL(acomp_ctx)) { if (!IS_ERR_OR_NULL(acomp_ctx->req)) acomp_request_free(acomp_ctx->req); + acomp_ctx->req = NULL; if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) crypto_free_acomp(acomp_ctx->acomp); kfree(acomp_ctx->buffer); } + mutex_unlock(&acomp_ctx->mutex); return 0; } +static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool) +{ + struct crypto_acomp_ctx *acomp_ctx; + + for (;;) { + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + if (likely(acomp_ctx->req)) + return acomp_ctx; + /* + * It is possible that we were migrated to a different CPU after + * getting the per-CPU ctx but before the mutex was acquired. If + * the old CPU got offlined, zswap_cpu_comp_dead() could have + * already freed ctx->req (among other things) and set it to + * NULL. Just try again on the new CPU that we ended up on. + */ + mutex_unlock(&acomp_ctx->mutex); + } +} + +static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx) +{ + mutex_unlock(&acomp_ctx->mutex); +} + static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zswap_pool *pool) { @@ -892,10 +927,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - + acomp_ctx = acomp_ctx_get_cpu_lock(pool); dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); @@ -948,7 +980,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, else if (alloc_ret) zswap_reject_alloc_fail++; - mutex_unlock(&acomp_ctx->mutex); + acomp_ctx_put_unlock(acomp_ctx); return comp_ret == 0 && alloc_ret == 0; } @@ -959,9 +991,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct crypto_acomp_ctx *acomp_ctx; u8 *src; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - + acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(zpool)) { memcpy(acomp_ctx->buffer, src, entry->length); @@ -975,10 +1005,10 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(&acomp_ctx->mutex); if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); + acomp_ctx_put_unlock(acomp_ctx); } /********************************* -- Gitee From 3196075bc2af862ed93f4fa800dc5d1fb3834218 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 6 Mar 2025 12:50:10 -0800 Subject: [PATCH 349/484] page_io: zswap: do not crash the kernel on decompression failure commit 
ff22f9299d7b2c7874b560993c21543708b7e1b6 upstream Conflicts: major, __swap_writeback may fail in downstream. Backport-reason: ZSWAP Fixes Currently, we crash the kernel when a decompression failure occurs in zswap (either because of memory corruption, or a bug in the compression algorithm). This is overkill. We should only SIGBUS the unfortunate process asking for the zswap entry on zswap load, and skip the corrupted entry in zswap writeback. See [1] for a recent upstream discussion about this. The zswap writeback case is relatively straightforward to fix. For the zswap_load() case, we change the return behavior: * Return 0 on success. * Return -ENOENT (with the folio locked) if zswap does not own the swapped out content. * Return -EIO if zswap owns the swapped out content, but encounters a decompression failure for some reasons. The folio will be unlocked, but not be marked up-to-date, which will eventually cause the process requesting the page to SIGBUS (see the handling of not-up-to-date folio in do_swap_page() in mm/memory.c), without crashing the kernel. * Return -EINVAL if we encounter a large folio, as large folio should not be swapped in while zswap is being used. Similar to the -EIO case, we also unlock the folio but do not mark it as up-to-date to SIGBUS the faulting process. As a side effect, we require one extra zswap tree traversal in the load and writeback paths. Quick benchmarking on a kernel build test shows no performance difference: With the new scheme: real: mean: 125.1s, stdev: 0.12s user: mean: 3265.23s, stdev: 9.62s sys: mean: 2156.41s, stdev: 13.98s The old scheme: real: mean: 125.78s, stdev: 0.45s user: mean: 3287.18s, stdev: 5.95s sys: mean: 2177.08s, stdev: 26.52s [nphamcs@gmail.com: fix documentation of zswap_load()] Link: https://lkml.kernel.org/r/20250306222453.1269456-1-nphamcs@gmail.com Link: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@casper.infradead.org/ [1] Link: https://lkml.kernel.org/r/20250306205011.784787-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Matthew Wilcox Suggested-by: Yosry Ahmed Suggested-by: Johannes Weiner Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/zswap.h | 6 +-- mm/page_io.c | 6 +-- mm/zswap.c | 116 +++++++++++++++++++++++++++++------------- 3 files changed, 87 insertions(+), 41 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e9db943ad25a..18e25f11da6c 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -26,7 +26,7 @@ struct zswap_lruvec_state { unsigned long zswap_total_pages(void); bool zswap_store(struct folio *folio); -bool zswap_load(struct folio *folio); +int zswap_load(struct folio *folio); void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); @@ -44,9 +44,9 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct folio *folio) +static inline int zswap_load(struct folio *folio) { - return false; + return -ENOENT; } static inline void zswap_invalidate(swp_entry_t swp) {} diff --git a/mm/page_io.c b/mm/page_io.c index c95d570f26d9..08c1dd91539e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -649,11 +649,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) if (swap_read_folio_zeromap(folio)) { folio_unlock(folio); goto finish; - } else if (zswap_load(folio)) { - folio_unlock(folio); - goto finish; } + if (zswap_load(folio) != -ENOENT) + goto finish; + 
/* We have to read from slower devices. Increase zswap protection. */ zswap_folio_swapin(folio); diff --git a/mm/zswap.c b/mm/zswap.c index f5b3aa0edf89..5fbea06912be 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail; static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; +/* Load or writeback failed due to decompression failure */ +static u64 zswap_decompress_fail; /* Store failed because underlying allocator could not get memory */ static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ @@ -984,11 +986,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, return comp_ret == 0 && alloc_ret == 0; } -static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) { struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; + int decomp_ret, dlen; u8 *src; acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); @@ -1003,12 +1006,22 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) sg_init_table(&output, 1); sg_set_folio(&output, folio, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); + decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); acomp_ctx_put_unlock(acomp_ctx); + + if (!decomp_ret && dlen == PAGE_SIZE) + return true; + + zswap_decompress_fail++; + pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n", + swp_type(entry->swpentry), + swp_offset(entry->swpentry), + entry->pool->tfm_name, entry->length, dlen); + return false; } /********************************* @@ -1060,8 +1073,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; + ret = -EEXIST; + goto out; } /* @@ -1074,14 +1087,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * be dereferenced. 
*/ tree = swap_zswap_tree(swpentry); - if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) { - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; + if (entry != xa_load(tree, offset)) { + ret = -ENOMEM; + goto out; } - zswap_decompress(entry, folio); + if (!zswap_decompress(entry, folio)) { + ret = -EIO; + goto out; + } + + xa_erase(tree, offset); count_vm_event(ZSWPWB); if (entry->objcg) @@ -1098,7 +1114,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* start writeback */ ret = __swap_writepage(folio, &wbc); folio_put(folio); + return ret; +out: + if (ret && ret != -EEXIST) { + delete_from_swap_cache(folio); + folio_unlock(folio); + } + folio_put(folio); return ret; } @@ -1600,7 +1623,27 @@ bool zswap_store(struct folio *folio) return ret; } -bool zswap_load(struct folio *folio) +/** + * zswap_load() - load a folio from zswap + * @folio: folio to load + * + * Return: 0 on success, with the folio unlocked and marked up-to-date, or one + * of the following error codes: + * + * -EIO: if the swapped out content was in zswap, but could not be loaded + * into the page due to a decompression failure. The folio is unlocked, but + * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page() + * will SIGBUS). + * + * -EINVAL: if the swapped out content was in zswap, but the page belongs + * to a large folio, which is not supported by zswap. The folio is unlocked, + * but NOT marked up-to-date, so that an IO error is emitted (e.g. + * do_swap_page() will SIGBUS). + * + * -ENOENT: if the swapped out content was not in zswap. The folio remains + * locked on return. + */ +int zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); @@ -1611,18 +1654,32 @@ bool zswap_load(struct folio *folio) VM_WARN_ON_ONCE(!folio_test_locked(folio)); if (zswap_never_enabled()) - return false; + return -ENOENT; /* * Large folios should not be swapped in while zswap is being used, as * they are not properly handled. Zswap does not properly load large * folios, and a large folio may only be partially in zswap. - * - * Return true without marking the folio uptodate so that an IO error is - * emitted (e.g. do_swap_page() will sigbus). */ - if (WARN_ON_ONCE(folio_test_large(folio))) - return true; + if (WARN_ON_ONCE(folio_test_large(folio))) { + folio_unlock(folio); + return -EINVAL; + } + + entry = xa_load(tree, offset); + if (!entry) + return -ENOENT; + + if (!zswap_decompress(entry, folio)) { + folio_unlock(folio); + return -EIO; + } + + folio_mark_uptodate(folio); + + count_vm_event(ZSWPIN); + if (entry->objcg) + count_objcg_events(entry->objcg, ZSWPIN, 1); /* * When reading into the swapcache, invalidate our entry. The @@ -1636,27 +1693,14 @@ bool zswap_load(struct folio *folio) * files, which reads into a private page and may free it if * the fault fails. We remain the primary owner of the entry.) 
*/ - if (swapcache) - entry = xa_erase(tree, offset); - else - entry = xa_load(tree, offset); - - if (!entry) - return false; - - zswap_decompress(entry, folio); - - count_vm_event(ZSWPIN); - if (entry->objcg) - count_objcg_events(entry->objcg, ZSWPIN, 1); - if (swapcache) { - zswap_entry_free(entry); folio_mark_dirty(folio); + xa_erase(tree, offset); + zswap_entry_free(entry); } - folio_mark_uptodate(folio); - return true; + folio_unlock(folio); + return 0; } void zswap_invalidate(swp_entry_t swp) @@ -1751,6 +1795,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("decompress_fail", 0444, + zswap_debugfs_root, &zswap_decompress_fail); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_file("pool_total_size", 0444, -- Gitee From e615c4c2e7fcddc64d874b9ea10112a2cb97701a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:11 +0800 Subject: [PATCH 350/484] mm/swap_state.c: optimize the code in clear_shadow_from_swap_cache() commit cd57a3fb37f91ef6106bc970d2b5c5878d3b5627 upstream Conflicts: none Backport-reason: SWAP Allocator Use ALIGN to achieve the same effect and simplify the code. Link: https://lkml.kernel.org/r/20250205092721.9395-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 623147db9e1f..9c0ecf9c6d3f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -210,9 +210,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ - curr >>= SWAP_ADDRESS_SPACE_SHIFT; - curr++; - curr <<= SWAP_ADDRESS_SPACE_SHIFT; + curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); if (curr > end) break; } -- Gitee From 9401ba28ef39988e961f36c36b654a5f85bdb0a1 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:12 +0800 Subject: [PATCH 351/484] mm/swap: remove SWAP_FLAG_PRIO_SHIFT commit 0eb7d2c337f9df3b6adbcbdce213595d94781e05 upstream Conflicts: none Backport-reason: SWAP Allocator It doesn't make sense to have a zero value of shift. Remove it to avoid confusion. 
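Since the shift was fixed at zero, the two forms compute the same value, as the swapon() hunk below shows:

	prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;	/* shift by 0 */
	prio =  swap_flags & SWAP_FLAG_PRIO_MASK;				/* identical result */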
Link: https://lkml.kernel.org/r/20250205092721.9395-4-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 1 - mm/swapfile.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index e9433c2aa375..34fcd6ce42b8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -24,7 +24,6 @@ struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff -#define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ diff --git a/mm/swapfile.c b/mm/swapfile.c index a577ea8aecdc..4de5b3e5c51e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3459,8 +3459,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) - prio = - (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + prio = swap_flags & SWAP_FLAG_PRIO_MASK; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", -- Gitee From 105cc1bfe89d43092bb06f8068aa55608936d51c Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:13 +0800 Subject: [PATCH 352/484] mm/swap: skip scanning cluster range if it's empty cluster commit 9b9cba7289ba7e5076fbfe913860306b4672631c upstream Conflicts: none Backport-reason: SWAP Allocator Since ci->lock has been taken when isolating cluster from si->free_clusters or taking si->percpu_cluster->next[order], it's unnecessary to scan and check the cluster range availability if i'ts empty cluster, and this can accelerate the huge page swapping. Link: https://lkml.kernel.org/r/20250205092721.9395-5-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4de5b3e5c51e..d467ae69915d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -723,6 +723,9 @@ static bool cluster_scan_range(struct swap_info_struct *si, unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; + if (cluster_is_empty(ci)) + return true; + for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { case 0: -- Gitee From 2c04f5f73eba99ff0d830671b77723541e296831 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:14 +0800 Subject: [PATCH 353/484] mm/swap: rename swap_is_has_cache() to swap_only_has_cache() commit b4735d94c29f6a65362d3cce0d55aa65e0e319e3 upstream Conflicts: none Backport-reason: SWAP Allocator There are two predicates in the name of swap_is_has_cache() which is confusing. Renaming it to remove the confusion and can better reflect its functionality. 
Link: https://lkml.kernel.org/r/20250205092721.9395-6-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d467ae69915d..c8dae18f109e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -171,7 +171,7 @@ static long swap_usage_in_pages(struct swap_info_struct *si) /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 -static bool swap_is_has_cache(struct swap_info_struct *si, +static bool swap_only_has_cache(struct swap_info_struct *si, unsigned long offset, int nr_pages) { unsigned char *map = si->swap_map + offset; @@ -262,7 +262,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * reference or pending writeback, and can't be allocated to others. */ ci = lock_cluster(si, offset); - need_reclaim = swap_is_has_cache(si, offset, nr_pages); + need_reclaim = swap_only_has_cache(si, offset, nr_pages); unlock_cluster(ci); if (!need_reclaim) goto out_unlock; @@ -1603,7 +1603,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) return; ci = lock_cluster(si, offset); - if (swap_is_has_cache(si, offset, size)) + if (swap_only_has_cache(si, offset, size)) swap_entry_range_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { -- Gitee From ab19e018954eaa0eb4bdc7471b4d7842ef75fa54 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:15 +0800 Subject: [PATCH 354/484] mm/swapfile.c: update the code comment above swap_count_continued() commit f80ddc148ca6505060fbef6f8365ca151103e16f upstream Conflicts: none Backport-reason: SWAP Allocator Now, swap_count_continued() has two callers, __swap_duplicate() and __swap_entry_free_locked(), the relevant code comment is stale. Update it to reflect the current situation. [bhe@redhat.com: v2] Link: https://lkml.kernel.org/r/Z6V0/UvG1fvkQ4t/@fedora Link: https://lkml.kernel.org/r/20250205092721.9395-7-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index c8dae18f109e..6cb1fff205b0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3802,8 +3802,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster - * lock. + * Called while __swap_duplicate() or caller of __swap_entry_free_locked() + * holds cluster lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) -- Gitee From 6e44c57e333576f164add341e7d137c3b936e478 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:16 +0800 Subject: [PATCH 355/484] mm/swapfile.c: optimize code in setup_clusters() commit a46a6bc21c223b852ff324d7c7ef3da868b90fd0 upstream Conflicts: trivial, declaration issue. Backport-reason: SWAP Allocator In the last 'for' loop inside setup_clusters(), using two local variable 'k' and 'j' are obvisouly redundant. Using 'j' is enough and simpler. And also move macro SWAP_CLUSTER_COLS close to its only user setup_clusters(). 
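To make the equivalence explicit (a sketch mirroring the hunk below, not an additional change): 'k' only runs from 0 to SWAP_CLUSTER_COLS - 1, so the modulo is an identity and 'j' can drive the loop directly:

	/* before: 'j' always equals 'k' inside the loop body */
	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
		j = k % SWAP_CLUSTER_COLS;	/* 0 <= k < COLS, so j == k */
		...
	}

	/* after: iterate with 'j' alone */
	for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
		...
	}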
Link: https://lkml.kernel.org/r/20250205092721.9395-8-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cb1fff205b0..d3f7835d8a28 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3146,13 +3146,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -#define SWAP_CLUSTER_INFO_COLS \ - DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) -#define SWAP_CLUSTER_SPACE_COLS \ - DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) -#define SWAP_CLUSTER_COLS \ - max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) - static int setup_swap_map_and_extents(struct swap_info_struct *si, union swap_header *swap_header, unsigned char *swap_map, @@ -3192,13 +3185,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *si, return nr_extents; } +#define SWAP_CLUSTER_INFO_COLS \ + DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) +#define SWAP_CLUSTER_SPACE_COLS \ + DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) +#define SWAP_CLUSTER_COLS \ + max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) + static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; - unsigned long i, j, k, idx; + unsigned long i, j, idx; int err = -ENOMEM; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); @@ -3246,8 +3246,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * Reduce false cache line sharing between cluster_info and * sharing same address space. */ - for (k = 0; k < SWAP_CLUSTER_COLS; k++) { - j = k % SWAP_CLUSTER_COLS; + for (j = 0; j < SWAP_CLUSTER_COLS; j++) { for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; -- Gitee From 8f1468adaffb561e9cdd5d464a68ff9414e792d5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:18 +0800 Subject: [PATCH 356/484] mm/swapfile.c: remove the unneeded checking commit ac2d3268284ba4e254e4ca346bf6d63fb8a17518 upstream Conflicts: none Backport-reason: SWAP Allocator cleanup In free_swap_and_cache_nr(), invocation of get_swap_device() has done the checking if it's a swap entry. So remove the redundant checking here. 
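For context, the resulting entry path looks roughly like the sketch below (condensed from the hunk that follows): get_swap_device() already returns NULL for anything that is not a valid swap entry, so the early non_swap_entry() test adds nothing:

	void free_swap_and_cache_nr(swp_entry_t entry, int nr)
	{
		struct swap_info_struct *si;

		si = get_swap_device(entry);	/* NULL for non-swap/invalid entries */
		if (!si)
			return;
		/* ... free the entries and reclaim the swap cache ... */
	}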
Link: https://lkml.kernel.org/r/20250205092721.9395-10-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d3f7835d8a28..963f5fdd00e9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1797,9 +1797,6 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) bool any_only_cache = false; unsigned long offset; - if (non_swap_entry(entry)) - return; - si = get_swap_device(entry); if (!si) return; -- Gitee From 8e8dc772432fa2df9a35ac097d4b4013c1e071bc Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:19 +0800 Subject: [PATCH 357/484] mm/swap: rename swap_swapcount() to swap_entry_swapped() commit c523aa890760b004eea47a0e00b5750a528f3ced upstream Conflicts: minor Backport-reason: SWAP Allocator cleanup The new function name can reflect the real behaviour of the function more clearly and more accurately. And the renaming avoids the confusion between swap_swapcount() and swp_swapcount(). Link: https://lkml.kernel.org/r/20250205092721.9395-11-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 6 +++--- mm/swap_state.c | 2 +- mm/swapfile.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 34fcd6ce42b8..3dfdf1b7a1eb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -524,7 +524,7 @@ int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); +extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; @@ -607,9 +607,9 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { - return 0; + return false; } static inline int swp_swapcount(swp_entry_t entry) diff --git a/mm/swap_state.c b/mm/swap_state.c index 9c0ecf9c6d3f..3909cc02af6d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -394,7 +394,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Just skip read ahead for unused swap slot. */ - if (!swap_swapcount(si, entry)) + if (!swap_entry_swapped(si, entry)) goto put_and_return; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 963f5fdd00e9..ab69c490bd4d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1627,7 +1627,7 @@ int __swap_count(swp_entry_t entry) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. 
*/ -int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; @@ -1636,7 +1636,7 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); unlock_cluster(ci); - return count; + return !!count; } /* @@ -1723,7 +1723,7 @@ static bool folio_swapped(struct folio *folio) return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return swap_swapcount(si, entry) != 0; + return swap_entry_swapped(si, entry); return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } -- Gitee From afc26673062e2df621594b3c90907ba4d2d95017 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sun, 23 Feb 2025 00:08:50 +0800 Subject: [PATCH 358/484] mm: swap: remove stale comment of swap_reclaim_full_clusters() commit 0a8a5b6c4129e61070eb9a45979c395fb6ab31c4 upstream Conflicts: none Backport-reason: SWAP Allocator cleanup swap_reclaim_full_clusters() has no return value now, just remove the stale comment which says swap_reclaim_full_clusters() wil return a bool value. Link: https://lkml.kernel.org/r/20250222160850.505274-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index ab69c490bd4d..9ece25102d5f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -826,7 +826,6 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, return found; } -/* Return true if reclaimed a whole cluster */ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; -- Gitee From a9fc4cb4ccaccbd53079e9af82e2e925e2f8dad9 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:21 +0800 Subject: [PATCH 359/484] mm: swap: rename __swap_[entry/entries]_free[_locked] to swap_[entry/entries]_put[_locked] commit 9c1c38bcdc9215f0bd231591b662bd0fbe8b7045 upstream Conflicts: none Backport-reason: SWAP Allocator cleanup Patch series "Minor cleanups and improvements to swap freeing code", v4. This series contains some cleanups and improvements which are made during learning swapfile. Here is a summary of the changes: 1. Function naming improvments. - Use "put" instead of "free" to name functions which only do actual free when count drops to zero. - Use "entry" to name function only frees one swap slot. Use "entries" to name function could may free multi swap slots within one cluster. Use "_nr" suffix to name function which could free multi swap slots spanning cross multi clusters. 2. Eliminate the need to set swap slot to intermediate SWAP_HAS_CACHE value before do actual free by using swap_entry_range_free() 3. Add helpers swap_entries_put_map() and swap_entries_put_cache() as a general-purpose routine to free swap entries within a single cluster which will try batch-remove first and fallback to put eatch entry indvidually with cluster lock acquired/released only once. By using these helpers, we could remove repeated code, levarage batch-remove in more cases and aoivd to acquire/release cluster lock for each single swap entry. This patch (of 8): In __swap_entry_free[_locked] and __swap_entries_free, we decrease count first and only free swap entry if count drops to zero. 
This behavior is more akin to a put() operation rather than a free() operation. Therefore, rename these functions with "put" instead of "free". Additionally, add "_nr" suffix to swap_entries_put to indicate the input range may span swap clusters. Link: https://lkml.kernel.org/r/20250325162528.68385-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250325162528.68385-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 9ece25102d5f..0d41c54c360f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1356,9 +1356,9 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) return NULL; } -static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, - unsigned long offset, - unsigned char usage) +static unsigned char swap_entry_put_locked(struct swap_info_struct *si, + unsigned long offset, + unsigned char usage) { unsigned char count; unsigned char has_cache; @@ -1462,15 +1462,15 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *si, - swp_entry_t entry) +static unsigned char swap_entry_put(struct swap_info_struct *si, + swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster(si, offset); - usage = __swap_entry_free_locked(si, offset, 1); + usage = swap_entry_put_locked(si, offset, 1); if (!usage) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); unlock_cluster(ci); @@ -1478,8 +1478,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, return usage; } -static bool __swap_entries_free(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); unsigned int type = swp_type(entry); @@ -1508,7 +1508,7 @@ static bool __swap_entries_free(struct swap_info_struct *si, fallback: for (i = 0; i < nr; i++) { if (data_race(si->swap_map[offset + i])) { - count = __swap_entry_free(si, swp_entry(type, offset + i)); + count = swap_entry_put(si, swp_entry(type, offset + i)); if (count == SWAP_HAS_CACHE) has_cache = true; } else { @@ -1559,7 +1559,7 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { - if (!__swap_entry_free_locked(si, offset, usage)) + if (!swap_entry_put_locked(si, offset, usage)) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); } while (++offset < end); unlock_cluster(ci); @@ -1606,7 +1606,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) swap_entry_range_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) + if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) swap_entry_range_free(si, ci, entry, 1); } } @@ -1806,7 +1806,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. 
*/ - any_only_cache = __swap_entries_free(si, entry, nr); + any_only_cache = swap_entries_put_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1819,7 +1819,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) * Now go back over the range trying to reclaim the swap cache. This is * more efficient for large folios because we will only try to reclaim * the swap once per folio in the common case. If we do - * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the + * swap_entry_put() and __try_to_reclaim_swap() in the same loop, the * latter will get a reference and lock the folio for every individual * page but will only succeed once the swap slot for every subpage is * zero. @@ -3797,7 +3797,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of __swap_entry_free_locked() + * Called while __swap_duplicate() or caller of swap_entry_put_locked() * holds cluster lock. */ static bool swap_count_continued(struct swap_info_struct *si, -- Gitee From 841c4dd8daf48d32a4a18c006dd88da7ded63cb4 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 8 Jan 2025 10:16:49 +0800 Subject: [PATCH 360/484] mm: shmem: skip swapcache for swapin of synchronous swap device commit 1dd44c0af4fa1e80a4e82faa10cbf5d22da40362 upstream Conflicts: none Backport-reason: Shmem Swap & Fixes With fast swap devices (such as zram), swapin latency is crucial to applications. For shmem swapin, similar to anonymous memory swapin, we can skip the swapcache operation to improve swapin latency. Testing 1G shmem sequential swapin without THP enabled, I observed approximately a 6% performance improvement: (Note: I repeated 5 times and took the mean data for each test) w/o patch w/ patch changes 534.8ms 501ms +6.3% In addition, currently, we always split the large swap entry stored in the shmem mapping during shmem large folio swapin, which is not perfect, especially with a fast swap device. We should swap in the whole large folio instead of splitting the precious large folios to take advantage of the large folios and improve the swapin latency if the swap device is synchronous device, which is similar to anonymous memory mTHP swapin. Testing 1G shmem sequential swapin with 64K mTHP and 2M mTHP, I observed obvious performance improvement: mTHP=64K w/o patch w/ patch changes 550.4ms 169.6ms +69% mTHP=2M w/o patch w/ patch changes 542.8ms 126.8ms +77% Note that skipping swapcache requires attention to concurrent swapin scenarios. Fortunately the swapcache_prepare() and shmem_add_to_page_cache() can help identify concurrent swapin and large swap entry split scenarios, and return -EEXIST for retry. 
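In rough outline, the cache-bypass swapin added below works as follows (a condensed sketch of shmem_swap_alloc_folio() from the hunk that follows, with error handling trimmed):

	new = shmem_alloc_folio(gfp, order, info, index);
	mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, gfp, entry);
	if (swapcache_prepare(entry, nr_pages))	/* takes SWAP_HAS_CACHE */
		return ERR_PTR(-EEXIST);	/* concurrent swapin/split: retry */
	__folio_set_locked(new);
	__folio_set_swapbacked(new);
	new->swap = entry;
	swap_read_folio(new, NULL);		/* read directly, no swap cache */

The SWAP_HAS_CACHE bit taken by swapcache_prepare(), together with the later shmem_add_to_page_cache() insertion, is what turns a concurrent swapin or a raced large-entry split into an -EEXIST retry instead of silent corruption.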
[akpm@linux-foundation.org: use IS_ENABLED(), tweak comment grammar] Link: https://lkml.kernel.org/r/3d9f3bd3bc6ec953054baff5134f66feeaae7c1e.1736301701.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kairui Song Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 105 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7999d40d475b..bd607e3b44b7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1882,6 +1882,65 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, return ERR_PTR(error); } +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *new; + void *shadow; + int nr_pages; + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } + + new = shmem_alloc_folio(gfp, order, info, index); + if (!new) + return ERR_PTR(-ENOMEM); + + nr_pages = folio_nr_pages(new); + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + gfp, entry)) { + folio_put(new); + return ERR_PTR(-ENOMEM); + } + + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. + */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + return ERR_PTR(-EEXIST); + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + mem_cgroup_swapin_uncharge_swap(entry, nr_pages); + shadow = get_shadow_from_swap_cache(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +} + /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of @@ -1986,7 +2045,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap) + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; @@ -2002,7 +2062,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - delete_from_swap_cache(folio); + if (!skip_swapcache) + delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. 
Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) @@ -2106,6 +2167,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; + bool skip_swapcache = false; swp_entry_t swap; int error, nr_pages; #ifdef CONFIG_CGROUP_SLI @@ -2130,6 +2192,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { + int order = xa_get_order(&mapping->i_pages, index); + bool fallback_order0 = false; int split_order; /* Or update major stats only when swapin succeeds?? */ @@ -2139,6 +2203,33 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + */ + if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled())) + fallback_order0 = true; + + /* Skip swapcache for synchronous device. */ + if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); + if (!IS_ERR(folio)) { + skip_swapcache = true; + goto alloced; + } + + /* + * Fallback to swapin order-0 folio unless the swap entry + * already exists. + */ + error = PTR_ERR(folio); + folio = NULL; + if (error == -EEXIST) + goto failed; + } + /* * Now swap device can only swap in order 0 folio, then we * should split the large swap entry stored in the pagecache @@ -2175,9 +2266,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } } +alloced: /* We have to do this with folio locked to prevent races */ folio_lock(folio); - if (!folio_test_swapcache(folio) || + if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; @@ -2213,7 +2305,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(folio); + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); + } else { + delete_from_swap_cache(folio); + } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); put_swap_device(si); @@ -2224,8 +2321,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap); + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); unlock: + if (skip_swapcache) + swapcache_clear(si, swap, folio_nr_pages(folio)); if (folio) { folio_unlock(folio); folio_put(folio); -- Gitee From 41f4f5c77f778de32abe4286ff99341817ff3c53 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 25 Feb 2025 17:52:55 +0800 Subject: [PATCH 361/484] mm: shmem: fix potential data corruption during shmem swapin commit 058313515d5aab10d0a01dd634f92ed4a4e71d4c upstream Conflicts: trivial, due to SLI Backport-reason: Shmem Swap Fixes Alex and Kairui reported some issues (system hang or data corruption) when swapping out or swapping in large shmem folios. This is especially easy to reproduce when the tmpfs is mount with the 'huge=within_size' parameter. Thanks to Kairui's reproducer, the issue can be easily replicated. 
The root cause of the problem is that swap readahead may asynchronously swap in order 0 folios into the swap cache, while the shmem mapping can still store large swap entries. Then an order 0 folio is inserted into the shmem mapping without splitting the large swap entry, which overwrites the original large swap entry, leading to data corruption. When getting a folio from the swap cache, we should split the large swap entry stored in the shmem mapping if the orders do not match, to fix this issue. Link: https://lkml.kernel.org/r/2fe47c557e74e9df5fe2437ccdc6c9115fa1bf70.1740476943.git.baolin.wang@linux.alibaba.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Baolin Wang Reported-by: Alex Xu (Hello71) Reported-by: Kairui Song Closes: https://lore.kernel.org/all/1738717785.im3r5g2vxc.none@localhost/ Tested-by: Kairui Song Cc: David Hildenbrand Cc: Lance Yang Cc: Matthew Wilcow Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index bd607e3b44b7..865920d9d762 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2169,7 +2169,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio *folio = NULL; bool skip_swapcache = false; swp_entry_t swap; - int error, nr_pages; + int error, nr_pages, order, split_order; #ifdef CONFIG_CGROUP_SLI u64 start; #endif @@ -2191,10 +2191,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); + order = xa_get_order(&mapping->i_pages, index); if (!folio) { - int order = xa_get_order(&mapping->i_pages, index); bool fallback_order0 = false; - int split_order; /* Or update major stats only when swapin succeeds?? */ if (fault_type) { @@ -2264,6 +2263,29 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, error = -ENOMEM; goto failed; } + } else if (order != folio_order(folio)) { + /* + * Swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still stores + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } } alloced: @@ -2271,7 +2293,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { + !shmem_confirm_swap(mapping, index, swap) || + xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { error = -EEXIST; goto unlock; } -- Gitee From 5f17580def78377fb405519785a8745793a6bf98 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:35 +0800 Subject: [PATCH 362/484] mm: shmem: avoid unpaired folio_unlock() in shmem_swapin_folio() commit e08d5f515613a9860bfee7312461a19f422adb5e upstream Conflicts: none Backport-reason: Shmem Swap Fixes Patch series "Some random fixes and cleanup to shmem", v3. 
This series contains some simple fixes and cleanup which are made during learning shmem. More details can be found in respective patches. This patch (of 5): If we get a folio from swap_cache_get_folio() successfully but encounter a failure before the folio is locked, we will unlock the folio which was not previously locked. Put the folio and set it to NULL when a failure occurs before the folio is locked to fix the issue. Link: https://lkml.kernel.org/r/20250516170939.965736-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250516170939.965736-2-shikemeng@huaweicloud.com Fixes: 058313515d5a ("mm: shmem: fix potential data corruption during shmem swapin") Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Reviewed-by: Kairui Song Cc: Hugh Dickins Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 865920d9d762..ae61901ef302 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2272,6 +2272,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, */ split_order = shmem_split_large_entry(inode, index, swap, gfp); if (split_order < 0) { + folio_put(folio); + folio = NULL; error = split_order; goto failed; } -- Gitee From e71cc1f74ee80abf04f853725b9712a6776fd7c2 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:36 +0800 Subject: [PATCH 363/484] mm: shmem: add missing shmem_unacct_size() in __shmem_file_setup() commit 594ec2ab389ab9cdada13ad85f503bbfc5658e84 upstream Conflicts: none Backport-reason: Shmem Swap Fixes We will miss shmem_unacct_size() when is_idmapped_mnt() returns a failure. Move is_idmapped_mnt() before shmem_acct_size() to fix the issue. Link: https://lkml.kernel.org/r/20250516170939.965736-3-shikemeng@huaweicloud.com Fixes: 7a80e5b8c6fa ("shmem: support idmapped mounts for tmpfs") Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ae61901ef302..718e7a1ddf4b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5329,12 +5329,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); - if (shmem_acct_size(flags, size)) - return ERR_PTR(-ENOMEM); - if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { -- Gitee From 0a463aaec05c54999c72848bbf01ad1c74cff63a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:37 +0800 Subject: [PATCH 364/484] mm/shmem: fix potential dead loop in shmem_unuse() commit 3f778ab1b52450d467109321e2abfc4b36ec2d3e upstream Conflicts: none Backport-reason: Shmem Swap Fixes If multi shmem_unuse() for different swap type is called concurrently, a dead loop could occur as following: shmem_unuse(typeA) shmem_unuse(typeB) mutex_lock(&shmem_swaplist_mutex) list_for_each_entry_safe(info, next, ...) ... mutex_unlock(&shmem_swaplist_mutex) /* info->swapped may drop to 0 */ shmem_unuse_inode(&info->vfs_inode, type) mutex_lock(&shmem_swaplist_mutex) list_for_each_entry(info, next, ...) if (!info->swapped) list_del_init(&info->swaplist) ... 
mutex_unlock(&shmem_swaplist_mutex) mutex_lock(&shmem_swaplist_mutex) /* iterate with offlist entry and encounter a dead loop */ next = list_next_entry(info, swaplist); ... Restart the iteration if the inode is already off shmem_swaplist list to fix the issue. Link: https://lkml.kernel.org/r/20250516170939.965736-4-shikemeng@huaweicloud.com Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity") Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 718e7a1ddf4b..46716a022285 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1413,6 +1413,7 @@ int shmem_unuse(unsigned int type) shrink_page_cache(GFP_KERNEL, NULL); mutex_lock(&shmem_swaplist_mutex); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1431,13 +1432,15 @@ int shmem_unuse(unsigned int type) cond_resched(); mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } mutex_unlock(&shmem_swaplist_mutex); -- Gitee From b3a8774f362db34d5ddcd26a6491a2614281fc58 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 17 May 2025 01:09:39 +0800 Subject: [PATCH 365/484] mm/shmem: remove unneeded xa_is_value() check in shmem_unuse_swap_entries() commit c5a9deace6095c0dca86f5414493bcf1711f1ca3 upstream Conflicts: none Backport-reason: Shmem Swap Fixes As only value entry will be added to fbatch in shmem_find_swap_entries(), there is no need to do xa_is_value() check in shmem_unuse_swap_entries(). Link: https://lkml.kernel.org/r/20250516170939.965736-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 46716a022285..7b740b13a680 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1343,8 +1343,6 @@ static int shmem_unuse_swap_entries(struct inode *inode, for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(folio)) - continue; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { -- Gitee From 170bcbeb603ccc90412ff495d6979286710e40d7 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:52:59 +0800 Subject: [PATCH 366/484] mm/shmem, swap: improve cached mTHP handling and fix potential hang commit 5c241ed8d031693dadf33dd98ed2e7cc363e9b66 upstream Conflicts: none Backport-reason: Shmem Swap Fixes The current swap-in code assumes that, when a swap entry in shmem mapping is order 0, its cached folios (if present) must be order 0 too, which turns out not always correct. 
The problem is shmem_split_large_entry is called before verifying the folio will eventually be swapped in, one possible race is: CPU1 CPU2 shmem_swapin_folio /* swap in of order > 0 swap entry S1 */ folio = swap_cache_get_folio /* folio = NULL */ order = xa_get_order /* order > 0 */ folio = shmem_swap_alloc_folio /* mTHP alloc failure, folio = NULL */ <... Interrupted ...> shmem_swapin_folio /* S1 is swapped in */ shmem_writeout /* S1 is swapped out, folio cached */ shmem_split_large_entry(..., S1) /* S1 is split, but the folio covering it has order > 0 now */ Now any following swapin of S1 will hang: `xa_get_order` returns 0, and folio lookup will return a folio with order > 0. The `xa_get_order(&mapping->i_pages, index) != folio_order(folio)` will always return false causing swap-in to return -EEXIST. And this looks fragile. So fix this up by allowing seeing a larger folio in swap cache, and check the whole shmem mapping range covered by the swapin have the right swap value upon inserting the folio. And drop the redundant tree walks before the insertion. This will actually improve performance, as it avoids two redundant Xarray tree walks in the hot path, and the only side effect is that in the failure path, shmem may redundantly reallocate a few folios causing temporary slight memory pressure. And worth noting, it may seems the order and value check before inserting might help reducing the lock contention, which is not true. The swap cache layer ensures raced swapin will either see a swap cache folio or failed to do a swapin (we have SWAP_HAS_CACHE bit even if swap cache is bypassed), so holding the folio lock and checking the folio flag is already good enough for avoiding the lock contention. The chance that a folio passes the swap entry value check but the shmem mapping slot has changed should be very low. Link: https://lkml.kernel.org/r/20250728075306.12704-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250728075306.12704-2-ryncsn@gmail.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Kairui Song Reviewed-by: Kemeng Shi Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Dev Jain Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7b740b13a680..1b225232deea 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -777,7 +777,9 @@ static int shmem_add_to_page_cache(struct folio *folio, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); - long nr = folio_nr_pages(folio); + unsigned long nr = folio_nr_pages(folio); + swp_entry_t iter, swap; + void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -789,14 +791,25 @@ static int shmem_add_to_page_cache(struct folio *folio, gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); + swap = radix_to_swp_entry(expected); do { + iter = swap; xas_lock_irq(&xas); - if (expected != xas_find_conflict(&xas)) { - xas_set_err(&xas, -EEXIST); - goto unlock; + xas_for_each_conflict(&xas, entry) { + /* + * The range must either be empty, or filled with + * expected swap entries. Shmem swap entries are never + * partially freed without split of both entry and + * folio, so there shouldn't be any holes. 
+ */ + if (!expected || entry != swp_to_radix_entry(iter)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + iter.val += 1 << xas_get_order(&xas); } - if (expected && xas_find_conflict(&xas)) { + if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } @@ -2264,7 +2277,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, error = -ENOMEM; goto failed; } - } else if (order != folio_order(folio)) { + } else if (order > folio_order(folio)) { /* * Swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores @@ -2289,15 +2302,23 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } + } else if (order < folio_order(folio)) { + swap.val = round_down(swap.val, 1 << folio_order(folio)); + index = round_down(index, 1 << folio_order(folio)); } alloced: - /* We have to do this with folio locked to prevent races */ + /* + * We have to do this with the folio locked to prevent races. + * The shmem_confirm_swap below only checks if the first swap + * entry matches the folio, that's enough to ensure the folio + * is not used outside of shmem, as shmem swap entries + * and swap cache folios are never partially freed. + */ folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || - folio->swap.val != swap.val || !shmem_confirm_swap(mapping, index, swap) || - xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { + folio->swap.val != swap.val) { error = -EEXIST; goto unlock; } -- Gitee From 235871be88364c84ba3d29dafadb6209cc8c4e80 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:00 +0800 Subject: [PATCH 367/484] mm/shmem, swap: avoid redundant Xarray lookup during swapin commit 0cfc0e7e3d062b93e9eec6828de000981cdfb152 upstream Conflicts: trivial, comment. Backport-reason: Shmem mTHP Patch series "mm/shmem, swap: bugfix and improvement of mTHP swap in", v6. The current THP swapin path have several problems. It may potentially hang, may cause redundant faults due to false positive swap cache lookup, and it issues redundant Xarray walks. !CONFIG_TRANSPARENT_HUGEPAGE builds may also contain unnecessary THP checks. This series fixes all of the mentioned issues, the code should be more robust and prepared for the swap table series. Now 4 walks is reduced to 3 (get order & confirm, confirm, insert folio), !CONFIG_TRANSPARENT_HUGEPAGE build overhead is also minimized, and comes with a sanity check now. The performance is slightly better after this series, sequential swap in of 24G data from ZRAM, using transparent_hugepage_tmpfs=always (24 samples each): Before: avg: 10.66s, stddev: 0.04 After patch 1: avg: 10.58s, stddev: 0.04 After patch 2: avg: 10.65s, stddev: 0.05 After patch 3: avg: 10.65s, stddev: 0.04 After patch 4: avg: 10.67s, stddev: 0.04 After patch 5: avg: 9.79s, stddev: 0.04 After patch 6: avg: 9.79s, stddev: 0.05 After patch 7: avg: 9.78s, stddev: 0.05 After patch 8: avg: 9.79s, stddev: 0.04 Several patches improve the performance by a little, which is about ~8% faster in total. 
Build kernel test showed very slightly improvement, testing with make -j48 with defconfig in a 768M memcg also using ZRAM as swap, and transparent_hugepage_tmpfs=always (6 test runs): Before: avg: 3334.66s, stddev: 43.76 After patch 1: avg: 3349.77s, stddev: 18.55 After patch 2: avg: 3325.01s, stddev: 42.96 After patch 3: avg: 3354.58s, stddev: 14.62 After patch 4: avg: 3336.24s, stddev: 32.15 After patch 5: avg: 3325.13s, stddev: 22.14 After patch 6: avg: 3285.03s, stddev: 38.95 After patch 7: avg: 3287.32s, stddev: 26.37 After patch 8: avg: 3295.87s, stddev: 46.24 This patch (of 7): Currently shmem calls xa_get_order to get the swap radix entry order, requiring a full tree walk. This can be easily combined with the swap entry value checking (shmem_confirm_swap) to avoid the duplicated lookup and abort early if the entry is gone already. Which should improve the performance. Link: https://lkml.kernel.org/r/20250728075306.12704-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250728075306.12704-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Kemeng Shi Reviewed-by: Dev Jain Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1b225232deea..8550505af36e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -496,15 +496,27 @@ static int shmem_replace_entry(struct address_space *mapping, /* * Sometimes, before we decide whether to proceed or to fail, we must check - * that an entry was not already brought back from swap by a racing thread. + * that an entry was not already brought back or split by a racing thread. * - * Checking page is not enough: by the time a SwapCache page is locked, it - * might be reused, and again be SwapCache, using the same swap as before. + * Checking folio is not enough: by the time a swapcache folio is locked, it + * might be reused, and again be swapcache, using the same swap as before. + * Returns the swap entry's order if it still presents, else returns -1. */ -static bool shmem_confirm_swap(struct address_space *mapping, - pgoff_t index, swp_entry_t swap) +static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, + swp_entry_t swap) { - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); + XA_STATE(xas, &mapping->i_pages, index); + int ret = -1; + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (entry == swp_to_radix_entry(swap)) + ret = xas_get_order(&xas); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + return ret; } /* @@ -2196,16 +2208,20 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, return -EIO; si = get_swap_device(swap); - if (!si) { - if (!shmem_confirm_swap(mapping, index, swap)) + order = shmem_confirm_swap(mapping, index, swap); + if (unlikely(!si)) { + if (order < 0) return -EEXIST; else return -EINVAL; } + if (unlikely(order < 0)) { + put_swap_device(si); + return -EEXIST; + } /* Look it up and read it in.. 
*/ folio = swap_cache_get_folio(swap, NULL, 0); - order = xa_get_order(&mapping->i_pages, index); if (!folio) { bool fallback_order0 = false; @@ -2317,7 +2333,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, */ folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || - !shmem_confirm_swap(mapping, index, swap) || + shmem_confirm_swap(mapping, index, swap) < 0 || folio->swap.val != swap.val) { error = -EEXIST; goto unlock; @@ -2365,7 +2381,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, *foliop = folio; return 0; failed: - if (!shmem_confirm_swap(mapping, index, swap)) + if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap, -- Gitee From db83f1e4d379fcf264748d6bb2ff8520798465c5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Jun 2025 01:17:51 +0800 Subject: [PATCH 368/484] mm/shmem, swap: fix softlockup with mTHP swapin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a05dd8ae5cbb1cb45f349922cfea4f548a5e5d6f upstream Conflicts: none Backport-reason: Shmem mTHP swap Following softlockup can be easily reproduced on my test machine with: echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled swapon /dev/zram0 # zram0 is a 48G swap device mkdir -p /sys/fs/cgroup/memory/test echo 1G > /sys/fs/cgroup/test/memory.max echo $BASHPID > /sys/fs/cgroup/test/cgroup.procs while true; do dd if=/dev/zero of=/tmp/test.img bs=1M count=5120 cat /tmp/test.img > /dev/null rm /tmp/test.img done Then after a while: watchdog: BUG: soft lockup - CPU#0 stuck for 763s! [cat:5787] Modules linked in: zram virtiofs CPU: 0 UID: 0 PID: 5787 Comm: cat Kdump: loaded Tainted: G L 6.15.0.orig-gf3021d9246bc-dirty #118 PREEMPT(voluntary)· Tainted: [L]=SOFTLOCKUP Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015 RIP: 0010:mpol_shared_policy_lookup+0xd/0x70 Code: e9 b8 b4 ff ff 31 c0 c3 cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 41 54 55 53 <48> 8b 1f 48 85 db 74 41 4c 8d 67 08 48 89 fb 48 89 f5 4c 89 e7 e8 RSP: 0018:ffffc90002b1fc28 EFLAGS: 00000202 RAX: 00000000001c20ca RBX: 0000000000724e1e RCX: 0000000000000001 RDX: ffff888118e214c8 RSI: 0000000000057d42 RDI: ffff888118e21518 RBP: 000000000002bec8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000bf4 R11: 0000000000000000 R12: 0000000000000001 R13: 00000000001c20ca R14: 00000000001c20ca R15: 0000000000000000 FS: 00007f03f995c740(0000) GS:ffff88a07ad9a000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f03f98f1000 CR3: 0000000144626004 CR4: 0000000000770eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: shmem_alloc_folio+0x31/0xc0 shmem_swapin_folio+0x309/0xcf0 ? filemap_get_entry+0x117/0x1e0 ? xas_load+0xd/0xb0 ? 
filemap_get_entry+0x101/0x1e0 shmem_get_folio_gfp+0x2ed/0x5b0 shmem_file_read_iter+0x7f/0x2e0 vfs_read+0x252/0x330 ksys_read+0x68/0xf0 do_syscall_64+0x4c/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f03f9a46991 Code: 00 48 8b 15 81 14 10 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 20 ad 01 00 f3 0f 1e fa 80 3d 35 97 10 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec RSP: 002b:00007fff3c52bd28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000040000 RCX: 00007f03f9a46991 RDX: 0000000000040000 RSI: 00007f03f98ba000 RDI: 0000000000000003 RBP: 00007fff3c52bd50 R08: 0000000000000000 R09: 00007f03f9b9a380 R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000040000 R13: 00007f03f98ba000 R14: 0000000000000003 R15: 0000000000000000 The reason is simple, readahead brought some order 0 folio in swap cache, and the swapin mTHP folio being allocated is in conflict with it, so swapcache_prepare fails and causes shmem_swap_alloc_folio to return -EEXIST, and shmem simply retries again and again causing this loop. Fix it by applying a similar fix for anon mTHP swapin. The performance change is very slight, time of swapin 10g zero folios with shmem (test for 12 times): Before: 2.47s After: 2.48s [kasong@tencent.com: add comment] Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250609171751.36305-1-ryncsn@gmail.com Fixes: 1dd44c0af4fa ("mm: shmem: skip swapcache for swapin of synchronous swap device") Signed-off-by: Kairui Song Reviewed-by: Barry Song Acked-by: Nhat Pham Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Chris Li Cc: Hugh Dickins Cc: Kemeng Shi Cc: Usama Arif Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 20 -------------------- mm/shmem.c | 6 +++++- mm/swap.h | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index df72f7277e0c..58b705bb620d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3929,26 +3929,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. - */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. diff --git a/mm/shmem.c b/mm/shmem.c index 8550505af36e..3bda0d1f9db5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2223,6 +2223,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { + int nr_pages = 1 << order; bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? 
*/ @@ -2236,9 +2237,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. */ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled())) + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) fallback_order0 = true; /* Skip swapcache for synchronous device. */ diff --git a/mm/swap.h b/mm/swap.h index faa41d242cd7..a900754b0045 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -105,6 +105,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing mTHP swapin, we need to + * ensure all entries are not cached, otherwise, the mTHP folio will + * be in conflict with the folio in swap cache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -198,6 +217,10 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return 0; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + return 0; +} #endif /* CONFIG_SWAP */ #endif /* _MM_SWAP_H */ -- Gitee From 06cc60c6644b6bfdf517b63598c319fa86c0f138 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:01 +0800 Subject: [PATCH 369/484] mm/shmem, swap: tidy up THP swapin checks commit c262ffd72c8539d16ada8641a6348c5a88f0c542 upstream Conflicts: none Backport-reason: Shmem mTHP swap Move all THP swapin related checks under CONFIG_TRANSPARENT_HUGEPAGE, so they will be trimmed off by the compiler if not needed. And add a WARN if shmem sees a order > 0 entry when CONFIG_TRANSPARENT_HUGEPAGE is disabled, that should never happen unless things went very wrong. There should be no observable feature change except the new added WARN. Link: https://lkml.kernel.org/r/20250728075306.12704-4-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3bda0d1f9db5..4edea6486865 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1913,26 +1913,38 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); + int nr_pages = 1 << order; struct folio *new; void *shadow; - int nr_pages; /* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints. 
*/ gfp &= ~GFP_CONSTRAINT_MASK; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { - gfp_t huge_gfp = vma_thp_gfp_mask(vma); + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (WARN_ON_ONCE(order)) + return ERR_PTR(-EINVAL); + } else if (order) { + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. + */ + if ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(entry, nr_pages) != nr_pages) + return ERR_PTR(-EINVAL); - gfp = limit_gfp_mask(huge_gfp, gfp); + gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } new = shmem_alloc_folio(gfp, order, info, index); if (!new) return ERR_PTR(-ENOMEM); - nr_pages = folio_nr_pages(new); if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, gfp, entry)) { folio_put(new); @@ -2223,9 +2235,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { - int nr_pages = 1 << order; - bool fallback_order0 = false; - /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -2233,20 +2242,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } - /* - * If uffd is active for the vma, we need per-page fault - * fidelity to maintain the uffd semantics, then fallback - * to swapin order-0 folio, as well as for zswap case. - * Any existing sub folio in the swap cache also blocks - * mTHP swapin. - */ - if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled() || - non_swapcache_batch(swap, nr_pages) != nr_pages)) - fallback_order0 = true; - /* Skip swapcache for synchronous device. */ - if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); if (!IS_ERR(folio)) { skip_swapcache = true; -- Gitee From f55113184ffb7359631cee456ab0175d261e709b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:02 +0800 Subject: [PATCH 370/484] mm/shmem, swap: tidy up swap entry splitting commit 91ab656ece137c368a3189dfd42f8c9203a6285c upstream Conflicts: trivial, SLI conflict. Backport-reason: Shmem mTHP swap Instead of keeping different paths of splitting the entry before the swap in start, move the entry splitting after the swapin has put the folio in swap cache (or set the SWAP_HAS_CACHE bit). This way we only need one place and one unified way to split the large entry. Whenever swapin brought in a folio smaller than the shmem swap entry, split the entry and recalculate the entry and index for verification. This removes duplicated codes and function calls, reduces LOC, and the split is less racy as it's guarded by swap cache now. So it will have a lower chance of repeated faults due to raced split. 
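Condensed, the unified reconciliation path now runs only after the swapin result is pinned by the swap cache (or by SWAP_HAS_CACHE), and looks like this sketch of the hunk below, error handling omitted:

	alloced:
		if (order > folio_order(folio)) {
			/* swapin brought a smaller folio: split the large entry */
			split_order = shmem_split_large_entry(inode, index, index_entry, gfp);
			if (split_order > 0) {
				offset = index - round_down(index, 1 << split_order);
				swap = swp_entry(swp_type(swap),
						 swp_offset(index_entry) + offset);
			}
		} else if (order < folio_order(folio)) {
			/* a larger folio already covers this entry: align to it */
			swap.val = round_down(swap.val, 1 << folio_order(folio));
			index = round_down(index, 1 << folio_order(folio));
		}

Either way, a single place reconciles the folio size with the swap entry stored in the mapping, which is what lets the compiler fold shmem_split_large_entry() into its only caller.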
The compiler is also able to optimize the coder further: bloat-o-meter results with GCC 14: With DEBUG_SECTION_MISMATCH (-fno-inline-functions-called-once): ./scripts/bloat-o-meter mm/shmem.o.old mm/shmem.o add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-143 (-143) Function old new delta shmem_swapin_folio 2358 2215 -143 Total: Before=32933, After=32790, chg -0.43% With !DEBUG_SECTION_MISMATCH: add/remove: 0/1 grow/shrink: 1/0 up/down: 1069/-749 (320) Function old new delta shmem_swapin_folio 2871 3940 +1069 shmem_split_large_entry.isra 749 - -749 Total: Before=32806, After=33126, chg +0.98% Since shmem_split_large_entry is only called in one place now. The compiler will either generate more compact code, or inlined it for better performance. Link: https://lkml.kernel.org/r/20250728075306.12704-5-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 56 ++++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4edea6486865..aa7c71ffe7b2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2203,17 +2203,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swap, index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; - swp_entry_t swap; int error, nr_pages, order, split_order; + pgoff_t offset; #ifdef CONFIG_CGROUP_SLI u64 start; #endif VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); - swap = radix_to_swp_entry(*foliop); + index_entry = radix_to_swp_entry(*foliop); + swap = index_entry; *foliop = NULL; if (is_poisoned_swp_entry(swap)) @@ -2261,28 +2263,15 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* - * Now swap device can only swap in order 0 folio, then we - * should split the large swap entry stored in the pagecache - * if necessary. - */ - split_order = shmem_split_large_entry(inode, index, swap, gfp); - if (split_order < 0) { - error = split_order; - goto failed; - } - - /* - * If the large swap entry has already been split, it is + * Now swap device can only swap in order 0 folio, it is * necessary to recalculate the new swap entry based on - * the old order alignment. + * the offset, as the swapin index might be unalgined. */ - if (split_order > 0) { - pgoff_t offset = index - round_down(index, 1 << split_order); - + if (order) { + offset = index - round_down(index, 1 << order); swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } - /* Here we actually start the io */ #ifdef CONFIG_CGROUP_SLI sli_memlat_stat_start(&start); #endif @@ -2294,19 +2283,21 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, error = -ENOMEM; goto failed; } - } else if (order > folio_order(folio)) { + } +alloced: + if (order > folio_order(folio)) { /* - * Swap readahead may swap in order 0 folios into swapcache + * Swapin may get smaller folios due to various reasons: + * It may fallback to order 0 due to memory pressure or race, + * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. 
In such cases, we should split the * large swap entry to prevent possible data corruption. */ - split_order = shmem_split_large_entry(inode, index, swap, gfp); + split_order = shmem_split_large_entry(inode, index, index_entry, gfp); if (split_order < 0) { - folio_put(folio); - folio = NULL; error = split_order; - goto failed; + goto failed_nolock; } /* @@ -2315,16 +2306,14 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * the old order alignment. */ if (split_order > 0) { - pgoff_t offset = index - round_down(index, 1 << split_order); - - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + offset = index - round_down(index, 1 << split_order); + swap = swp_entry(swp_type(swap), swp_offset(index_entry) + offset); } } else if (order < folio_order(folio)) { swap.val = round_down(swap.val, 1 << folio_order(folio)); index = round_down(index, 1 << folio_order(folio)); } -alloced: /* * We have to do this with the folio locked to prevent races. * The shmem_confirm_swap below only checks if the first swap @@ -2388,12 +2377,13 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, shmem_set_folio_swapin_error(inode, index, folio, swap, skip_swapcache); unlock: - if (skip_swapcache) - swapcache_clear(si, swap, folio_nr_pages(folio)); - if (folio) { + if (folio) folio_unlock(folio); +failed_nolock: + if (skip_swapcache) + swapcache_clear(si, folio->swap, folio_nr_pages(folio)); + if (folio) folio_put(folio); - } put_swap_device(si); return error; -- Gitee From bc73ea3456e26ad8b7370c3513a236859ac01f80 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:03 +0800 Subject: [PATCH 371/484] mm/shmem, swap: never use swap cache and readahead for SWP_SYNCHRONOUS_IO commit 69805ea79db6634d4e7d596f3f36667924dc6cbf upstream Conflicts: none Backport-reason: Shmem mTHP swap For SWP_SYNCHRONOUS_IO devices, if a cache bypassing THP swapin failed due to reasons like memory pressure, partially conflicting swap cache or ZSWAP enabled, shmem will fallback to cached order 0 swapin. Right now the swap cache still has a non-trivial overhead, and readahead is not helpful for SWP_SYNCHRONOUS_IO devices, so we should always skip the readahead and swap cache even if the swapin falls back to order 0. So handle the fallback logic without falling back to the cached read. Link: https://lkml.kernel.org/r/20250728075306.12704-6-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index aa7c71ffe7b2..65570c6a036c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1915,6 +1915,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); int nr_pages = 1 << order; struct folio *new; + gfp_t alloc_gfp; void *shadow; /* @@ -1922,6 +1923,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, * limit chance of success with further cpuset and node constraints. 
*/ gfp &= ~GFP_CONSTRAINT_MASK; + alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); @@ -1936,19 +1938,22 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, if ((vma && unlikely(userfaultfd_armed(vma))) || !zswap_never_enabled() || non_swapcache_batch(entry, nr_pages) != nr_pages) - return ERR_PTR(-EINVAL); + goto fallback; - gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + } +retry: + new = shmem_alloc_folio(alloc_gfp, order, info, index); + if (!new) { + new = ERR_PTR(-ENOMEM); + goto fallback; } - - new = shmem_alloc_folio(gfp, order, info, index); - if (!new) - return ERR_PTR(-ENOMEM); if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, - gfp, entry)) { + alloc_gfp, entry)) { folio_put(new); - return ERR_PTR(-ENOMEM); + new = ERR_PTR(-ENOMEM); + goto fallback; } /* @@ -1963,7 +1968,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, */ if (swapcache_prepare(entry, nr_pages)) { folio_put(new); - return ERR_PTR(-EEXIST); + new = ERR_PTR(-EEXIST); + /* Try smaller folio to avoid cache conflict */ + goto fallback; } __folio_set_locked(new); @@ -1977,6 +1984,15 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, folio_add_lru(new); swap_read_folio(new, NULL); return new; +fallback: + /* Order 0 swapin failed, nothing to fallback to, abort */ + if (!order) + return new; + entry.val += index - round_down(index, nr_pages); + alloc_gfp = gfp; + nr_pages = 1; + order = 0; + goto retry; } /* @@ -2253,13 +2269,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* - * Fallback to swapin order-0 folio unless the swap entry - * already exists. + * Direct swapin handled order 0 fallback already, + * if it failed, abort. */ error = PTR_ERR(folio); folio = NULL; - if (error == -EEXIST) - goto failed; + goto failed; } /* -- Gitee From 1e2a8d0cb591009c7f06b6c231a377b9d73d8204 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:04 +0800 Subject: [PATCH 372/484] mm/shmem, swap: simplify swapin path and result handling commit 1326359f22805b2b0e9567ec0099980b8956fc29 upstream Conflicts: minor, due to SLI Backport-reason: Shmem mTHP swap Slightly tidy up the different handling of swap in and error handling for SWP_SYNCHRONOUS_IO and non-SWP_SYNCHRONOUS_IO devices. Now swapin will always use either shmem_swap_alloc_folio or shmem_swapin_cluster, then check the result. Simplify the control flow and avoid a redundant goto label. Link: https://lkml.kernel.org/r/20250728075306.12704-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 65570c6a036c..d30b35d2a999 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2260,46 +2260,38 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } - /* Skip swapcache for synchronous device. 
*/ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + /* Direct swapin skipping swap cache & readahead */ folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); - if (!IS_ERR(folio)) { - skip_swapcache = true; - goto alloced; + if (IS_ERR(folio)) { + error = PTR_ERR(folio); + folio = NULL; + goto failed; } - + skip_swapcache = true; + } else { /* - * Direct swapin handled order 0 fallback already, - * if it failed, abort. + * Cached swapin only supports order 0 folio, it is + * necessary to recalculate the new swap entry based on + * the offset, as the swapin index might be unalgined. */ - error = PTR_ERR(folio); - folio = NULL; - goto failed; - } - - /* - * Now swap device can only swap in order 0 folio, it is - * necessary to recalculate the new swap entry based on - * the offset, as the swapin index might be unalgined. - */ - if (order) { - offset = index - round_down(index, 1 << order); - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); - } - + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } #ifdef CONFIG_CGROUP_SLI - sli_memlat_stat_start(&start); + sli_memlat_stat_start(&start); #endif - folio = shmem_swapin_cluster(swap, gfp, info, index); + folio = shmem_swapin_cluster(swap, gfp, info, index); #ifdef CONFIG_CGROUP_SLI - sli_memlat_stat_end(MEM_LAT_DIRECT_SWAPIN, start); + sli_memlat_stat_end(MEM_LAT_DIRECT_SWAPIN, start); #endif - if (!folio) { - error = -ENOMEM; - goto failed; + if (!folio) { + error = -ENOMEM; + goto failed; + } } } -alloced: if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: -- Gitee From 64f5fc7a1f339ecc5dc1d84ca2c460d9f962ff80 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:05 +0800 Subject: [PATCH 373/484] mm/shmem, swap: rework swap entry and index calculation for large swapin commit 93c0476e705768c7ca902cffea4efb500b9678b4 upstream Conflicts: minor, due to SLI Backport-reason: Shmem mTHP swap Instead of calculating the swap entry differently in different swapin paths, calculate it early before the swap cache lookup and use that for the lookup and later swapin. And after swapin have brought a folio, simply round it down against the size of the folio. This is simple and effective enough to verify the swap value. A folio's swap entry is always aligned by its size. Any kind of parallel split or race is acceptable because the final shmem_add_to_page_cache ensures that all entries covered by the folio are correct, and thus there will be no data corruption. This also prevents false positive cache lookup. If a shmem read request's index points to the middle of a large swap entry, previously, shmem will try the swap cache lookup using the large swap entry's starting value (which is the first sub swap entry of this large entry). This will lead to false positive lookup results if only the first few swap entries are cached but the actual requested swap entry pointed by the index is uncached. This is not a rare event, as swap readahead always tries to cache order 0 folios when possible. And this shouldn't cause any increased repeated faults. Instead, no matter how the shmem mapping is split in parallel, as long as the mapping still contains the right entries, the swapin will succeed. 
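As a rough sketch of the alignment rule described above (condensed from the hunks below, not the literal patched code; all names are from mm/shmem.c):

	/* Before the lookup: index may point into the middle of a large entry. */
	offset = index - round_down(index, 1 << order);
	swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);

	/* After swapin: round both down by the size of the folio we actually got. */
	nr_pages = folio_nr_pages(folio);
	if (nr_pages > 1) {
		swap.val = round_down(swap.val, nr_pages);
		index = round_down(index, nr_pages);
	}

Because a folio's swap entry is always aligned to its size, the rounded values are either correct or will fail the later shmem_add_to_page_cache() verification and be retried.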
The final object size and stack usage are also reduced due to simplified code: ./scripts/bloat-o-meter mm/shmem.o.old mm/shmem.o add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-145 (-145) Function old new delta shmem_swapin_folio 4056 3911 -145 Total: Before=33242, After=33097, chg -0.44% Stack usage (Before vs After): mm/shmem.c:2314:12:shmem_swapin_folio 264 static mm/shmem.c:2314:12:shmem_swapin_folio 256 static And while at it, round down the index too if swap entry is round down. The index is used either for folio reallocation or confirming the mapping content. In either case, it should be aligned with the swap folio. Link: https://lkml.kernel.org/r/20250728075306.12704-8-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 66 +++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index d30b35d2a999..e16e0bb88259 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2202,7 +2202,7 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index, if (xas_error(&xas)) return xas_error(&xas); - return alloced_order; + return 0; } /* @@ -2223,7 +2223,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; - int error, nr_pages, order, split_order; + int error, nr_pages, order; pgoff_t offset; #ifdef CONFIG_CGROUP_SLI u64 start; @@ -2234,11 +2234,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = index_entry; *foliop = NULL; - if (is_poisoned_swp_entry(swap)) + if (is_poisoned_swp_entry(index_entry)) return -EIO; - si = get_swap_device(swap); - order = shmem_confirm_swap(mapping, index, swap); + si = get_swap_device(index_entry); + order = shmem_confirm_swap(mapping, index, index_entry); if (unlikely(!si)) { if (order < 0) return -EEXIST; @@ -2250,6 +2250,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, return -EEXIST; } + /* index may point to the middle of a large entry, get the sub entry */ + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { @@ -2262,7 +2268,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ - folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); + folio = shmem_swap_alloc_folio(inode, vma, index, + index_entry, order, gfp); if (IS_ERR(folio)) { error = PTR_ERR(folio); folio = NULL; @@ -2270,18 +2277,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } skip_swapcache = true; } else { - /* - * Cached swapin only supports order 0 folio, it is - * necessary to recalculate the new swap entry based on - * the offset, as the swapin index might be unalgined. 
- */ - if (order) { - offset = index - round_down(index, 1 << order); - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); - } #ifdef CONFIG_CGROUP_SLI sli_memlat_stat_start(&start); #endif + /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); #ifdef CONFIG_CGROUP_SLI sli_memlat_stat_end(MEM_LAT_DIRECT_SWAPIN, start); @@ -2292,6 +2291,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } } } + if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: @@ -2301,24 +2301,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. */ - split_order = shmem_split_large_entry(inode, index, index_entry, gfp); - if (split_order < 0) { - error = split_order; + error = shmem_split_large_entry(inode, index, index_entry, gfp); + if (error) goto failed_nolock; - } + } - /* - * If the large swap entry has already been split, it is - * necessary to recalculate the new swap entry based on - * the old order alignment. - */ - if (split_order > 0) { - offset = index - round_down(index, 1 << split_order); - swap = swp_entry(swp_type(swap), swp_offset(index_entry) + offset); - } - } else if (order < folio_order(folio)) { - swap.val = round_down(swap.val, 1 << folio_order(folio)); - index = round_down(index, 1 << folio_order(folio)); + /* + * If the folio is large, round down swap and index by folio size. + * No matter what race occurs, the swap layer ensures we either get + * a valid folio that has its swap entry aligned by size, or a + * temporarily invalid one which we'll abort very soon and retry. + * + * shmem_add_to_page_cache ensures the whole range contains expected + * entries and prevents any corruption, so any race split is fine + * too, it will succeed as long as the entries are still there. + */ + nr_pages = folio_nr_pages(folio); + if (nr_pages > 1) { + swap.val = round_down(swap.val, nr_pages); + index = round_down(index, nr_pages); } /* @@ -2354,8 +2355,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } - error = shmem_add_to_page_cache(folio, mapping, - round_down(index, nr_pages), + error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; -- Gitee From bba0dfdb7e119a4e66dae337331ef581cdc205c4 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 28 Jul 2025 15:53:06 +0800 Subject: [PATCH 374/484] mm/shmem, swap: fix major fault counting commit de55be42379cc0561aadfd9e1459239dea70be32 upstream Conflicts: none Backport-reason: Shmem mTHP swap If the swapin failed, don't update the major fault count. There is a long existing comment for doing it this way, now with previous cleanups, we can finally fix it. Link: https://lkml.kernel.org/r/20250728075306.12704-9-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Dev Jain Cc: Hugh Dickins Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index e16e0bb88259..82c398081da7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2259,13 +2259,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. 
*/ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { - /* Or update major stats only when swapin succeeds?? */ - if (fault_type) { - *fault_type |= VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(fault_mm, PGMAJFAULT); - } - if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ folio = shmem_swap_alloc_folio(inode, vma, index, @@ -2290,6 +2283,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } } + if (fault_type) { + *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(fault_mm, PGMAJFAULT); + } } if (order > folio_order(folio)) { -- Gitee From 235f188134938a11faec32f96f80ec5a1b59e4d9 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:22 +0800 Subject: [PATCH 375/484] mm: swap: enable swap_entry_range_free() to drop any kind of last ref commit 64944ef6a13e774546eb7635bdd26ef963335a41 upstream Conflicts: none Backport-reason: Swap allocator cleanup The original VM_BUG_ON only allows swap_entry_range_free() to drop last SWAP_HAS_CACHE ref. By allowing other kind of last ref in VM_BUG_ON, swap_entry_range_free() could be a more general-purpose function able to handle all kind of last ref. Following thi change, also rename swap_entry_range_free() to swap_entries_free() and update it's comment accordingly. This is a preparation to use swap_entries_free() to drop more kind of last ref other than SWAP_HAS_CACHE. [shikemeng@huaweicloud.com: add __maybe_unused attribute for swap_is_last_ref() and update comment] Link: https://lkml.kernel.org/r/20250410153908.612984-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250325162528.68385-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Tested-by: SeongJae Park Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 0d41c54c360f..e4b2f75098ee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,9 +51,9 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); @@ -1472,7 +1472,7 @@ static unsigned char swap_entry_put(struct swap_info_struct *si, ci = lock_cluster(si, offset); usage = swap_entry_put_locked(si, offset, 1); if (!usage) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + swap_entries_free(si, ci, swp_entry(si->type, offset), 1); unlock_cluster(ci); return usage; @@ -1502,7 +1502,7 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) - swap_entry_range_free(si, ci, entry, nr); + swap_entries_free(si, ci, entry, nr); unlock_cluster(ci); return has_cache; fallback: @@ -1519,12 +1519,22 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, } /* - * Drop the last HAS_CACHE flag of swap entries, caller have to - 
* ensure all entries belong to the same cgroup. + * Check if it's the last ref of swap entry in the freeing path. + * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. */ -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages) +static inline bool __maybe_unused swap_is_last_ref(unsigned char count) +{ + return (count == SWAP_HAS_CACHE) || (count == 1) || + (count == SWAP_MAP_SHMEM); +} + +/* + * Drop the last ref of swap entries, caller have to ensure all entries + * belong to the same cgroup and cluster. + */ +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; @@ -1537,7 +1547,7 @@ static void swap_entry_range_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(*map != SWAP_HAS_CACHE); + VM_BUG_ON(!swap_is_last_ref(*map)); *map = 0; } while (++map < map_end); @@ -1560,7 +1570,7 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { if (!swap_entry_put_locked(si, offset, usage)) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + swap_entries_free(si, ci, swp_entry(si->type, offset), 1); } while (++offset < end); unlock_cluster(ci); } @@ -1603,11 +1613,11 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) ci = lock_cluster(si, offset); if (swap_only_has_cache(si, offset, size)) - swap_entry_range_free(si, ci, entry, size); + swap_entries_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entry_range_free(si, ci, entry, 1); + swap_entries_free(si, ci, entry, 1); } } unlock_cluster(ci); -- Gitee From 6726bcffaf95dce884c5a94e1ca5e8e01bb00b3a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:23 +0800 Subject: [PATCH 376/484] mm: swap: use swap_entries_free() to free swap entry in swap_entry_put_locked() commit 835b868878d0127bd29b8c0009dc424a63dadffb upstream Conflicts: none Backport-reason: Swap allocator cleanup In swap_entry_put_locked(), we will set slot to SWAP_HAS_CACHE before using swap_entries_free() to do actual swap entry freeing. This introduce an unnecessary intermediate state. By using swap_entries_free() in swap_entry_put_locked(), we can eliminate the need to set slot to SWAP_HAS_CACHE. This change would make the behavior of swap_entry_put_locked() more consistent with other put() operations which will do actual free work after put last reference. 
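In other words, for the last reference the put path changes roughly as follows (a simplified before/after sketch; see the hunk below for the exact change):

	/* before: the last put parks the slot in an intermediate state */
	WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);

	/* after: the last put frees the entry directly, under the same cluster lock */
	swap_entries_free(si, ci, entry, 1);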
Link: https://lkml.kernel.org/r/20250325162528.68385-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Kairui Song Reviewed-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index e4b2f75098ee..332f7991f8cb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1357,9 +1357,11 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) } static unsigned char swap_entry_put_locked(struct swap_info_struct *si, - unsigned long offset, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned char usage) { + unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; @@ -1391,7 +1393,7 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, if (usage) WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); + swap_entries_free(si, ci, entry, 1); return usage; } @@ -1470,9 +1472,7 @@ static unsigned char swap_entry_put(struct swap_info_struct *si, unsigned char usage; ci = lock_cluster(si, offset); - usage = swap_entry_put_locked(si, offset, 1); - if (!usage) - swap_entries_free(si, ci, swp_entry(si->type, offset), 1); + usage = swap_entry_put_locked(si, ci, entry, 1); unlock_cluster(ci); return usage; @@ -1569,8 +1569,8 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { - if (!swap_entry_put_locked(si, offset, usage)) - swap_entries_free(si, ci, swp_entry(si->type, offset), 1); + swap_entry_put_locked(si, ci, swp_entry(si->type, offset), + usage); } while (++offset < end); unlock_cluster(ci); } @@ -1615,10 +1615,8 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (swap_only_has_cache(si, offset, size)) swap_entries_free(si, ci, entry, size); else { - for (int i = 0; i < size; i++, entry.val++) { - if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entries_free(si, ci, entry, 1); - } + for (int i = 0; i < size; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } unlock_cluster(ci); } -- Gitee From e87ead97310db3fa950024e6dab5844069f1abf0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:24 +0800 Subject: [PATCH 377/484] mm: swap: use swap_entries_free() drop last ref count in swap_entries_put_nr() commit 46e0ab2c62067a8d5e62ae90088b1743af83725d upstream Conflicts: none Backport-reason: Swap allocator cleanup Use swap_entries_free() to directly free swap entries when the swap entries are not cached and referenced, without needing to set swap entries to set intermediate SWAP_HAS_CACHE state. 
Link: https://lkml.kernel.org/r/20250325162528.68385-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 332f7991f8cb..2f343596e821 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1499,10 +1499,11 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, unlock_cluster(ci); goto fallback; } - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) swap_entries_free(si, ci, entry, nr); + else + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); unlock_cluster(ci); return has_cache; fallback: -- Gitee From 19be06bd91ae19a0b3f421a17f9eef65411e789b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:25 +0800 Subject: [PATCH 378/484] mm: swap: drop last SWAP_MAP_SHMEM flag in batch in swap_entries_put_nr() commit f2252acf444764e7de7380201c518ee5ab6fd9da upstream Conflicts: none Backport-reason: Swap allocator cleanup The SWAP_MAP_SHMEM indicates last map from shmem. Therefore we can drop SWAP_MAP_SHMEM in batch in similar way to drop last ref count in batch. Link: https://lkml.kernel.org/r/20250325162528.68385-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 2f343596e821..e33ecc04fe65 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -193,7 +193,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; unsigned char count = *map; - if (swap_count(count) != 1) + if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) return false; while (++map < map_end) { @@ -1488,7 +1488,10 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, unsigned char count; int i; - if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + if (nr <= 1) + goto fallback; + count = swap_count(data_race(si->swap_map[offset])); + if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; /* cross into another cluster */ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) -- Gitee From 6958b3239fa93ec013b4a40b80f133558e9274b7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:26 +0800 Subject: [PATCH 379/484] mm: swap: free each cluster individually in swap_entries_put_map_nr() commit 4d71d9062dd7b2ace56f2351b9f3f06e6c0acf81 upstream Conflicts: none Backport-reason: Swap allocator cleanup 1. Factor out general swap_entries_put_map() helper to drop entries belonging to one cluster. If entries are last map, free entries in batch, otherwise put entries with cluster lock acquired and released only once. 2. Iterate and call swap_entries_put_map() for each cluster in swap_entries_put_nr() to leverage batch-remove for last map belonging to one cluster and reduce lock acquire/release in fallback case. 3. As swap_entries_put_nr() won't handle SWAP_HSA_CACHE drop, rename it to swap_entries_put_map_nr(). 4. As we won't drop each entry invidually with swap_entry_put() now, do reclaim in free_swap_and_cache_nr() because swap_entries_put_map_nr() is general routine to drop reference and the relcaim work should only be done in free_swap_and_cache_nr(). 
Remove stale comment accordingly. Link: https://lkml.kernel.org/r/20250325162528.68385-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 70 +++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index e33ecc04fe65..a5f1b5cd7b79 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1464,25 +1464,10 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) return NULL; } -static unsigned char swap_entry_put(struct swap_info_struct *si, - swp_entry_t entry) -{ - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - unsigned char usage; - - ci = lock_cluster(si, offset); - usage = swap_entry_put_locked(si, ci, entry, 1); - unlock_cluster(ci); - - return usage; -} - -static bool swap_entries_put_nr(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); - unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; @@ -1493,14 +1478,10 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, count = swap_count(data_race(si->swap_map[offset])); if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - /* cross into another cluster */ - if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) - goto fallback; ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster(ci); - goto fallback; + goto locked_fallback; } if (!has_cache) swap_entries_free(si, ci, entry, nr); @@ -1510,15 +1491,34 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, unlock_cluster(ci); return has_cache; fallback: - for (i = 0; i < nr; i++) { - if (data_race(si->swap_map[offset + i])) { - count = swap_entry_put(si, swp_entry(type, offset + i)); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } else { - WARN_ON_ONCE(1); - } + ci = lock_cluster(si, offset); +locked_fallback: + for (i = 0; i < nr; i++, entry.val++) { + count = swap_entry_put_locked(si, ci, entry, 1); + if (count == SWAP_HAS_CACHE) + has_cache = true; + } + unlock_cluster(ci); + return has_cache; + +} + +static bool swap_entries_put_map_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + int cluster_nr, cluster_rest; + unsigned long offset = swp_offset(entry); + bool has_cache = false; + + cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; + while (nr) { + cluster_nr = min(nr, cluster_rest); + has_cache |= swap_entries_put_map(si, entry, cluster_nr); + cluster_rest = SWAPFILE_CLUSTER; + nr -= cluster_nr; + entry.val += cluster_nr; } + return has_cache; } @@ -1818,7 +1818,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - any_only_cache = swap_entries_put_nr(si, entry, nr); + any_only_cache = swap_entries_put_map_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1828,13 +1828,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) goto out; /* - * Now go back over the range trying to reclaim the swap cache. This is - * more efficient for large folios because we will only try to reclaim - * the swap once per folio in the common case. 
If we do - * swap_entry_put() and __try_to_reclaim_swap() in the same loop, the - * latter will get a reference and lock the folio for every individual - * page but will only succeed once the swap slot for every subpage is - * zero. + * Now go back over the range trying to reclaim the swap cache. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; -- Gitee From bda0ed073ac849bf6e668628d3105815143dece5 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:27 +0800 Subject: [PATCH 380/484] mm: swap: factor out helper to drop cache of entries within a single cluster commit d4f8000bd6b0cc33c9dddd369e72a13c4c080cb1 upstream Conflicts: none Backport-reason: Swap allocator cleanup Factor out helper swap_entries_put_cache() from put_swap_folio() to serve as a general-purpose routine for dropping cache flag of entries within a single cluster. Link: https://lkml.kernel.org/r/20250325162528.68385-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index a5f1b5cd7b79..8df57233ca5f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1464,6 +1464,22 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) return NULL; } +static void swap_entries_put_cache(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + unsigned long offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + if (swap_only_has_cache(si, offset, nr)) + swap_entries_free(si, ci, entry, nr); + else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } + unlock_cluster(ci); +} + static bool swap_entries_put_map(struct swap_info_struct *si, swp_entry_t entry, int nr) { @@ -1606,8 +1622,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); @@ -1615,14 +1629,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, size)) - swap_entries_free(si, ci, entry, size); - else { - for (int i = 0; i < size; i++, entry.val++) - swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); - } - unlock_cluster(ci); + swap_entries_put_cache(si, entry, size); } int __swap_count(swp_entry_t entry) -- Gitee From 2869004fd5d066c248ff76f582c8a6d3cc73040c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:28 +0800 Subject: [PATCH 381/484] mm: swap: replace cluster_swap_free_nr() with swap_entries_put_[map/cache]() commit ec9827cd28b13b88517812eb08b13d0ed97ae8f1 upstream Conflicts: none Backport-reason: Swap allocator cleanup Replace cluster_swap_free_nr() with swap_entries_put_[map/cache]() to remove repeat code and leverage batch-remove for entries with last flag. After removing cluster_swap_free_nr, only functions with "_nr" suffix could free entries spanning cross clusters. Add corresponding description in comment of swap_entries_put_map_nr() as is first function with "_nr" suffix and have a non-suffix variant function swap_entries_put_map(). 
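For reference, the cross-cluster chopping that only the "_nr" variant performs (introduced by the earlier swap_entries_put_map_nr() patch in this series) looks like this:

	cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
	while (nr) {
		cluster_nr = min(nr, cluster_rest);
		has_cache |= swap_entries_put_map(si, entry, cluster_nr);
		cluster_rest = SWAPFILE_CLUSTER;
		nr -= cluster_nr;
		entry.val += cluster_nr;
	}

Callers of the non-"_nr" helpers must therefore keep each range within a single cluster, which is exactly what the added comments document.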
Link: https://lkml.kernel.org/r/20250325162528.68385-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 8df57233ca5f..d7ad9a573c5b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1519,6 +1519,11 @@ static bool swap_entries_put_map(struct swap_info_struct *si, } +/* + * Only functions with "_nr" suffix are able to free entries spanning + * cross multi clusters, so ensure the range is within a single cluster + * when freeing entries with functions without "_nr" suffix. + */ static bool swap_entries_put_map_nr(struct swap_info_struct *si, swp_entry_t entry, int nr) { @@ -1580,21 +1585,6 @@ static void swap_entries_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -static void cluster_swap_free_nr(struct swap_info_struct *si, - unsigned long offset, int nr_pages, - unsigned char usage) -{ - struct swap_cluster_info *ci; - unsigned long end = offset + nr_pages; - - ci = lock_cluster(si, offset); - do { - swap_entry_put_locked(si, ci, swp_entry(si->type, offset), - usage); - } while (++offset < end); - unlock_cluster(ci); -} - /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. @@ -1611,7 +1601,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr, 1); + swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); offset += nr; nr_pages -= nr; } @@ -3665,11 +3655,13 @@ int swapcache_prepare(swp_entry_t entry, int nr) return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } +/* + * Caller should ensure entries belong to the same folio so + * the entries won't span cross cluster boundary. + */ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - unsigned long offset = swp_offset(entry); - - cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); + swap_entries_put_cache(si, entry, nr); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) -- Gitee From a9af6b0d0181616e5cacfc4d5e900ebcc9ab2f02 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 22 May 2025 20:25:51 +0800 Subject: [PATCH 382/484] mm: swap: move nr_swap_pages counter decrement from folio_alloc_swap() to swap_range_alloc() commit 4f78252da887ee7e9d1875dd6e07d9baa936c04f upstream Conflicts: none Backport-reason: Swap allocator cleanup Patch series "Some randome fixes and cleanups to swapfile". Patch 0-3 are some random fixes. Patch 4 is a cleanup. More details can be found in respective patches. This patch (of 4): When folio_alloc_swap() encounters a failure in either mem_cgroup_try_charge_swap() or add_to_swap_cache(), nr_swap_pages counter is not decremented for allocated entry. However, the following put_swap_folio() will increase nr_swap_pages counter unpairly and lead to an imbalance. Move nr_swap_pages decrement from folio_alloc_swap() to swap_range_alloc() to pair the nr_swap_pages counting. 
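A condensed view of the imbalance in the failure path (pseudo-flow only; the <allocate swap entry> placeholder stands in for the real allocation call, the other names are from mm/swapfile.c):

	/* folio_alloc_swap() before the fix: */
	entry = <allocate swap entry>;			/* swap_range_alloc() runs here */
	if (mem_cgroup_try_charge_swap(folio, entry))
		goto out_free;				/* nr_swap_pages not yet decremented */
	if (add_to_swap_cache(folio, entry, ...))
		goto out_free;
	atomic_long_sub(size, &nr_swap_pages);		/* only reached on success */
	return 0;
out_free:
	put_swap_folio(folio, entry);			/* free path adds the entries back */

Moving the atomic_long_sub() into swap_range_alloc() makes the decrement happen together with the allocation, so put_swap_folio() in the error path no longer inflates nr_swap_pages.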
Link: https://lkml.kernel.org/r/20250522122554.12209-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250522122554.12209-2-shikemeng@huaweicloud.com Fixes: 0ff67f990bd4 ("mm, swap: remove swap slot cache") Signed-off-by: Kemeng Shi Reviewed-by: Kairui Song Reviewed-by: Baoquan He Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d7ad9a573c5b..5edfd0fb0ccc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1116,6 +1116,7 @@ static void swap_range_alloc(struct swap_info_struct *si, if (vm_swap_full()) schedule_work(&si->reclaim_work); } + atomic_long_sub(nr_entries, &nr_swap_pages); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, @@ -1314,7 +1315,6 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) goto out_free; - atomic_long_sub(size, &nr_swap_pages); return 0; out_free: -- Gitee From a252a2a1ac4f76b17575889303c865f37dde1903 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 22 May 2025 20:25:52 +0800 Subject: [PATCH 383/484] mm: swap: correctly use maxpages in swapon syscall to avoid potential deadloop commit 255116c5b0fa2145ede28c2f7b248df5e73834d1 upstream Conflicts: none Backport-reason: Swap allocator cleanup We use maxpages from read_swap_header() to initialize swap_info_struct, however the maxpages might be reduced in setup_swap_extents() and the si->max is assigned with the reduced maxpages from the setup_swap_extents(). Obviously, this could lead to memory waste as we allocated memory based on larger maxpages, besides, this could lead to a potential deadloop as following: 1) When calling setup_clusters() with larger maxpages, unavailable pages within range [si->max, larger maxpages) are not accounted with inc_cluster_info_page(). As a result, these pages are assumed available but can not be allocated. The cluster contains these pages can be moved to frag_clusters list after it's all available pages were allocated. 2) When the cluster mentioned in 1) is the only cluster in frag_clusters list, cluster_alloc_swap_entry() assume order 0 allocation will never failed and will enter a deadloop by keep trying to allocate page from the only cluster in frag_clusters which contains no actually available page. Call setup_swap_extents() to get the final maxpages before swap_info_struct initialization to fix the issue. After this change, span will include badblocks and will become large value which I think is correct value: In summary, there are two kinds of swapfile_activate operations. 1. Filesystem style: Treat all blocks logical continuity and find usable physical extents in logical range. In this way, si->pages will be actual usable physical blocks and span will be "1 + highest_block - lowest_block". 2. Block device style: Treat all blocks physically continue and only one single extent is added. In this way, si->pages will be si->max and span will be "si->pages - 1". Actually, si->pages and si->max is only used in block device style and span value is set with si->pages. As a result, span value in block device style will become a larger value as you mentioned. I think larger value is correct based on: 1. Span value in filesystem style is "1 + highest_block - lowest_block" which is the range cover all possible phisical blocks including the badblocks. 2. 
For block device style, si->pages is the actual usable block number and is already in pr_info. The original span value before this patch is also refer to usable block number which is redundant in pr_info. [shikemeng@huaweicloud.com: ensure si->pages == si->max - 1 after setup_swap_extents()] Link: https://lkml.kernel.org/r/20250522122554.12209-3-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250718065139.61989-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250522122554.12209-3-shikemeng@huaweicloud.com Fixes: 661383c6111a ("mm: swap: relaim the cached parts that got scanned") Signed-off-by: Kemeng Shi Reviewed-by: Baoquan He Cc: Johannes Weiner Cc: Kairui Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 53 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 5edfd0fb0ccc..450508459b2f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3145,43 +3145,30 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -static int setup_swap_map_and_extents(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned char *swap_map, - unsigned long maxpages, - sector_t *span) +static int setup_swap_map(struct swap_info_struct *si, + union swap_header *swap_header, + unsigned char *swap_map, + unsigned long maxpages) { - unsigned int nr_good_pages; unsigned long i; - int nr_extents; - - nr_good_pages = maxpages - 1; /* omit header page */ + swap_map[0] = SWAP_MAP_BAD; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; - nr_good_pages--; + si->pages--; } } - if (nr_good_pages) { - swap_map[0] = SWAP_MAP_BAD; - si->max = maxpages; - si->pages = nr_good_pages; - nr_extents = setup_swap_extents(si, span); - if (nr_extents < 0) - return nr_extents; - nr_good_pages = si->pages; - } - if (!nr_good_pages) { + if (!si->pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } - return nr_extents; + return 0; } #define SWAP_CLUSTER_INFO_COLS \ @@ -3221,7 +3208,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * Mark unusable pages as unavailable. The clusters aren't * marked free yet, so no list operations are involved yet. * - * See setup_swap_map_and_extents(): header page, bad pages, + * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. 
*/ inc_cluster_info_page(si, cluster_info, 0); @@ -3358,6 +3345,21 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } + si->max = maxpages; + si->pages = maxpages - 1; + nr_extents = setup_swap_extents(si, &span); + if (nr_extents < 0) { + error = nr_extents; + goto bad_swap_unlock_inode; + } + if (si->pages != si->max - 1) { + pr_err("swap:%u != (max:%u - 1)\n", si->pages, si->max); + error = -EINVAL; + goto bad_swap_unlock_inode; + } + + maxpages = si->max; + /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { @@ -3369,12 +3371,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, - maxpages, &span); - if (unlikely(nr_extents < 0)) { - error = nr_extents; + error = setup_swap_map(si, swap_header, swap_map, maxpages); + if (error) goto bad_swap_unlock_inode; - } /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might -- Gitee From f47ec22ba609418c629d512ed2a6f3a04aaf30ba Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 22 May 2025 20:25:53 +0800 Subject: [PATCH 384/484] mm: swap: fix potential buffer overflow in setup_clusters() commit 152c1339dc13ad46f1b136e8693de15980750835 upstream Conflicts: none Backport-reason: Swap allocator cleanup In setup_swap_map(), we only ensure badpages are in range (0, last_page]. As maxpages might be < last_page, setup_clusters() will encounter a buffer overflow when a badpage is >= maxpages. Only call inc_cluster_info_page() for badpage which is < maxpages to fix the issue. Link: https://lkml.kernel.org/r/20250522122554.12209-4-shikemeng@huaweicloud.com Fixes: b843786b0bd0 ("mm: swapfile: fix SSD detection with swapfile on btrfs") Signed-off-by: Kemeng Shi Reviewed-by: Baoquan He Cc: Johannes Weiner Cc: Kairui Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 450508459b2f..5db0843c4cb8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3212,9 +3212,13 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * and the EOF part of the last cluster. */ inc_cluster_info_page(si, cluster_info, 0); - for (i = 0; i < swap_header->info.nr_badpages; i++) - inc_cluster_info_page(si, cluster_info, - swap_header->info.badpages[i]); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + + if (page_nr >= maxpages) + continue; + inc_cluster_info_page(si, cluster_info, page_nr); + } for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) inc_cluster_info_page(si, cluster_info, i); -- Gitee From 2bd25bfcb5ed07eee92903e74068bac76f2deab7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 22 May 2025 20:25:54 +0800 Subject: [PATCH 385/484] mm: swap: remove stale comment stale comment in cluster_alloc_swap_entry() commit 8678d1faf1d4c9c6a87237203fc23c9389e8f143 upstream Conflicts: none Backport-reason: Swap allocator cleanup As cluster_next_cpu was already dropped, the associated comment is stale now. 
Link: https://lkml.kernel.org/r/20250522122554.12209-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Kairui Song Reviewed-by: Baoquan He Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 5db0843c4cb8..793187b338fa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -957,9 +957,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o } /* - * We don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock + * We don't have free cluster but have some clusters in discarding, + * do discard now and reclaim them. */ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; -- Gitee From d8af936310571475a02de9327a7a2fa50f1a278e Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 14 Sep 2025 01:46:59 +0800 Subject: [PATCH 386/484] mm, swap: do not do discard during allocation Upstream: pending Since commit 1b7e90020eb77 ("mm, swap: use percpu cluster as allocation fast path"), swap allocation is protected by a local lock, which means we can't make any sleeping calls during allocation. However, this discard path was not taken care of. When the swap allocator fails to find any usable cluster, it will look at the pending discard clusters and try to issue some discards in a synchronized way. Although this discard does not necessarily sleep, it has a cond_resched at the bio layer, which is just wrong when combined with a local lock. And the bio GFP flag used here is wrong (not atomic). It's arguable whether this synchronized discard is helpful at all. In most cases the async discard is good enough, so it is very rare to see discard clusters piling up. And the swap allocator organizes the clusters very differently now from when the synchronized discard was first introduced. So far no issue was observed with typical SSD setups under months of high pressure. This issue was found during code review. But by modifying the kernel a bit (adding an mdelay(100) in the async discard path), this issue becomes observable, with a WARNING triggered by the wrong GFP and the cond_resched in the bio layer. So let's fix this issue in a safe way: remove the discard from the swap allocation path, and when a complete order 0 allocation attempt fails, try a discard following the swap device priority list. If any discard released some clusters, try the allocation again. This way we can still avoid OOM under great pressure even when the hardware is very slow. Fixes: 1b7e90020eb77 ("mm, swap: use percpu cluster as allocation fast path") Signed-off-by: Kairui Song --- mm/swapfile.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 793187b338fa..16f397114271 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -956,13 +956,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o } } - /* - * We don't have free cluster but have some clusters in discarding, - * do discard now and reclaim them. - */ - if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) - goto new_cluster; - if (order) goto done; @@ -1253,6 +1246,34 @@ static bool swap_alloc_slow(swp_entry_t *entry, return false; } +/* + * Discard pending clusters in a synchronized way when under high + * pressure to avoid OOM. 
+ * Return: true if any cluster is discarded. + */ +bool swap_sync_discard(void) +{ + bool ret = false; + int nid = numa_node_id(); + struct swap_info_struct *si, *next; + + spin_lock(&swap_avail_lock); + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { + spin_unlock(&swap_avail_lock); + if (get_swap_device_info(si)) { + if (si->flags & SWP_PAGE_DISCARD) + ret = swap_do_scheduled_discard(si); + put_swap_device(si); + } + if (ret) + break; + spin_lock(&swap_avail_lock); + } + spin_unlock(&swap_avail_lock); + + return ret; +} + /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap @@ -1291,11 +1312,17 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) } } +again: local_lock(&percpu_swap_cluster.lock); if (!swap_alloc_fast(&entry, order)) swap_alloc_slow(&entry, order); local_unlock(&percpu_swap_cluster.lock); + if (unlikely(!order && !entry.val)) { + if (swap_sync_discard()) + goto again; + } + /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ if (mem_cgroup_try_charge_swap(folio, entry)) goto out_free; -- Gitee From 391242b7708586126e7857c1a2ec82d46d7cf36e Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 14 Sep 2025 19:32:31 +0800 Subject: [PATCH 387/484] mm, swap: avoid checking of invalid PTE on swapin Upsteam: pending When the faulting PTE have a smaller entry value, the rounding down could result in a an expected PTE value of zero. Zero value PTE is usally not a valid swap entry PTE, and triggers the VM_WARN_ON(!is_swap_pte(pte)) check in swap_pte_batch: ------------[ cut here ]------------ WARNING: CPU: 2 PID: 891513 at mm/internal.h:313 do_swap_page+0x107c/0x1d08 CPU: 2 PID: 891513 Comm: sh Tainted: G W 6.6.104.mthp-gd00cdaf0b59b #53 Hardware name: QEMU QEMU Virtual Machine, BIOS edk2-stable202408-prebuilt.qemu.org 08/13/2024 pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : do_swap_page+0x107c/0x1d08 lr : do_swap_page+0x105c/0x1d08 sp : ffff8000a1643bc0 x29: ffff8000a1643c20 x28: 000000000000003b x27: 0000000000000004 x26: 0000000000000000 x25: 000000000000001c x24: ffff0000516935f0 x23: ffff000016e97100 x22: 0000000000000001 x21: ffff8000a1643cc0 x20: 0000000000000000 x19: 000000000000000d x18: ffff800080efcceb x17: 0000000000000001 x16: ffffffffffffffff x15: 0000000000000003 x14: ffff8000815332b0 x13: 0000000000000003 x12: 000000000000003b x11: 0000000000000004 x10: 0000000000000010 x9 : 4584e1dfbb1cbf00 x8 : 4584e1dfbb1cbf00 x7 : 205d353630393035 x6 : ffff800080c056c0 x5 : 0000000000000000 x4 : 0000000000000001 x3 : 0000000000000000 x2 : ffff00005fcc7e58 x1 : ffff00005fcba548 x0 : 0000000000000044 Call trace: do_swap_page+0x107c/0x1d08 handle_mm_fault+0x2a4/0x2040 do_page_fault+0x1c8/0x478 do_translation_fault+0x44/0x68 do_mem_abort+0x48/0xa0 el0_da+0x64/0x120 el0t_64_sync_handler+0x60/0x100 el0t_64_sync+0x1a4/0x1a8 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x5dc/0xff8 softirqs last enabled at (0): [] copy_process+0x5f0/0xff8 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace 0000000000000000 ]--- For example if the faulting PTE have a value of 0xd00, which is valid swap entry PTE. But if the swapin attempt to swapin 16 pages at once and round down the value, it will result in a PTE value of zero. 
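Concretely, the check in can_swapin_thp() ends up comparing against an underflowed expected value (illustrative sketch based on the hunk below; the exact swap PTE encoding is architecture specific):

	idx = (vmf->address - addr) / PAGE_SIZE;	/* position inside the 16-page batch */
	pte = ptep_get(ptep);				/* first PTE of the batch, may be none */
	/*
	 * A small swap PTE (e.g. 0xd00) shifted back by idx slots can come out
	 * as 0, which pte_same() happily matches against a none PTE, so the
	 * none case must be rejected before swap_pte_batch() sees it and warns.
	 */
	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || pte_none(pte))
		return false;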
Signed-off-by: Kairui Song --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 58b705bb620d..374e7c13f228 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3944,7 +3944,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) idx = (vmf->address - addr) / PAGE_SIZE; pte = ptep_get(ptep); - if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) + if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || pte_none(pte)) return false; entry = pte_to_swp_entry(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) -- Gitee From 8c1fa1a193438b68204d15fa90bf7a9e8cb278ca Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Jan 2024 17:51:08 +0000 Subject: [PATCH 388/484] afs: Don't use certain unnecessary folio_*() functions commit fa7d614da3c556c7ef71023cb8c410a3e8571a42 upstream Conflicts: none Backport-reason: Remove folio_index to prepare for swap table Filesystems should use folio->index and folio->mapping, instead of folio_index(folio), folio_mapping() and folio_file_mapping() since they know that it's in the pagecache. Change this automagically with: perl -p -i -e 's/folio_mapping[(]([^)]*)[)]/\1->mapping/g' fs/afs/*.c perl -p -i -e 's/folio_file_mapping[(]([^)]*)[)]/\1->mapping/g' fs/afs/*.c perl -p -i -e 's/folio_index[(]([^)]*)[)]/\1->index/g' fs/afs/*.c Reported-by: Matthew Wilcox Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kairui Song --- fs/afs/dir.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index e57e71e2b18f..76f60d4516de 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -125,7 +125,7 @@ static void afs_dir_read_cleanup(struct afs_read *req) if (xas_retry(&xas, folio)) continue; BUG_ON(xa_is_value(folio)); - ASSERTCMP(folio_file_mapping(folio), ==, mapping); + ASSERTCMP(folio->mapping, ==, mapping); folio_put(folio); } @@ -203,12 +203,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); for (offset = 0; offset < size; offset += sizeof(*block)) { block = kmap_local_folio(folio, offset); - pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block); + pr_warn("[%02lx] %32phN\n", folio->index + offset, block); kunmap_local(block); } } @@ -234,7 +234,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { afs_dir_dump(dvnode, req); @@ -2044,7 +2044,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) { struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio)); + _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index); folio_detach_private(folio); -- Gitee From 48575b8deec08b256b218aee74e5a8a8f50baaf4 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 14 Sep 2025 20:32:35 +0800 Subject: [PATCH 389/484] afs: Don't use certain unnecessary folio_*() functions Upstream: following upstream style Filesystems should use folio->index and folio->mapping, instead of folio_index(folio), 
folio_mapping() and folio_file_mapping() since they know that it's in the pagecache. Change this automagically with: perl -p -i -e 's/folio_mapping[(]([^)]*)[)]/\1->mapping/g' fs/afs/*.c perl -p -i -e 's/folio_file_mapping[(]([^)]*)[)]/\1->mapping/g' fs/afs/*.c perl -p -i -e 's/folio_index[(]([^)]*)[)]/\1->index/g' fs/afs/*.c Also update a header file manually. Signed-off-by: Kairui Song --- fs/afs/file.c | 2 +- fs/afs/write.c | 16 ++++++++-------- include/trace/events/afs.h | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/afs/file.c b/fs/afs/file.c index 0012ea300eb5..c2e160151972 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -490,7 +490,7 @@ static bool afs_release_folio(struct folio *folio, gfp_t gfp) struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); _enter("{{%llx:%llu}[%lu],%lx},%x", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, + vnode->fid.vid, vnode->fid.vnode, folio->index, folio->flags, gfp); /* deny if folio is being written to the cache and the caller hasn't diff --git a/fs/afs/write.c b/fs/afs/write.c index 948db2be26ec..9d4900f2d68b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -88,7 +88,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, if (ret < 0) return ret; - index = folio_index(folio); + index = folio->index; from = pos - index * PAGE_SIZE; to = from + len; @@ -162,7 +162,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, loff_t i_size, write_end_pos; _enter("{%llx:%llu},{%lx}", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); + vnode->fid.vid, vnode->fid.vnode, folio->index); if (!folio_test_uptodate(folio)) { if (copied < len) { @@ -206,7 +206,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, } if (folio_mark_dirty(folio)) - _debug("dirtied %lx", folio_index(folio)); + _debug("dirtied %lx", folio->index); out: folio_unlock(folio); @@ -304,7 +304,7 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign xas_for_each(&xas, folio, end) { if (!folio_test_writeback(folio)) { kdebug("bad %x @%llx page %lx %lx", - len, start, folio_index(folio), end); + len, start, folio->index, end); ASSERT(folio_test_writeback(folio)); } @@ -493,7 +493,7 @@ static void afs_extend_writeback(struct address_space *mapping, continue; if (xa_is_value(folio)) break; - if (folio_index(folio) != index) + if (folio->index != index) break; if (!folio_try_get(folio)) { @@ -593,7 +593,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, long count = wbc->nr_to_write; int ret; - _enter(",%lx,%llx-%llx", folio_index(folio), start, end); + _enter(",%lx,%llx-%llx", folio->index, start, end); if (folio_start_writeback(folio)) BUG(); @@ -726,7 +726,7 @@ static int afs_writepages_region(struct address_space *mapping, folio = fbatch.folios[i]; start = folio_pos(folio); /* May regress with THPs */ - _debug("wback %lx", folio_index(folio)); + _debug("wback %lx", folio->index); /* At this point we hold neither the i_pages lock nor the * page lock: the page may be truncated or invalidated @@ -917,7 +917,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) unsigned long priv; vm_fault_t ret = VM_FAULT_RETRY; - _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); + _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio->index); afs_validate(vnode, af->key); diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index ceb0146ffc7c..2f0a2d1aae33 100644 --- 
a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -861,7 +861,7 @@ TRACE_EVENT(afs_folio_dirty, unsigned long priv = (unsigned long)folio_get_private(folio); __entry->vnode = vnode; __entry->where = where; - __entry->index = folio_index(folio); + __entry->index = folio->index; __entry->from = afs_folio_dirty_from(folio, priv); __entry->to = afs_folio_dirty_to(folio, priv); __entry->to |= (afs_is_folio_dirty_mmapped(priv) ? -- Gitee From 43449cbb4628ea3c2bcab348aed59a12f53fd356 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 14 Sep 2025 20:35:08 +0800 Subject: [PATCH 390/484] netfs: Don't use certain unnecessary folio_*() functions Upstream: following upstream style Filesystems should use folio->index and folio->mapping, instead of folio_index(folio), folio_mapping() and folio_file_mapping() since they know that it's in the pagecache. Change this automagically with: perl -p -i -e 's/folio_mapping[(]([^)]*)[)]/\1->mapping/g' fs/netfs/*.c perl -p -i -e 's/folio_file_mapping[(]([^)]*)[)]/\1->mapping/g' fs/netfs/*.c perl -p -i -e 's/folio_index[(]([^)]*)[)]/\1->index/g' fs/netfs/*.c Signed-off-by: Kairui Song --- fs/netfs/buffered_read.c | 8 ++++---- fs/netfs/io.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a4bde65bc1a5..7c5b4181ebad 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -90,7 +90,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio_index(folio) == rreq->no_unlock_folio && + if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) _debug("no unlock"); else @@ -223,12 +223,12 @@ EXPORT_SYMBOL(netfs_readahead); */ int netfs_read_folio(struct file *file, struct folio *folio) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); int ret; - _enter("%lx", folio_index(folio)); + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_pos(folio), folio_size(folio), @@ -387,7 +387,7 @@ int netfs_write_begin(struct netfs_inode *ctx, ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio_index(folio); + rreq->no_unlock_folio = folio->index; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); if (ctx->ops->begin_cache_operation) { diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 7f753380e047..683d7432524c 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -127,9 +127,9 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. 
*/ - if (have_unlocked && folio_index(folio) <= unlocked) + if (have_unlocked && folio->index <= unlocked) continue; - unlocked = folio_index(folio); + unlocked = folio->index; folio_end_fscache(folio); have_unlocked = true; } -- Gitee From 68fecffbe1ca7800c62c030f8cbc97867a73949a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2024 07:26:48 +0200 Subject: [PATCH 391/484] nfs: remove dead code for the old swap over NFS implementation commit 7e8e78a0ba00c88f0ded86de64bdddc82e06b196 upstream Conflicts: trivial Backport-reason: Remove folio_index to prepare for swap table Remove the code testing folio_test_swapcache either explicitly or implicitly in pagemap.h headers, as is now handled using the direct I/O path and not the buffered I/O path that these helpers are located in. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Anna Schumaker Signed-off-by: Kairui Song --- fs/nfs/file.c | 6 +- fs/nfs/filelayout/filelayout.c | 1 - fs/nfs/fscache.c | 2 +- fs/nfs/internal.h | 6 +- fs/nfs/pagelist.c | 2 +- fs/nfs/pnfs.h | 22 ------ fs/nfs/pnfs_nfs.c | 47 ------------ fs/nfs/read.c | 2 +- fs/nfs/write.c | 128 +++++++-------------------------- include/linux/nfs_page.h | 4 +- 10 files changed, 35 insertions(+), 185 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5ebd077d7551..72864fb577aa 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -426,7 +426,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, static void nfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); @@ -453,7 +453,7 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp) if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || current_is_kswapd() || current_is_kcompactd()) return false; - if (nfs_wb_folio(folio_file_mapping(folio)->host, folio) < 0) + if (nfs_wb_folio(folio->mapping->host, folio) < 0) return false; } return nfs_fscache_release_folio(folio, gfp); @@ -607,7 +607,7 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); folio_lock(folio); - mapping = folio_file_mapping(folio); + mapping = folio->mapping; if (mapping != inode->i_mapping) goto out_unlock; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 569ae4ec6084..76e409115bee 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1118,7 +1118,6 @@ static const struct pnfs_commit_ops filelayout_commit_ops = { .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .search_commit_reqs = pnfs_generic_search_commit_reqs, .commit_pagelist = filelayout_commit_pagelist, }; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 60a3c28784e0..28f92f5853de 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -347,7 +347,7 @@ void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) int nfs_netfs_folio_unlock(struct folio *folio) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; /* * If fscache is enabled, netfs will unlock pages. 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ce194656ae52..9395ea66f64d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -778,7 +778,7 @@ static inline void nfs_folio_mark_unstable(struct folio *folio, struct nfs_commit_info *cinfo) { if (folio && !cinfo->dreq) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; long nr = folio_nr_pages(folio); /* This page is really still in write-back - just that the @@ -795,10 +795,10 @@ static inline void nfs_folio_mark_unstable(struct folio *folio, */ static inline size_t nfs_folio_length(struct folio *folio) { - loff_t i_size = i_size_read(folio_file_mapping(folio)->host); + loff_t i_size = i_size_read(folio->mapping->host); if (i_size > 0) { - pgoff_t index = folio_index(folio) >> folio_order(folio); + pgoff_t index = folio->index >> folio_order(folio); pgoff_t end_index = (i_size - 1) >> folio_shift(folio); if (index < end_index) return folio_size(folio); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 040b6b79c75e..3b006bcbcc87 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -569,7 +569,7 @@ struct nfs_page *nfs_page_create_from_folio(struct nfs_open_context *ctx, if (IS_ERR(l_ctx)) return ERR_CAST(l_ctx); - ret = nfs_page_create(l_ctx, offset, folio_index(folio), offset, count); + ret = nfs_page_create(l_ctx, offset, folio->index, offset, count); if (!IS_ERR(ret)) { nfs_page_assign_folio(ret, folio); nfs_page_group_init(ret, NULL); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 79996d7dad0f..5673787ebced 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -193,8 +193,6 @@ struct pnfs_commit_ops { int max); void (*recover_commit_reqs) (struct list_head *list, struct nfs_commit_info *cinfo); - struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, - struct folio *folio); }; struct pnfs_layout_hdr { @@ -395,8 +393,6 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data); void pnfs_generic_rw_release(void *data); void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo); -struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, - struct folio *folio); int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, @@ -557,17 +553,6 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) fl_cinfo->ops->recover_commit_reqs(head, cinfo); } -static inline struct nfs_page * -pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct folio *folio) -{ - struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - - if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs) - return NULL; - return fl_cinfo->ops->search_commit_reqs(cinfo, folio); -} - /* Should the pNFS client commit and return the layout upon a setattr */ static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) @@ -864,13 +849,6 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) { } -static inline struct nfs_page * -pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct folio *folio) -{ - return NULL; -} - static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { return 0; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 1b317c44da12..8021612c1670 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -351,53 +351,6 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, } EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); 
-static struct nfs_page * -pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, - unsigned int nbuckets, struct folio *folio) -{ - struct nfs_page *req; - struct pnfs_commit_bucket *b; - unsigned int i; - - /* Linearly search the commit lists for each bucket until a matching - * request is found */ - for (i = 0, b = buckets; i < nbuckets; i++, b++) { - list_for_each_entry(req, &b->written, wb_list) { - if (nfs_page_to_folio(req) == folio) - return req->wb_head; - } - list_for_each_entry(req, &b->committing, wb_list) { - if (nfs_page_to_folio(req) == folio) - return req->wb_head; - } - } - return NULL; -} - -/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request - * for @folio - * @cinfo - commit info for current inode - * @folio - page to search for matching head request - * - * Return: the head request if one is found, otherwise %NULL. - */ -struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, - struct folio *folio) -{ - struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - struct pnfs_commit_array *array; - struct nfs_page *req; - - list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { - req = pnfs_bucket_search_commit_reqs(array->buckets, - array->nbuckets, folio); - if (req) - return req; - } - return NULL; -} -EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs); - static struct pnfs_layout_segment * pnfs_bucket_get_committing(struct list_head *head, struct pnfs_commit_bucket *bucket, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index ed411897e33f..bb2f5c0a9484 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -289,7 +289,7 @@ int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; struct nfs_server *server = NFS_SERVER(inode); size_t fsize = folio_size(folio); unsigned int rsize = server->rsize; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a7110d3f0221..be7bffcbfa40 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -63,9 +63,6 @@ static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); -static struct nfs_page * -nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct folio *folio); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -178,16 +175,16 @@ static struct nfs_page *nfs_folio_private_request(struct folio *folio) } /** - * nfs_folio_find_private_request - find head request associated with a folio + * nfs_folio_find_head_request - find head request associated with a folio * @folio: pointer to folio * * must be called while holding the inode lock. * * returns matching head request with reference held, or NULL if not found. 
*/ -static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) +static struct nfs_page *nfs_folio_find_head_request(struct folio *folio) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct nfs_page *req; if (!folio_test_private(folio)) @@ -202,45 +199,9 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) return req; } -static struct nfs_page *nfs_folio_find_swap_request(struct folio *folio) -{ - struct inode *inode = folio_file_mapping(folio)->host; - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_page *req = NULL; - if (!folio_test_swapcache(folio)) - return NULL; - mutex_lock(&nfsi->commit_mutex); - if (folio_test_swapcache(folio)) { - req = nfs_page_search_commits_for_head_request_locked(nfsi, - folio); - if (req) { - WARN_ON_ONCE(req->wb_head != req); - kref_get(&req->wb_kref); - } - } - mutex_unlock(&nfsi->commit_mutex); - return req; -} - -/** - * nfs_folio_find_head_request - find head request associated with a folio - * @folio: pointer to folio - * - * returns matching head request with reference held, or NULL if not found. - */ -static struct nfs_page *nfs_folio_find_head_request(struct folio *folio) -{ - struct nfs_page *req; - - req = nfs_folio_find_private_request(folio); - if (!req) - req = nfs_folio_find_swap_request(folio); - return req; -} - static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; struct nfs_page *req, *head; int ret; @@ -261,8 +222,6 @@ static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio) /* Ensure that nobody removed the request before we locked it */ if (head == nfs_folio_private_request(folio)) break; - if (folio_test_swapcache(folio)) - break; nfs_unlock_and_release_request(head); } return head; @@ -272,14 +231,14 @@ static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio) static void nfs_grow_file(struct folio *folio, unsigned int offset, unsigned int count) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; loff_t end, i_size; pgoff_t end_index; spin_lock(&inode->i_lock); i_size = i_size_read(inode); end_index = ((i_size - 1) >> folio_shift(folio)) << folio_order(folio); - if (i_size > 0 && folio_index(folio) < end_index) + if (i_size > 0 && folio->index < end_index) goto out; end = folio_pos(folio) + (loff_t)offset + (loff_t)count; if (i_size >= end) @@ -309,7 +268,7 @@ static void nfs_set_pageerror(struct address_space *mapping) static void nfs_mapping_set_error(struct folio *folio, int error) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; folio_set_error(folio); filemap_set_wb_err(mapping, error); @@ -410,7 +369,7 @@ int nfs_congestion_kb; static void nfs_folio_set_writeback(struct folio *folio) { - struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host); + struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); folio_start_writeback(folio); if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) @@ -419,7 +378,7 @@ static void nfs_folio_set_writeback(struct folio *folio) static void nfs_folio_end_writeback(struct folio *folio) { - struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host); + struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); folio_end_writeback(folio); if 
(atomic_long_dec_return(&nfss->writeback) < @@ -565,7 +524,7 @@ void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, */ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; struct nfs_page *head; struct nfs_commit_info cinfo; int ret; @@ -641,7 +600,7 @@ static int nfs_page_async_flush(struct folio *folio, nfs_redirty_request(req); pgio->pg_error = 0; } else - nfs_add_stats(folio_file_mapping(folio)->host, + nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1); out: return ret; @@ -653,7 +612,7 @@ static int nfs_page_async_flush(struct folio *folio, static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - nfs_pageio_cond_complete(pgio, folio_index(folio)); + nfs_pageio_cond_complete(pgio, folio->index); return nfs_page_async_flush(folio, wbc, pgio); } @@ -664,7 +623,7 @@ static int nfs_writepage_locked(struct folio *folio, struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; int err; if (wbc->sync_mode == WB_SYNC_NONE && @@ -757,24 +716,17 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) static void nfs_inode_add_request(struct nfs_page *req) { struct folio *folio = nfs_page_to_folio(req); - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct nfs_inode *nfsi = NFS_I(mapping->host); WARN_ON_ONCE(req->wb_this_page != req); /* Lock the request! */ nfs_lock_request(req); - - /* - * Swap-space should not get truncated. Hence no need to plug the race - * with invalidate/truncate. - */ spin_lock(&mapping->private_lock); - if (likely(!folio_test_swapcache(folio))) { - set_bit(PG_MAPPED, &req->wb_flags); - folio_set_private(folio); - folio->private = req; - } + set_bit(PG_MAPPED, &req->wb_flags); + folio_set_private(folio); + folio->private = req; spin_unlock(&mapping->private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an @@ -794,10 +746,10 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; spin_lock(&mapping->private_lock); - if (likely(folio && !folio_test_swapcache(folio))) { + if (likely(folio)) { folio->private = NULL; folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); @@ -818,38 +770,6 @@ static void nfs_mark_request_dirty(struct nfs_page *req) filemap_dirty_folio(folio_mapping(folio), folio); } -/* - * nfs_page_search_commits_for_head_request_locked - * - * Search through commit lists on @inode for the head request for @folio. - * Must be called while holding the inode (which is cinfo) lock. - * - * Returns the head request if found, or NULL if not found. 
- */ -static struct nfs_page * -nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct folio *folio) -{ - struct nfs_page *freq, *t; - struct nfs_commit_info cinfo; - struct inode *inode = &nfsi->vfs_inode; - - nfs_init_cinfo_from_inode(&cinfo, inode); - - /* search through pnfs commit lists */ - freq = pnfs_search_commit_reqs(inode, &cinfo, folio); - if (freq) - return freq->wb_head; - - /* Linearly search the commit list for the correct request */ - list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { - if (nfs_page_to_folio(freq) == folio) - return freq->wb_head; - } - - return NULL; -} - /** * nfs_request_add_commit_list_locked - add request to a commit list * @req: pointer to a struct nfs_page @@ -956,7 +876,7 @@ static void nfs_folio_clear_commit(struct folio *folio) long nr = folio_nr_pages(folio); node_stat_mod_folio(folio, NR_WRITEBACK, -nr); - wb_stat_mod(&inode_to_bdi(folio_file_mapping(folio)->host)->wb, + wb_stat_mod(&inode_to_bdi(folio->mapping->host)->wb, WB_WRITEBACK, -nr); } } @@ -1141,7 +1061,7 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, */ nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); - error = nfs_wb_folio(folio_file_mapping(folio)->host, folio); + error = nfs_wb_folio(folio->mapping->host, folio); return (error < 0) ? ERR_PTR(error) : NULL; } @@ -1217,7 +1137,7 @@ int nfs_flush_incompatible(struct file *file, struct folio *folio) nfs_release_request(req); if (!do_flush) return 0; - status = nfs_wb_folio(folio_file_mapping(folio)->host, folio); + status = nfs_wb_folio(folio->mapping->host, folio); } while (status == 0); return status; } @@ -1291,7 +1211,7 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) */ static bool nfs_folio_write_uptodate(struct folio *folio, unsigned int pagelen) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) @@ -1369,7 +1289,7 @@ int nfs_update_folio(struct file *file, struct folio *folio, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; unsigned int pagelen = nfs_folio_length(folio); int status = 0; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 1c315f854ea8..7bc31df457ea 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -208,8 +208,8 @@ static inline struct inode *nfs_page_to_inode(const struct nfs_page *req) struct folio *folio = nfs_page_to_folio(req); if (folio == NULL) - return page_file_mapping(req->wb_page)->host; - return folio_file_mapping(folio)->host; + return req->wb_page->mapping->host; + return folio->mapping->host; } /** -- Gitee From c2369b21f0761036fc305fbe1c49fe64c867b92d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 1 May 2025 02:10:50 +0800 Subject: [PATCH 392/484] filemap: do not use folio_contains for swap cache folios commit 2b80f633c360ce3c56a7071782eae70852c8f344 upstream Conflicts: none Backport-reason: Remove folio_index to prepare for swap table Currently, none of the folio_contains callers should encounter swap cache folios. For fs/ callers, swap cache folios are never part of their workflow. 
For filemap and truncate, folio_contains is only used for sanity checks to verify the folio index matches the expected lookup / invalidation target. The swap cache does not utilize filemap or truncate helpers in ways that would trigger these checks, as it mostly implements its own cache management. Shmem won't trigger these sanity checks either unless thing went wrong, as it would directly trigger a BUG because swap cache index are unrelated and almost never matches shmem index. Shmem have to handle mixed values of folios, shadows, and swap entries, so it has its own way of handling the mapping. While some filemap helpers works for swap cache space, the swap cache is different from the page cache in many ways. So this particular helper will unlikely to work in a helpful way for swap cache folios. So make it explicit here that folio_contains should not be used for swap cache folios. This helps to avoid misuse, make swap cache less exposed and remove the folio_index usage here. [akpm@linux-foundation.org: s/VM_WARN_ON_FOLIO/VM_WARN_ON_ONCE_FOLIO/, per Kairui] Link: https://lkml.kernel.org/r/20250430181052.55698-5-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: David Hildenbrand Cc: Chao Yu Cc: Chris Li Cc: Chris Mason Cc: Christian Brauner Cc: David Sterba Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jaegeuk Kim Cc: Joanne Koong Cc: Johannes Weiner Cc: Josef Bacik Cc: Matthew Wilcox (Oracle) Cc: Miklos Szeredi Cc: Nhat Pham Cc: Qu Wenruo Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pagemap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 07280f01eaab..8ab2e3500aa0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -845,14 +845,14 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) * @folio: The folio. * @index: The page index within the file. * - * Context: The caller should have the page locked in order to prevent - * (eg) shmem from moving the page between the page cache and swap cache - * and changing its index in the middle of the operation. + * Context: The caller should have the folio locked and ensure + * e.g., shmem did not move this folio to the swap cache. * Return: true or false. */ static inline bool folio_contains(struct folio *folio, pgoff_t index) { - return index - folio_index(folio) < folio_nr_pages(folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + return index - folio->index < folio_nr_pages(folio); } /* -- Gitee From 48c1580eb01fbd2dc543426836cd22b69b7d50a2 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 26 Aug 2025 00:37:21 +0800 Subject: [PATCH 393/484] mm/page-writeback: drop usage of folio_index commit 46afff459925297d9d47c043eb8541fabac9bb0a upstream Conflicts: resolved Backport-reason: Remove folio_index to prepare for swap table folio_index is only needed for mixed usage of page cache and swap cache. The remaining three caller in page-writeback are for page cache tag marking. Swap cache space doesn't use tag (explicitly sets mapping_set_no_writeback_tags), so use folio->index here directly. 
Link: https://lkml.kernel.org/r/20250825163721.17734-1-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page-writeback.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 970e03c3bc4b..308710d1edca 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2772,12 +2773,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, { unsigned long flags; + /* + * Shmem writeback relies on swap, and swap writeback is LRU based, + * not using the dirty mark. + */ + VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping)); + xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping); - __xa_set_mark(&mapping->i_pages, folio_index(folio), - PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -3064,7 +3071,7 @@ bool __folio_end_writeback(struct folio *folio) xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_test_clear_writeback(folio); if (ret) { - __xa_clear_mark(&mapping->i_pages, folio_index(folio), + __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -3103,7 +3110,7 @@ bool __folio_start_writeback(struct folio *folio, bool keep_write) folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { - XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + XA_STATE(xas, &mapping->i_pages, folio->index); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; -- Gitee From b9de4155d61e70d2e010145e06ef96467b63ac6d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 1 May 2025 02:10:51 +0800 Subject: [PATCH 394/484] mm: move folio_index to mm/swap.h and remove no longer needed helper commit 7d0f0f06153116bb737eef76d47406639e161613 upstream Conflicts: none Backport-reason: Remove folio_index to prepare for swap table There are no remaining users of folio_index() outside the mm subsystem. Move it to mm/swap.h to co-locate it with swap_cache_index(), eliminating a forward declaration, and a function call overhead. Also remove the helper that was used to fix circular header dependency issue. 
Link: https://lkml.kernel.org/r/20250430181052.55698-6-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: David Hildenbrand Cc: Chao Yu Cc: Chris Li Cc: Chris Mason Cc: Christian Brauner Cc: David Sterba Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jaegeuk Kim Cc: Joanne Koong Cc: Johannes Weiner Cc: Josef Bacik Cc: Matthew Wilcox (Oracle) Cc: Miklos Szeredi Cc: Nhat Pham Cc: Qu Wenruo Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pagemap.h | 20 -------------------- mm/gup.c | 1 + mm/memfd.c | 1 + mm/migrate.c | 1 + mm/page-writeback.c | 1 + mm/swap.h | 18 ++++++++++++++++++ mm/swapfile.c | 6 ------ 7 files changed, 22 insertions(+), 26 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8ab2e3500aa0..af0af16a85e7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -794,26 +794,6 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -extern pgoff_t __folio_swap_cache_index(struct folio *folio); - -/** - * folio_index - File index of a folio. - * @folio: The folio. - * - * For a folio which is either in the page cache or the swap cache, - * return its index within the address_space it belongs to. If you know - * the page is definitely in the page cache, you can look at the folio's - * index directly. - * - * Return: The index (offset in units of pages) of a folio in its file. - */ -static inline pgoff_t folio_index(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return __folio_swap_cache_index(folio); - return folio->index; -} - /** * folio_next_index - Get the index of the next folio. * @folio: The current folio. diff --git a/mm/gup.c b/mm/gup.c index 29c719b3ab31..ce0df246994f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -24,6 +24,7 @@ #include #include "internal.h" +#include "swap.h" struct follow_page_context { struct dev_pagemap *pgmap; diff --git a/mm/memfd.c b/mm/memfd.c index 2dba2cb6f0d0..ff324158ba09 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -20,6 +20,7 @@ #include #include #include +#include "swap.h" /* * We need a tag: a new tag would expand every xa_node by 8 bytes, diff --git a/mm/migrate.c b/mm/migrate.c index 4da1249a68e7..dc0af1856bff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -56,6 +56,7 @@ #include #include "internal.h" +#include "swap.h" bool isolate_movable_page(struct page *page, isolate_mode_t mode) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 308710d1edca..57e33acef6cf 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -45,6 +45,7 @@ #include #include "internal.h" +#include "swap.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). diff --git a/mm/swap.h b/mm/swap.h index a900754b0045..422b417625b4 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -223,4 +223,22 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) } #endif /* CONFIG_SWAP */ +/** + * folio_index - File index of a folio. + * @folio: The folio. + * + * For a folio which is either in the page cache or the swap cache, + * return its index within the address_space it belongs to. If you know + * the folio is definitely in the page cache, you can look at the folio's + * index directly. + * + * Return: The index (offset in units of pages) of a folio in its file. 
+ */ +static inline pgoff_t folio_index(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swap_cache_index(folio->swap); + return folio->index; +} + #endif /* _MM_SWAP_H */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 16f397114271..d38f29a75d4f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3708,12 +3708,6 @@ struct address_space *swapcache_mapping(struct folio *folio) } EXPORT_SYMBOL_GPL(swapcache_mapping); -pgoff_t __folio_swap_cache_index(struct folio *folio) -{ - return swap_cache_index(folio->swap); -} -EXPORT_SYMBOL_GPL(__folio_swap_cache_index); - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's -- Gitee From a4089056524bf743ea581a7200541f7530e2d6fb Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sun, 14 Sep 2025 23:38:48 +0800 Subject: [PATCH 395/484] f2fs: remove page_file_mapping usage We are removing both folio_file_mapping and page_file_mapping to cleanup the old mixed usage of swap cache and page cache. Signed-off-by: Kairui Song --- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 2 +- include/trace/events/f2fs.h | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b39b13ef02c5..65ca3fb96b6f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -73,7 +73,7 @@ bool f2fs_is_cp_guaranteed(struct page *page) static enum count_type __read_io_type(struct page *page) { - struct address_space *mapping = page_file_mapping(page); + struct address_space *mapping = page->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -2427,7 +2427,7 @@ static int f2fs_mpage_readpages(struct inode *inode, static int f2fs_read_data_folio(struct file *file, struct folio *folio) { struct page *page = &folio->page; - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = page->mapping->host; int ret = -EAGAIN; trace_f2fs_readpage(page, DATA); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8e6259cd7471..542c8cde8aa3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1981,7 +1981,7 @@ static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) { - return F2FS_M_SB(page_file_mapping(page)); + return F2FS_M_SB(page->mapping); } static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index f2ce7f6da879..d79f933d7ae8 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1052,8 +1052,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, ), TP_fast_assign( - __entry->dev = page_file_mapping(page)->host->i_sb->s_dev; - __entry->ino = page_file_mapping(page)->host->i_ino; + __entry->dev = page->mapping->host->i_sb->s_dev; + __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; @@ -1236,11 +1236,11 @@ DECLARE_EVENT_CLASS(f2fs__page, ), TP_fast_assign( - __entry->dev = page_file_mapping(page)->host->i_sb->s_dev; - __entry->ino = page_file_mapping(page)->host->i_ino; + __entry->dev = page->mapping->host->i_sb->s_dev; + __entry->ino = page->mapping->host->i_ino; __entry->type = type; __entry->dir = - S_ISDIR(page_file_mapping(page)->host->i_mode); + S_ISDIR(page->mapping->host->i_mode); __entry->index = page->index; __entry->dirty = PageDirty(page); __entry->uptodate = PageUptodate(page); -- Gitee 
From 2911f833bbb7f0ae8f0e9eeb6ea2ce62f7ce7356 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:07 +0000 Subject: [PATCH 396/484] fs: remove page_file_mapping() commit 52d671a1a36a16f3a0dd9a2beff964e75bce9787 upstream Conflicts: none Backport-reason: Clean up to prepare for swap table This wrapper has no more callers. Delete it. Link: https://lkml.kernel.org/r/20250217192009.437916-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index af0af16a85e7..30801d473d2a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -468,11 +468,6 @@ static inline struct address_space *folio_flush_mapping(struct folio *folio) return folio_mapping(folio); } -static inline struct address_space *page_file_mapping(struct page *page) -{ - return folio_file_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. -- Gitee From c8c0965f8966d1fa7090a6ba29b547d1374b818a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:08 +0000 Subject: [PATCH 397/484] fs: remove folio_file_mapping() commit 0d40cfe63a2f19b9d375382e6d90b9ebd412901e upstream Conflicts: none Backport-reason: Clean up to prepare for swap table No callers of this function remain as filesystems no longer see swapfile pages through their normal read/write paths. Link: https://lkml.kernel.org/r/20250217192009.437916-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pagemap.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 30801d473d2a..2e2a2a1e936d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -428,26 +428,6 @@ struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -/** - * folio_file_mapping - Find the mapping this folio belongs to. - * @folio: The folio. - * - * For folios which are in the page cache, return the mapping that this - * page belongs to. Folios in the swap cache return the mapping of the - * swap file or swap device where the data is stored. This is different - * from the mapping returned by folio_mapping(). The only reason to - * use it is if, like NFS, you return 0 from ->activate_swapfile. - * - * Do not call this for folios which aren't in the page cache or swap cache. - */ -static inline struct address_space *folio_file_mapping(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return swapcache_mapping(folio); - - return folio->mapping; -} - /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. -- Gitee From 7df673ea2cc2be77b1305c2560721ff12b3007b3 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 1 May 2025 02:10:52 +0800 Subject: [PATCH 398/484] mm, swap: remove no longer used swap mapping helper commit dd309bfc68efc4522b4d33a95e92afff249e103b upstream Conflicts: none Backport-reason: Clean up to prepare for swap table This helper existed to fix the circular header dependency issue but it is no longer used since commit 0d40cfe63a2f ("fs: remove folio_file_mapping()"), remove it. 
Link: https://lkml.kernel.org/r/20250430181052.55698-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Chao Yu Cc: Chris Li Cc: Chris Mason Cc: Christian Brauner Cc: David Sterba Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jaegeuk Kim Cc: Joanne Koong Cc: Johannes Weiner Cc: Josef Bacik Cc: Miklos Szeredi Cc: Nhat Pham Cc: Qu Wenruo Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/pagemap.h | 1 - mm/swapfile.c | 9 --------- 2 files changed, 10 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2e2a2a1e936d..219c43907424 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -426,7 +426,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); -struct address_space *swapcache_mapping(struct folio *); /** * folio_flush_mapping - Find the file mapping this folio belongs to. diff --git a/mm/swapfile.c b/mm/swapfile.c index d38f29a75d4f..42b7e6fab08d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3699,15 +3699,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) } EXPORT_SYMBOL(swp_swap_info); -/* - * out-of-line methods to avoid include hell. - */ -struct address_space *swapcache_mapping(struct folio *folio) -{ - return swp_swap_info(folio->swap)->swap_file->f_mapping; -} -EXPORT_SYMBOL_GPL(swapcache_mapping); - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's -- Gitee From 15da474b37723ee9ecfd16da48abf17ccabf8545 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 9 May 2025 08:45:21 +0800 Subject: [PATCH 399/484] mm: mincore: use pte_batch_hint() to batch process large folios commit 4df65651f7075481e44c03bb39855a38783d87ac upstream Conflicts: none Backport-reason: Swap Table: early phase When I tested the mincore() syscall, I observed that it takes longer with 64K mTHP enabled on my Arm64 server. The reason is the mincore_pte_range() still checks each PTE individually, even when the PTEs are contiguous, which is not efficient. Thus we can use pte_batch_hint() to get the batch number of the present contiguous PTEs, which can improve the performance. I tested the mincore() syscall with 1G anonymous memory populated with 64K mTHP, and observed an obvious performance improvement: w/o patch w/ patch changes 6022us 549us +91% Moreover, I also tested mincore() with disabling mTHP/THP, and did not see any obvious regression for base pages. 
Link: https://lkml.kernel.org/r/99cb00ee626ceb6e788102ca36821815cd832237.1746697240.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Barry Song Reviewed-by: Dev Jain Acked-by: David Hildenbrand Cc: Dev Jain Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mincore.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/mm/mincore.c b/mm/mincore.c index e31cf1bde614..c8a2dec6dc45 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -21,6 +21,7 @@ #include #include "swap.h" +#include "internal.h" static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -105,6 +106,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *ptep; unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; + int step, i; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -118,16 +120,26 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, walk->action = ACTION_AGAIN; return 0; } - for (; addr != end; ptep++, addr += PAGE_SIZE) { + for (; addr != end; ptep += step, addr += step * PAGE_SIZE) { pte_t pte = ptep_get(ptep); + step = 1; /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) __mincore_unmapped_range(addr, addr + PAGE_SIZE, vma, vec); - else if (pte_present(pte)) - *vec = 1; - else { /* pte is a swap entry */ + else if (pte_present(pte)) { + unsigned int batch = pte_batch_hint(ptep, pte); + + if (batch > 1) { + unsigned int max_nr = (end - addr) >> PAGE_SHIFT; + + step = min_t(unsigned int, batch, max_nr); + } + + for (i = 0; i < step; i++) + vec[i] = 1; + } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -146,7 +158,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, #endif } } - vec++; + vec += step; } pte_unmap_unlock(ptep - 1, ptl); out: -- Gitee From cf9342ed13f5d77b87c7abb412dc43c5825322e2 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 7 Aug 2025 00:17:46 +0800 Subject: [PATCH 400/484] mm, swap: only scan one cluster in fragment list commit b25786b4a9819419a317994dabb6ecb409e8114b upstream Conflicts: none Backport-reason: Swap Table: early phase Patch series "mm, swap: improve cluster scan strategy", v2. This series improves the large allocation performance and reduces the failure rate. Some design of the cluster alloactor was later found to be improvable after thorough testing. The allocator spent too much effort scanning the fragment list, which is not helpful in most setups, but causes serious contention of the list lock (si->lock). Besides, the allocator prefers free clusters when searching for a new cluster due to historical reasons, which causes fragmentation issues. So make the allocator only scan one cluster for high order allocation, and prefer nonfull cluster. This both improves the performance and reduces fragmentation. For example, build kernel test with make -j96 and 10G ZRAM with 64kB mTHP enabled shows better performance and a lower failure rate: Before: sys time: 11609.69s 64kB/swpout: 1787051 64kB/swpout_fallback: 20917 After: sys time: 5587.53s 64kB/swpout: 1811598 64kB/swpout_fallback: 0 System time is cut in half, and the failure rate drops to zero. 
Larger allocations in a hybrid workload also showed a major improvement: 512kB swap failure rate: Before: swpout:11663 swpout_fallback:1767 After: swpout:14480 swpout_fallback:6 2M swap failure rate: Before: swpout:24 swpout_fallback:1442 After: swpout:1329 swpout_fallback:7 This patch (of 3): Fragment clusters were mostly failing high order allocation already. The reason we scan it through now is that a swap slot may get freed without releasing the swap cache, so a swap map entry will end up in HAS_CACHE only status, and the cluster won't be moved back to non-full or free cluster list. This may cause a higher allocation failure rate. Usually only !SWP_SYNCHRONOUS_IO devices may have a large number of slots stuck in HAS_CACHE only status. Because when a !SWP_SYNCHRONOUS_IO device's usage is low (!vm_swap_full()), it will try to lazy free the swap cache. But this fragment list scan out is a bit overkill. Fragmentation is only an issue for the allocator when the device is getting full, and by that time, swap will be releasing the swap cache aggressively already. Only scanning one fragment cluster at a time is good enough to reclaim already pinned slots, and move the cluster back to nonfull. And besides, only high order allocation requires iterating over the list, order 0 allocation will succeed on the first attempt. And high order allocation failure isn't a serious problem. So the iteration of fragment clusters is trivial, but it will slow down large allocation by a lot when the fragment cluster list is long. So it's better to drop this fragment cluster iteration design. Test on a 48c96t system, build linux kernel using 10G ZRAM, make -j48, defconfig with 768M cgroup memory limit, on top of tmpfs, 4K folio only: Before: sys time: 4432.56s After: sys time: 4430.18s Change to make -j96, 2G memory limit, 64kB mTHP enabled, and 10G ZRAM: Before: sys time: 11609.69s 64kB/swpout: 1787051 64kB/swpout_fallback: 20917 After: sys time: 5572.85s 64kB/swpout: 1797612 64kB/swpout_fallback: 19254 Change to 8G ZRAM: Before: sys time: 21524.35s 64kB/swpout: 1687142 64kB/swpout_fallback: 128496 After: sys time: 6278.45s 64kB/swpout: 1679127 64kB/swpout_fallback: 130942 Change to use 10G brd device with SWP_SYNCHRONOUS_IO flag removed: Before: sys time: 7393.50s 64kB/swpout:1788246 swpout_fallback: 0 After: sys time: 7399.88s 64kB/swpout:1784257 swpout_fallback: 0 Change to use 8G brd device with SWP_SYNCHRONOUS_IO flag removed: Before: sys time: 26292.26s 64kB/swpout:1645236 swpout_fallback: 138945 After: sys time: 9463.16s 64kB/swpout:1581376 swpout_fallback: 259979 The performance is a lot better for large folios, and the large order allocation failure rate is only very slightly higher or unchanged even for !SWP_SYNCHRONOUS_IO devices high pressure. 
Link: https://lkml.kernel.org/r/20250806161748.76651-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250806161748.76651-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Nhat Pham Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kairui Song Cc: Kemeng Shi Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 42b7e6fab08d..5014d94b4c6b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -927,32 +927,25 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0, frags_existing; - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; - /* Clusters failed to allocate are moved to frag_clusters */ - frags++; } - frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); - while (frags < frags_existing && - (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { - atomic_long_dec(&si->frag_cluster_nr[order]); - /* - * Rotate the frag list to iterate, they were all - * failing high order allocation or moved here due to - * per-CPU usage, but they could contain newly released - * reclaimable (eg. lazy-freed swap cache) slots. - */ + /* + * Scan only one fragment cluster is good enough. Order 0 + * allocation will surely success, and large allocation + * failure is not critical. Scanning one cluster still + * keeps the list rotated and reclaimed (for HAS_CACHE). + */ + ci = isolate_lock_cluster(si, &si->frag_clusters[order]); + if (ci) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; - frags++; } } -- Gitee From f3fe48714510a22e8046dd91ee7748ebaf84bc5a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 7 Aug 2025 00:17:47 +0800 Subject: [PATCH 401/484] mm, swap: remove fragment clusters counter commit 913fff314547c1922002e655bb25199ee38e8825 upstream Conflicts: none Backport-reason: Swap Table: early phase It was used for calculating the iteration number when the swap allocator wants to scan the whole fragment list. Now the allocator only scans one fragment cluster at a time, so no one uses this counter anymore. 
Remove it as a cleanup; the performance change is marginal: Build linux kernel using 10G ZRAM, make -j96, defconfig with 2G cgroup memory limit, on top of tmpfs, 64kB mTHP enabled: Before: sys time: 6278.45s After: sys time: 6176.34s Change to 8G ZRAM: Before: sys time: 5572.85s After: sys time: 5531.49s Link: https://lkml.kernel.org/r/20250806161748.76651-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 1 - mm/swapfile.c | 7 ------- 2 files changed, 8 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 3dfdf1b7a1eb..c916ee478270 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,7 +310,6 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 5014d94b4c6b..24b4c25c62ab 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -471,11 +471,6 @@ static void move_cluster(struct swap_info_struct *si, else list_move_tail(&ci->list, list); spin_unlock(&si->lock); - - if (ci->flags == CLUSTER_FLAG_FRAG) - atomic_long_dec(&si->frag_cluster_nr[ci->order]); - else if (new_flags == CLUSTER_FLAG_FRAG) - atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } @@ -959,7 +954,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o * allocation, but reclaim may drop si->lock and race with another user. */ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - atomic_long_dec(&si->frag_cluster_nr[o]); found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), 0, usage); if (found) @@ -3248,7 +3242,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - atomic_long_set(&si->frag_cluster_nr[i], 0); } /* -- Gitee From f40113cb4d104c4ddd61e1538528ee7ec8e1f272 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 7 Aug 2025 00:17:48 +0800 Subject: [PATCH 402/484] mm, swap: prefer nonfull over free clusters commit 9a42aed48421390155037460a3f2fd510fa6df76 upstream Conflicts: none Backport-reason: Swap Table: early phase We prefer a free cluster over a nonfull cluster whenever a CPU local cluster is drained to respect the SSD discard behavior [1]. It's not a best practice for non-discarding devices. And this is causing a higher fragmentation rate. So for a non-discarding device, prefer nonfull over free clusters. This reduces the fragmentation issue by a lot. 
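The resulting order of preference can be summarized as below (a condensed
sketch derived from the diff that follows; the nonfull and fragment lists
are only used for order < PMD_ORDER):

    /*
     * SWP_PAGE_DISCARD set:    free_clusters -> nonfull_clusters[order]
     * SWP_PAGE_DISCARD clear:  nonfull_clusters[order] -> free_clusters
     *
     * Both cases then fall back to full-cluster reclaim (if
     * vm_swap_full()) and a single fragment cluster scan.
     */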
Testing with make -j96, defconfig, using 64k mTHP, 8G ZRAM: Before: sys time: 6176.34s 64kB/swpout: 1659757 64kB/swpout_fallback: 139503 After: sys time: 6194.11s 64kB/swpout: 1689470 64kB/swpout_fallback: 56147 Testing with make -j96, defconfig, using 64k mTHP, 10G ZRAM: After: sys time: 5531.49s 64kB/swpout: 1791142 64kB/swpout_fallback: 17676 After: sys time: 5587.53s 64kB/swpout: 1811598 64kB/swpout_fallback: 0 Performance is basically unchanged, and the large allocation failure rate is lower. Enabling all mTHP sizes showed a more significant result. Using the same test setup with 10G ZRAM and enabling all mTHP sizes: 128kB swap failure rate: Before: swpout:451599 swpout_fallback:54525 After: swpout:502710 swpout_fallback:870 256kB swap failure rate: Before: swpout:63652 swpout_fallback:2708 After: swpout:65913 swpout_fallback:20 512kB swap failure rate: Before: swpout:11663 swpout_fallback:1767 After: swpout:14480 swpout_fallback:6 2M swap failure rate: Before: swpout:24 swpout_fallback:1442 After: swpout:1329 swpout_fallback:7 The success rate of large allocations is much higher. Link: https://lore.kernel.org/linux-mm/87v8242vng.fsf@yhuang6-desk2.ccr.corp.intel.com/ [1] Link: https://lkml.kernel.org/r/20250806161748.76651-4-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Nhat Pham Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 24b4c25c62ab..f36cacdd17bf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -909,18 +909,20 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o } new_cluster: - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; + /* + * If the device need discard, prefer new cluster over nonfull + * to spread out the writes. + */ + if (si->flags & SWP_PAGE_DISCARD) { + ci = isolate_lock_cluster(si, &si->free_clusters); + if (ci) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; + } } - /* Try reclaim from full clusters if free clusters list is drained */ - if (vm_swap_full()) - swap_reclaim_full_clusters(si, false); - if (order < PMD_ORDER) { while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), @@ -928,7 +930,23 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (found) goto done; } + } + if (!(si->flags & SWP_PAGE_DISCARD)) { + ci = isolate_lock_cluster(si, &si->free_clusters); + if (ci) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; + } + } + + /* Try reclaim full clusters if free and nonfull lists are drained */ + if (vm_swap_full()) + swap_reclaim_full_clusters(si, false); + + if (order < PMD_ORDER) { /* * Scan only one fragment cluster is good enough. Order 0 * allocation will surely success, and large allocation -- Gitee From bb14d2902f11d7d0301e44b2bd6f78fdc1fb3bb6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 15 Sep 2025 01:33:40 +0800 Subject: [PATCH 403/484] mm, memcontrol: handle swap folios in migration in-place Upstream: no For cleaning up mincore and swap cache usage, do the lookup by the migration code. 
Signed-off-by: Kairui Song --- mm/memcontrol.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2a50e7d10d1e..53ec5de48083 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7809,6 +7809,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, { unsigned long index; struct folio *folio; + struct address_space *mapping; if (!vma->vm_file) /* anonymous vma */ return NULL; @@ -7817,10 +7818,28 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /* folio is moved even if it's not RSS of this task(page-faulted). */ /* shmem/tmpfs may report page out on swap: account for that too. */ + mapping = vma->vm_file->f_mapping; index = linear_page_index(vma, addr); - folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); - if (IS_ERR(folio)) + folio = filemap_get_entry(mapping, index); + if (!folio) return NULL; + if (xa_is_value(folio)) { + swp_entry_t entry = radix_to_swp_entry(folio); + struct swap_info_struct *si; + folio = NULL; + + if (IS_ENABLED(CONFIG_SWAP) && shmem_mapping(mapping)) { + si = get_swap_device(entry); + if (si) { + folio = filemap_get_folio(swap_address_space(entry), + swap_cache_index(entry)); + put_swap_device(si); + } + } + + if (!folio) + return NULL; + } return folio_file_page(folio, index); } -- Gitee From e38bc0c70de12bd9f871de587c373eca11710440 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 12 Aug 2025 01:20:17 +0800 Subject: [PATCH 404/484] mm/mincore, swap: consolidate swap cache checking for mincore commit 27763edac9288bbb35a9feecb82652de04e637fd upstream Conflicts: none Backport-reason: Swap Table: early phase Patch series "mm/mincore: minor clean up for swap cache checking". This series cleans up a swap cache helper only used by mincore, move it back into mincore code. Also separate the swap cache related logics out of shmem / page cache logics in mincore. With this series we have less lines of code and better performance. Before this series: mincore on a swaped out 16G anon mmap range: Took 488220 us mincore on 16G shmem mmap range: Took 530272 us. After this series: mincore on a swaped out 16G anon mmap range: Took 446763 us mincore on 16G shmem mmap range: Took 460496 us. About ~10% faster. This patch (of 2): The filemap_get_incore_folio (previously find_get_incore_page) helper was introduced by commit 61ef18655704 ("mm: factor find_get_incore_page out of mincore_page") to be used by later commit f5df8635c5a3 ("mm: use find_get_incore_page in memcontrol"), so memory cgroup charge move code can be simplified. But commit 6b611388b626 ("memcg-v1: remove charge move code") removed that user completely, it's only used by mincore now. So this commit basically reverts commit 61ef18655704 ("mm: factor find_get_incore_page out of mincore_page"). Move it back to mincore side to simplify the code. 
Link: https://lkml.kernel.org/r/20250811172018.48901-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250811172018.48901-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Nhat Pham Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Kemeng Shi Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mincore.c | 29 +++++++++++++++++++++++++++-- mm/swap.h | 10 ---------- mm/swap_state.c | 38 -------------------------------------- 3 files changed, 27 insertions(+), 50 deletions(-) diff --git a/mm/mincore.c b/mm/mincore.c index c8a2dec6dc45..ae8d712abdec 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -61,8 +61,33 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - folio = filemap_get_incore_folio(mapping, index); - if (!IS_ERR(folio)) { + if (IS_ENABLED(CONFIG_SWAP) && shmem_mapping(mapping)) { + folio = filemap_get_entry(mapping, index); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. + */ + if (xa_is_value(folio)) { + struct swap_info_struct *si; + swp_entry_t swp = radix_to_swp_entry(folio); + /* There might be swapin error entries in shmem mapping. */ + if (non_swap_entry(swp)) + return 0; + /* Prevent swap device to being swapoff under us */ + si = get_swap_device(swp); + if (si) { + folio = filemap_get_folio(swap_address_space(swp), + swap_cache_index(swp)); + put_swap_device(si); + } else { + return 0; + } + } + } else { + folio = filemap_get_folio(mapping, index); + } + + if (!IS_ERR_OR_NULL(folio)) { present = folio_test_uptodate(folio); folio_put(folio); } diff --git a/mm/swap.h b/mm/swap.h index 422b417625b4..60f4cfd7cbe4 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -61,9 +61,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index); - struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); @@ -174,13 +171,6 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry, return NULL; } -static inline -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index) -{ - return filemap_get_folio(mapping, index); -} - static inline void *get_shadow_from_swap_cache(swp_entry_t entry) { return NULL; diff --git a/mm/swap_state.c b/mm/swap_state.c index 3909cc02af6d..fbd37375b66a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -329,44 +329,6 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, return folio; } -/** - * filemap_get_incore_folio - Find and get a folio from the page or swap caches. - * @mapping: The address_space to search. - * @index: The page cache index. - * - * This differs from filemap_get_folio() in that it will also look for the - * folio in the swap cache. - * - * Return: The found folio or %NULL. 
- */ -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index) -{ - swp_entry_t swp; - struct swap_info_struct *si; - struct folio *folio = filemap_get_entry(mapping, index); - - if (!folio) - return ERR_PTR(-ENOENT); - if (!xa_is_value(folio)) - return folio; - if (!shmem_mapping(mapping)) - return ERR_PTR(-ENOENT); - - swp = radix_to_swp_entry(folio); - /* There might be swapin error entries in shmem mapping. */ - if (non_swap_entry(swp)) - return ERR_PTR(-ENOENT); - /* Prevent swapoff from happening to us */ - si = get_swap_device(swp); - if (!si) - return ERR_PTR(-ENOENT); - index = swap_cache_index(swp); - folio = filemap_get_folio(swap_address_space(swp), index); - put_swap_device(si); - return folio; -} - struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) -- Gitee From d0d86846b0eb61d56af5699d6b06b2c2168f8c97 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Tue, 12 Aug 2025 00:10:58 -0700 Subject: [PATCH 405/484] mm/swapfile.c: introduce function alloc_swap_scan_list() commit 348e474f18e12c5a27422d0ae3b4316b13dbe3c9 upstream Conflicts: none Backport-reason: Swap Table: early phase Patch series "mm/swapfile.c and swap.h cleanup", v3. This patch series, which builds on Kairui's swap improve cluster scan series. https://lore.kernel.org/linux-mm/20250806161748.76651-1-ryncsn@gmail.com/ It introduces a new function, alloc_swap_scan_list(), for swapfile.c. It also cleans up swap.h by removing comments that reference fields that have been deleted. There are no functional changes in this two-patch series. This patch (of 2): alloc_swap_scan_list() will scan the whole list or the first cluster. This reduces the repeat patterns of isolating a cluster then scanning that cluster. As a result, cluster_alloc_swap_entry() is shorter and shallower. No functional change. Link: https://lkml.kernel.org/r/20250812-swap-scan-list-v3-0-6d73504d267b@kernel.org Link: https://lkml.kernel.org/r/20250812-swap-scan-list-v3-1-6d73504d267b@kernel.org Signed-off-by: Chris Li Reviewed-by: Kairui Song Acked-by: Nhat Pham Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 86 ++++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f36cacdd17bf..5c2a6e75848d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -821,6 +821,29 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, return found; } +static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, + struct list_head *list, + unsigned int order, + unsigned char usage, + bool scan_all) +{ + unsigned int found = SWAP_ENTRY_INVALID; + + do { + struct swap_cluster_info *ci = isolate_lock_cluster(si, list); + unsigned long offset; + + if (!ci) + break; + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + if (found) + break; + } while (scan_all); + + return found; +} + static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; @@ -914,32 +937,24 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o * to spread out the writes. 
*/ if (si->flags & SWP_PAGE_DISCARD) { - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); + if (found) + goto done; } if (order < PMD_ORDER) { - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], + order, usage, true); + if (found) + goto done; } if (!(si->flags & SWP_PAGE_DISCARD)) { - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); + if (found) + goto done; } /* Try reclaim full clusters if free and nonfull lists are drained */ @@ -953,13 +968,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o * failure is not critical. Scanning one cluster still * keeps the list rotated and reclaimed (for HAS_CACHE). */ - ci = isolate_lock_cluster(si, &si->frag_clusters[order]); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->frag_clusters[order], order, + usage, false); + if (found) + goto done; } if (order) @@ -971,19 +983,15 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ - while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->frag_clusters[o], + 0, usage, true); + if (found) + goto done; - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], + 0, usage, true); + if (found) + goto done; } done: if (!(si->flags & SWP_SOLIDSTATE)) -- Gitee From cdb2bc25cd1355de9b2b10f9b70a643776d50dc1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 12 Aug 2025 01:20:18 +0800 Subject: [PATCH 406/484] mm/mincore: use a helper for checking the swap cache commit 1f2052755c152940c336918bd73d13d5468f548b upstream Conflicts: none Backport-reason: Swap Table: early phase Introduce a mincore_swap helper for checking swap entries. Move all swap related logic and sanity debug check into it, and separate them from page cache checking. The performance is better after this commit. mincore_page is never called on a swap cache space now, so the logic can be simpler. The sanity check also covers more potential cases now, previously the WARN_ON only catches potentially corrupted page table, now if shmem contains a swap entry with !CONFIG_SWAP, a WARN will be triggered. This changes the mincore value when the WARN is triggered, but this shouldn't matter. The WARN_ON means the data is already corrupted or something is very wrong, so it really should not happen. 
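The timings below come from calling mincore() over a large mapping; a
minimal userspace sketch of that kind of measurement could look like the
following (illustrative only, not the exact harness used; the 16G size and
the MADV_PAGEOUT step are assumptions):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/time.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 16UL << 30;        /* 16G anon mapping (assumption) */
            size_t pages = len / sysconf(_SC_PAGESIZE);
            unsigned char *vec = malloc(pages);
            char *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            struct timeval t0, t1;

            if (!vec || map == MAP_FAILED)
                    return 1;
            memset(map, 1, len);                    /* populate */
            madvise(map, len, MADV_PAGEOUT);        /* best-effort push to swap */

            gettimeofday(&t0, NULL);
            mincore(map, len, vec);
            gettimeofday(&t1, NULL);
            printf("Took %ld us\n",
                   (t1.tv_sec - t0.tv_sec) * 1000000L + (t1.tv_usec - t0.tv_usec));
            return 0;
    }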
Before this series: mincore on a swaped out 16G anon mmap range: Took 488220 us mincore on 16G shmem mmap range: Took 530272 us. After this commit: mincore on a swaped out 16G anon mmap range: Took 446763 us mincore on 16G shmem mmap range: Took 460496 us. About ~10% faster. Link: https://lkml.kernel.org/r/20250811172018.48901-3-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Kemeng Shi Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mincore.c | 90 ++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/mm/mincore.c b/mm/mincore.c index ae8d712abdec..c99e5c645ea0 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -44,6 +44,48 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, return 0; } +static unsigned char mincore_swap(swp_entry_t entry, bool shmem) +{ + struct swap_info_struct *si; + struct folio *folio = NULL; + unsigned char present = 0; + + if (!IS_ENABLED(CONFIG_SWAP)) { + WARN_ON(1); + return 0; + } + + /* + * Shmem mapping may contain swapin error entries, which are + * absent. Page table may contain migration or hwpoison + * entries which are always uptodate. + */ + if (non_swap_entry(entry)) + return !shmem; + + /* + * Shmem mapping lookup is lockless, so we need to grab the swap + * device. mincore page table walk locks the PTL, and the swap + * device is stable, avoid touching the si for better performance. + */ + if (shmem) { + si = get_swap_device(entry); + if (!si) + return 0; + } + folio = filemap_get_entry(swap_address_space(entry), + swap_cache_index(entry)); + if (shmem) + put_swap_device(si); + /* The swap cache space contains either folio, shadow or NULL */ + if (folio && !xa_is_value(folio)) { + present = folio_test_uptodate(folio); + folio_put(folio); + } + + return present; +} + /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, @@ -61,33 +103,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - if (IS_ENABLED(CONFIG_SWAP) && shmem_mapping(mapping)) { - folio = filemap_get_entry(mapping, index); - /* - * shmem/tmpfs may return swap: account for swapcache - * page too. - */ + folio = filemap_get_entry(mapping, index); + if (folio) { if (xa_is_value(folio)) { - struct swap_info_struct *si; - swp_entry_t swp = radix_to_swp_entry(folio); - /* There might be swapin error entries in shmem mapping. 
*/ - if (non_swap_entry(swp)) - return 0; - /* Prevent swap device to being swapoff under us */ - si = get_swap_device(swp); - if (si) { - folio = filemap_get_folio(swap_address_space(swp), - swap_cache_index(swp)); - put_swap_device(si); - } else { + if (shmem_mapping(mapping)) + return mincore_swap(radix_to_swp_entry(folio), + true); + else return 0; - } } - } else { - folio = filemap_get_folio(mapping, index); - } - - if (!IS_ERR_OR_NULL(folio)) { present = folio_test_uptodate(folio); folio_put(folio); } @@ -165,23 +189,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (i = 0; i < step; i++) vec[i] = 1; } else { /* pte is a swap entry */ - swp_entry_t entry = pte_to_swp_entry(pte); - - if (non_swap_entry(entry)) { - /* - * migration or hwpoison entries are always - * uptodate - */ - *vec = 1; - } else { -#ifdef CONFIG_SWAP - *vec = mincore_page(swap_address_space(entry), - swap_cache_index(entry)); -#else - WARN_ON(1); - *vec = 1; -#endif - } + *vec = mincore_swap(pte_to_swp_entry(pte), false); } vec += step; } -- Gitee From 9a69c51a23421df5d8b23a19a2fbc2362cd0da86 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 26 Dec 2023 13:46:10 -0800 Subject: [PATCH 407/484] arch/mm/fault: fix major fault accounting when retrying under per-VMA lock commit 46e714c729c8d1d8110bc0545d7ffe8a759c9dc0 upstream Conflicts: none Backport-reason: mm fixes A test [1] in Android test suite started failing after [2] was merged. It turns out that after handling a major fault under per-VMA lock, the process major fault counter does not register that fault as major. Before [2] read faults would be done under mmap_lock, in which case FAULT_FLAG_TRIED flag is set before retrying. That in turn causes mm_account_fault() to account the fault as major once retry completes. With per-VMA locks we often retry because a fault can't be handled without locking the whole mm using mmap_lock. Therefore such retries do not set FAULT_FLAG_TRIED flag. This logic does not work after [2] because we can now handle read major faults under per-VMA lock and upon retry the fact there was a major fault gets lost. Fix this by setting FAULT_FLAG_TRIED after retrying under per-VMA lock if VM_FAULT_MAJOR was returned. Ideally we would use an additional VM_FAULT bit to indicate the reason for the retry (could not handle under per-VMA lock vs other reason) but this simpler solution seems to work, so keeping it simple. 
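The accounting rule this fix relies on is the one in mm_account_fault();
in simplified form (paraphrased, not a verbatim quote of mm/memory.c):

    /*
     * A fault is charged as major if this attempt itself was major, or if
     * it is a retry of an attempt that already did the expensive work.
     */
    major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
    if (major)
            current->maj_flt++;
    else
            current->min_flt++;

Setting FAULT_FLAG_TRIED on the per-VMA-lock retry path therefore preserves
the major accounting across the retry, just as the mmap_lock retry path
always has.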
[1] https://cs.android.com/android/platform/superproject/+/master:test/vts-testcase/kernel/api/drop_caches_prop/drop_caches_test.cpp [2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/ Link: https://lkml.kernel.org/r/20231226214610.109282-1-surenb@google.com Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock") Signed-off-by: Suren Baghdasaryan Cc: Matthew Wilcox Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Gerald Schaefer Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- arch/arm64/mm/fault.c | 2 ++ arch/powerpc/mm/fault.c | 2 ++ arch/riscv/mm/fault.c | 2 ++ arch/s390/mm/fault.c | 3 +++ arch/x86/mm/fault.c | 2 ++ 5 files changed, 11 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index f69a701ecabf..b5600b9e58c9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -609,6 +609,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + mm_flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d3e0f5b3ecc7..273ab77bb668 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -502,6 +502,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 8960f4c84497..6f1c7c8bdb83 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -305,6 +305,8 @@ void handle_page_fault(struct pt_regs *regs) goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 1a231181a413..0f3ce563056f 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -426,6 +426,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6529b3e2cff3..e604d2d6cc8f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1330,6 +1330,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { -- Gitee From 65ebff72a9eac6badab98024949e4e8b90a6a684 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 3 Jan 2024 10:52:22 +0900 Subject: [PATCH 408/484] mm: shrinker: use kvzalloc_node() from expand_one_shrinker_info() commit 7fba9420b726561966e1671004df60a08b39beb3 upstream Conflicts: none Backport-reason: shrinker fixes syzbot is reporting uninit-value at shrinker_alloc(), for commit 307bececcd12 ("mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}") which assumed that the ->unit was allocated with __GFP_ZERO forgot to 
replace kvmalloc_node() in expand_one_shrinker_info() with kvzalloc_node(). Link: https://lkml.kernel.org/r/9226cc0a-10e0-4489-80c5-58c3b5b4359c@I-love.SAKURA.ne.jp Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=1e0ed05798af62917464 Fixes: 307bececcd12 ("mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}") Signed-off-by: Tetsuo Handa Acked-by: Qi Zheng Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shrinker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index dd91eab43ed3..dc5d2a6fcfc4 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -126,7 +126,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, if (new_nr_max <= old->map_nr_max) continue; - new = kvmalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid); + new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid); if (!new) return -ENOMEM; -- Gitee From 112da64e5b5c6c0f3088a277c91d971b2069ce13 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Dec 2023 07:32:58 -0800 Subject: [PATCH 409/484] mm/list_lru.c: remove unused list_lru_from_kmem() commit 4a3bfbd1699e2306731809d50d480634012ed4de upstream Conflicts: none Backport-reason: ZSWAP & list_lru fix Fixes: 0a97c01cd20bb ("list_lru: allow explicit memcg and NUMA node selection) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312141318.q8b5yrAq-lkp@intel.com/ Cc: Nhat Pham Cc: Johannes Weiner Cc: Johannes Weiner Cc: Bagas Sanjaya Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/list_lru.c | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 61d5707d8f13..5a0a0ef3378a 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -59,28 +59,6 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } return &lru->node[nid].lru; } - -static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, - struct mem_cgroup **memcg_ptr) -{ - struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l = &nlru->lru; - struct mem_cgroup *memcg = NULL; - - if (!list_lru_memcg_aware(lru)) - goto out; - - memcg = mem_cgroup_from_slab_obj(ptr); - if (!memcg) - goto out; - - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); -out: - if (memcg_ptr) - *memcg_ptr = memcg; - return l; -} #else static void list_lru_register(struct list_lru *lru) { @@ -105,15 +83,6 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } - -static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, - struct mem_cgroup **memcg_ptr) -{ - if (memcg_ptr) - *memcg_ptr = NULL; - return &lru->node[nid].lru; -} #endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, -- Gitee From 11b242031f8a07cb024db806dec2d6982c349844 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 26 Jan 2024 12:25:48 +0900 Subject: [PATCH 410/484] mm/madvise: don't forget to leave lazy MMU mode in madvise_cold_or_pageout_pte_range() commit 4c2da3188b848d33c26d7f0f8b14f3150331c923 upstream Conflicts: none Backport-reason: madvise fixes We need to leave lazy MMU mode before unlocking. 
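The invariant being restored is that lazy MMU mode is exited before the
page table lock is dropped.  The surrounding loop in
madvise_cold_or_pageout_pte_range() then looks roughly like this (abridged
sketch, not the full function):

    restart:
            start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
            if (!start_pte)
                    return 0;
            arch_enter_lazy_mmu_mode();
            for (; addr < end; pte++, addr += PAGE_SIZE) {
                    /* ... cold/pageout handling of *pte ... */
                    if (++batch_count == SWAP_CLUSTER_MAX) {
                            batch_count = 0;
                            if (need_resched()) {
                                    arch_leave_lazy_mmu_mode();  /* the added line */
                                    pte_unmap_unlock(start_pte, ptl);
                                    cond_resched();
                                    goto restart;
                            }
                    }
            }
            arch_leave_lazy_mmu_mode();
            pte_unmap_unlock(start_pte, ptl);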
Link: https://lkml.kernel.org/r/20240126032608.355899-1-senozhatsky@chromium.org Fixes: b2f557a21bc8 ("mm/madvise: add cond_resched() in madvise_cold_or_pageout_pte_range()") Signed-off-by: Sergey Senozhatsky Cc: Jiexun Wang Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/madvise.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/madvise.c b/mm/madvise.c index 6572b618a9f8..83d106b1833c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -431,6 +431,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; -- Gitee From 4939cab104521bdc003f2574eb77f9c2b0c384eb Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 22 Jan 2024 10:25:04 +0100 Subject: [PATCH 411/484] mempolicy: clean up minor dead code in queue_pages_test_walk() commit 3efbe13e361acfd163fd1a2466e4fb9bed7dc1b0 upstream Conflicts: none Backport-reason: mempolicy fixes Commit 2cafb582173f ("mempolicy: remove confusing MPOL_MF_LAZY dead code") removes MPOL_MF_LAZY handling in queue_pages_test_walk(), and with that, there is no effective use of the local variable endvma in that function remaining. Remove the local variable endvma and its dead code. No functional change. This issue was identified with clang-analyzer's dead stores analysis. Link: https://lkml.kernel.org/r/20240122092504.18377-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mempolicy.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 87e6e8d9fc59..357463cc3b17 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -653,7 +653,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; - unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ @@ -681,9 +680,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, !(flags & MPOL_MF_STRICT)) return 1; - if (endvma > end) - endvma = end; - /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. -- Gitee From 0408327e59b10312ba56516d282c354dea62c316 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Tue, 16 Apr 2024 21:26:58 +0500 Subject: [PATCH 412/484] selftests: mm: fix unused and uninitialized variable warning commit 6db7412c142006985a15765785cf6c0c0dd75374 upstream Conflicts: none Backport-reason: THP fixes Fix the warnings by initializing and marking the variable as unused. I've caught the warnings by using clang. split_huge_page_test.c:303:6: warning: variable 'dummy' set but not used [-Wunused-but-set-variable] 303 | int dummy; | ^ split_huge_page_test.c:343:3: warning: variable 'dummy' is uninitialized when used here [-Wuninitialized] 343 | dummy += *(*addr + i); | ^~~~~ split_huge_page_test.c:303:11: note: initialize the variable 'dummy' to silence this warning 303 | int dummy; | ^ | = 0 2 warnings generated. 
Link: https://lkml.kernel.org/r/20240416162658.3353622-1-usama.anjum@collabora.com Fixes: fc4d182316bd ("mm: huge_memory: enable debugfs to split huge pages to any order") Signed-off-by: Muhammad Usama Anjum Reviewed-by: Zi Yan Cc: Bill Wendling Cc: Justin Stitt Cc: Muhammad Usama Anjum Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- tools/testing/selftests/mm/split_huge_page_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 061d26462d33..7c722b401b6b 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -325,7 +325,7 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, char **addr) { size_t i; - int dummy; + int __attribute__((unused)) dummy = 0; srand(time(NULL)); -- Gitee From 72fb844b6f2792036dcb302727821afd93d87030 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 7 Mar 2024 13:18:54 -0500 Subject: [PATCH 413/484] mm/huge_memory: skip invalid debugfs new_order input for folio split commit 2394aef616cf38fbf2e797c6845ccd35d76ce256 upstream Conflicts: none Backport-reason: mTHP fixes User can put arbitrary new_order via debugfs for folio split test. Although new_order check is added to split_huge_page_to_list_order() in the prior commit, these two additional checks can avoid unnecessary folio locking and split_folio_to_order() calls. Link: https://lkml.kernel.org/r/20240307181854.138928-2-zi.yan@sent.com Signed-off-by: Zi Yan Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-mm/7dda9283-b437-4cf8-ab0d-83c330deb9c0@moroto.mountain/ Cc: David Hildenbrand Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 083292f3e730..b7030b8b9182 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3643,6 +3643,9 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (!is_transparent_hugepage(folio)) goto next; + if (new_order >= folio_order(folio)) + goto next; + total++; /* * For folios with private, split_huge_page_to_list_to_order() @@ -3710,6 +3713,9 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, total++; nr_pages = folio_nr_pages(folio); + if (new_order >= folio_order(folio)) + goto next; + if (!folio_trylock(folio)) goto next; -- Gitee From d506f3fac534c74fdc0cadd78d3023ec5f77c3df Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 1 May 2024 16:31:18 +0100 Subject: [PATCH 414/484] XArray: set the marks correctly when splitting an entry commit 2a0774c2886d25f4d2987cd3e3813d16bf96f34f upstream Conflicts: none Backport-reason: THP fixes If we created a new node to replace an entry which had search marks set, we were setting the search mark on every entry in that node. That works fine when we're splitting to order 0, but when splitting to a larger order, we must not set the search marks on the sibling entries. 
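As a concrete example of what the sibling entries are (an illustrative
sketch; the loop mirrors the node_mark_slots() helper added below):
splitting an order-6 entry into order-3 children in a 64-slot node gives
xas->xa_sibs == 7, so each canonical child at offsets 0, 8, 16, ..., 56 is
followed by 7 sibling slots that must stay unmarked:

    for (i = 0; i < XA_CHUNK_SIZE; i += sibs + 1)
            node_set_mark(node, i, mark);   /* i = 0, 8, 16, ..., 56 */

Only 1 << (order - new_order) == 8 marked entries should then be found by a
marked iteration, which is exactly what the new check in check_split_1()
asserts.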
Link: https://lkml.kernel.org/r/20240501153120.4094530-1-willy@infradead.org Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Luis Chamberlain Link: https://lore.kernel.org/r/ZjFGCOYk3FK_zVy3@bombadil.infradead.org Tested-by: Luis Chamberlain Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- lib/test_xarray.c | 14 +++++++++++++- lib/xarray.c | 23 +++++++++++++++++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 542926da61a3..5c6953c4822f 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1555,9 +1555,11 @@ static void check_split_1(struct xarray *xa, unsigned long index, unsigned int order, unsigned int new_order) { XA_STATE_ORDER(xas, xa, index, new_order); - unsigned int i; + unsigned int i, found; + void *entry; xa_store_order(xa, index, order, xa, GFP_KERNEL); + xa_set_mark(xa, index, XA_MARK_1); xas_split_alloc(&xas, xa, order, GFP_KERNEL); xas_lock(&xas); @@ -1574,6 +1576,16 @@ static void check_split_1(struct xarray *xa, unsigned long index, xa_set_mark(xa, index, XA_MARK_0); XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0)); + xas_set_order(&xas, index, 0); + found = 0; + rcu_read_lock(); + xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_1) { + found++; + XA_BUG_ON(xa, xa_is_internal(entry)); + } + rcu_read_unlock(); + XA_BUG_ON(xa, found != 1 << (order - new_order)); + xa_destroy(xa); } diff --git a/lib/xarray.c b/lib/xarray.c index da79128ad754..652daf186688 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -969,8 +969,22 @@ static unsigned int node_get_marks(struct xa_node *node, unsigned int offset) return marks; } +static inline void node_mark_slots(struct xa_node *node, unsigned int sibs, + xa_mark_t mark) +{ + int i; + + if (sibs == 0) + node_mark_all(node, mark); + else { + for (i = 0; i < XA_CHUNK_SIZE; i += sibs + 1) + node_set_mark(node, i, mark); + } +} + static void node_set_marks(struct xa_node *node, unsigned int offset, - struct xa_node *child, unsigned int marks) + struct xa_node *child, unsigned int sibs, + unsigned int marks) { xa_mark_t mark = XA_MARK_0; @@ -978,7 +992,7 @@ static void node_set_marks(struct xa_node *node, unsigned int offset, if (marks & (1 << (__force unsigned int)mark)) { node_set_mark(node, offset, mark); if (child) - node_mark_all(child, mark); + node_mark_slots(child, sibs, mark); } if (mark == XA_MARK_MAX) break; @@ -1077,7 +1091,8 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) child->nr_values = xa_is_value(entry) ? 
XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); - node_set_marks(node, offset, child, marks); + node_set_marks(node, offset, child, xas->xa_sibs, + marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) @@ -1086,7 +1101,7 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) } else { unsigned int canon = offset - xas->xa_sibs; - node_set_marks(node, canon, NULL, marks); + node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], -- Gitee From 26a1b10626d3f875217ac6a24e3f1556502c2e8b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 May 2024 18:18:12 -0400 Subject: [PATCH 415/484] mm: page_alloc: fix highatomic typing in multi-block buddies commit 7cc5a5d65011983952a9c62f170f5b79e24b1239 upstream Conflicts: none Backport-reason: THP fixes Christoph reports a page allocator splat triggered by xfstests: generic/176 214s ... [ 1204.507931] run fstests generic/176 at 2024-05-27 12:52:30 XFS (nvme0n1): Mounting V5 Filesystem cd936307-415f-48a3-b99d-a2d52ae1f273 XFS (nvme0n1): Ending clean mount XFS (nvme1n1): Mounting V5 Filesystem ab3ee1a4-af62-4934-9a6a-6c2fde321850 XFS (nvme1n1): Ending clean mount XFS (nvme1n1): Unmounting Filesystem ab3ee1a4-af62-4934-9a6a-6c2fde321850 XFS (nvme1n1): Mounting V5 Filesystem 7099b02d-9c58-4d1d-be1d-2cc472d12cd9 XFS (nvme1n1): Ending clean mount ------------[ cut here ]------------ page type is 3, passed migratetype is 1 (nr=512) WARNING: CPU: 0 PID: 509870 at mm/page_alloc.c:645 expand+0x1c5/0x1f0 Modules linked in: i2c_i801 crc32_pclmul i2c_smbus [last unloaded: scsi_debug] CPU: 0 PID: 509870 Comm: xfs_io Not tainted 6.10.0-rc1+ #2437 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:expand+0x1c5/0x1f0 Code: 05 16 70 bf 02 01 e8 ca fc ff ff 8b 54 24 34 44 89 e1 48 c7 c7 80 a2 28 83 48 89 c6 b8 01 00 3 RSP: 0018:ffffc90003b2b968 EFLAGS: 00010082 RAX: 0000000000000000 RBX: ffffffff83fa9480 RCX: 0000000000000000 RDX: 0000000000000005 RSI: 0000000000000027 RDI: 00000000ffffffff RBP: 00000000001f2600 R08: 00000000fffeffff R09: 0000000000000001 R10: 0000000000000000 R11: ffffffff83676200 R12: 0000000000000009 R13: 0000000000000200 R14: 0000000000000001 R15: ffffea0007c98000 FS: 00007f72ca3d5780(0000) GS:ffff8881f9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f72ca1fff38 CR3: 00000001aa0c6002 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __warn+0x7b/0x120 ? expand+0x1c5/0x1f0 ? report_bug+0x191/0x1c0 ? handle_bug+0x3c/0x80 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? expand+0x1c5/0x1f0 ? expand+0x1c5/0x1f0 __rmqueue_pcplist+0x3a9/0x730 get_page_from_freelist+0x7a0/0xf00 __alloc_pages_noprof+0x153/0x2e0 __folio_alloc_noprof+0x10/0xa0 __filemap_get_folio+0x16b/0x370 iomap_write_begin+0x496/0x680 While trying to service a movable allocation (page type 1), the page allocator runs into a two-pageblock buddy on the movable freelist whose second block is typed as highatomic (page type 3). This inconsistency is caused by the highatomic reservation system operating on single pageblocks, while MAX_ORDER can be bigger than that - in this configuration, pageblock_order is 9 while MAX_PAGE_ORDER is 10. 
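Concretely, with pageblock_order = 9 and MAX_PAGE_ORDER = 10, a max-order
buddy covers 2^10 = 1024 pages, i.e. two 512-page pageblocks.  The reserve
and unreserve paths previously adjusted exactly one pageblock
(pageblock_nr_pages) per operation, so for such a buddy the second block's
migratetype and the nr_reserved_highatomic counter drift out of sync; the
fix below accounts the full 1 << order pages and retypes every pageblock in
the range when the order is at least pageblock_order.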
The test case is observed to make several adjacent order-3 requests with __GFP_DIRECT_RECLAIM cleared, which marks the surrounding block as highatomic. Upon freeing, the blocks merge into an order-10 buddy. When the highatomic pool is drained later on, this order-10 buddy gets moved back to the movable list, but only the first pageblock is marked movable again. A subsequent expand() of this buddy warns about the tail being of a different type. This is a long-standing bug that's surfaced by the recent block type warnings added to the allocator. The consequences seem mostly benign, it just results in odd behavior: the highatomic tail blocks are not properly drained, instead they end up on the movable list first, then go back to the highatomic list after an alloc-free cycle. To fix this, make the highatomic reservation code aware that allocations/buddies can be larger than a pageblock. While it's an old quirk, the recently added type consistency warnings seem to be the most prominent consequence of it. Set the Fixes: tag accordingly to highlight this backporting dependency. Link: https://lkml.kernel.org/r/20240530114203.GA1222079@cmpxchg.org Fixes: e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") Signed-off-by: Johannes Weiner Reported-by: Christoph Hellwig Reviewed-by: Zi Yan Tested-by: Christoph Hellwig Cc: Andy Shevchenko Cc: Baolin Wang Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 50 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc88d932e6f7..aa23203a64e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1979,10 +1979,12 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, } /* - * Reserve a pageblock for exclusive use of high-order atomic allocations if - * there are no empty page blocks that contain a page with a suitable order + * Reserve the pageblock(s) surrounding an allocation request for + * exclusive use of high-order atomic allocations if there are no + * empty page blocks that contain a page with a suitable order */ -static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone) { int mt; unsigned long max_managed, flags; @@ -2004,10 +2006,17 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) /* Yoink! */ mt = get_pageblock_migratetype(page); /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (migratetype_is_mergeable(mt)) - if (move_freepages_block(zone, page, mt, - MIGRATE_HIGHATOMIC) != -1) - zone->nr_reserved_highatomic += pageblock_nr_pages; + if (!migratetype_is_mergeable(mt)) + goto out_unlock; + + if (order < pageblock_order) { + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) + goto out_unlock; + zone->nr_reserved_highatomic += pageblock_nr_pages; + } else { + change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); + zone->nr_reserved_highatomic += 1 << order; + } out_unlock: spin_unlock_irqrestore(&zone->lock, flags); @@ -2019,7 +2028,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. 
* - * If @force is true, try to unreserve a pageblock even though highatomic + * If @force is true, try to unreserve pageblocks even though highatomic * pageblock is exhausted. */ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, @@ -2061,6 +2070,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * adjust the count once. */ if (is_migrate_highatomic(mt)) { + unsigned long size; /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2068,9 +2078,9 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * while unreserving so be safe and watch for * underflows. */ - zone->nr_reserved_highatomic -= min( - pageblock_nr_pages, - zone->nr_reserved_highatomic); + size = max(pageblock_nr_pages, 1UL << order); + size = min(size, zone->nr_reserved_highatomic); + zone->nr_reserved_highatomic -= size; } /* @@ -2082,11 +2092,19 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * of pageblocks that cannot be completely freed * may increase. */ - ret = move_freepages_block(zone, page, mt, - ac->migratetype); + if (order < pageblock_order) + ret = move_freepages_block(zone, page, mt, + ac->migratetype); + else { + move_to_free_list(page, zone, order, mt, + ac->migratetype); + change_pageblock_range(page, order, + ac->migratetype); + ret = 1; + } /* - * Reserving this block already succeeded, so this should - * not fail on zone boundaries. + * Reserving the block(s) already succeeded, + * so this should not fail on zone boundaries. */ WARN_ON_ONCE(ret == -1); if (ret > 0) { @@ -3481,7 +3499,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, * if the pageblock should be reserved for the future */ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) - reserve_highatomic_pageblock(page, zone); + reserve_highatomic_pageblock(page, order, zone); return page; } else { -- Gitee From ef541658e4a593c5aa50252ae79466d336f4511d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 31 May 2024 14:56:16 +0200 Subject: [PATCH 416/484] mm: read page_type using READ_ONCE commit dc9e6f7053dd540db2c9fba30c31a07de5edab01 upstream Conflicts: none Backport-reason: THP fixes KCSAN complains about possible data races: while we check for a page_type -- for example for sanity checks -- we might concurrently modify the mapcount that overlays page_type. Let's use READ_ONCE to avoid load tearing (shouldn't make a difference) and to make KCSAN happy. Likely, we might also want to use WRITE_ONCE for the writer side of page_type, if KCSAN ever complains about that. But we'll not mess with that for now. Note: nothing should really be broken besides wrong KCSAN complaints. The sanity check that triggers this was added in commit 68f0320824fa ("mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]()"). Even before that similar races likely where possible, ever since we added page_type in commit 6e292b9be7f4 ("mm: split page_type out from _mapcount"). 
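For context, the overlap that makes this check racy is the union in struct
page; a simplified sketch of the relevant members (the struct name below is
made up, this is not the full definition):

    struct page_like {
            union {
                    atomic_t _mapcount;     /* bumped concurrently by rmap code */
                    unsigned int page_type; /* checked locklessly by PageType() */
            };
    };

A plain read of page_type may be torn or re-read while another CPU updates
the overlapping mapcount; READ_ONCE() forces a single untorn load and
documents the intentional lockless access, which is what silences KCSAN.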
Link: https://lkml.kernel.org/r/20240531125616.2850153-1-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202405281431.c46a3be9-lkp@intel.com Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ff8bc1f5e747..28ac95fdbe13 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -951,9 +951,9 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #define PG_hugetlb 0x00000800 #define PageType(page, flag) \ - ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + ((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) #define folio_test_type(folio, flag) \ - ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) static inline int page_type_has_type(unsigned int page_type) { @@ -962,7 +962,7 @@ static inline int page_type_has_type(unsigned int page_type) static inline int page_has_type(struct page *page) { - return page_type_has_type(page->page_type); + return page_type_has_type(READ_ONCE(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ -- Gitee From 502de6e0b07789a9d7bf5b43ff41388e3a9044e2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 2 Apr 2024 21:06:54 +0100 Subject: [PATCH 417/484] hugetlb: convert alloc_buddy_hugetlb_folio to use a folio commit f6a8dd98a2ce7ace0be59d77868751131af6d2f0 upstream Conflicts: none Backport-reason: More folios While this function returned a folio, it was still using __alloc_pages() and __free_pages(). Use __folio_alloc() and put_folio() instead. This actually removes a call to compound_head(), but more importantly, it prepares us for the move to memdescs. Link: https://lkml.kernel.org/r/20240402200656.913841-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Sidhartha Kumar Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/hugetlb.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 08adf1c880c3..7715b834da54 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2096,13 +2096,13 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); - struct page *page; + struct folio *folio; bool alloc_try_hard = true; bool retry = true; /* - * By default we always try hard to allocate the page with - * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in + * By default we always try hard to allocate the folio with + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. 
@@ -2115,43 +2115,42 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - page = __alloc_pages(gfp_mask, order, nid, nmask); + folio = __folio_alloc(gfp_mask, order, nid, nmask); - /* Freeze head page */ - if (page && !page_ref_freeze(page, 1)) { - __free_pages(page, order); + if (folio && !folio_ref_freeze(folio, 1)) { + folio_put(folio); if (retry) { /* retry once */ retry = false; goto retry; } /* WOW! twice in a row. */ - pr_warn("HugeTLB head page unexpected inflated ref count\n"); - page = NULL; + pr_warn("HugeTLB unexpected inflated folio ref count\n"); + folio = NULL; } /* - * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this - * indicates an overall state change. Clear bit so that we resume - * normal 'try hard' allocations. + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a + * folio this indicates an overall state change. Clear bit so + * that we resume normal 'try hard' allocations. */ - if (node_alloc_noretry && page && !alloc_try_hard) + if (node_alloc_noretry && folio && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* - * If we tried hard to get a page but failed, set bit so that + * If we tried hard to get a folio but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. */ - if (node_alloc_noretry && !page && alloc_try_hard) + if (node_alloc_noretry && !folio && alloc_try_hard) node_set(nid, *node_alloc_noretry); - if (!page) { + if (!folio) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); - return page_folio(page); + return folio; } /* -- Gitee From 103ae36b02fbc87045caa9d3d0590ece80c01f0d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 9 Jul 2024 20:04:33 +0800 Subject: [PATCH 418/484] mm/hugetlb: fix kernel NULL pointer dereference when migrating hugetlb folio commit 1390a3334a48ecac5175865fd433d55eec255db8 upstream Conflicts: none Backport-reason: THP fixes A kernel crash was observed when migrating hugetlb folio: BUG: kernel NULL pointer dereference, address: 0000000000000008 PGD 0 P4D 0 Oops: Oops: 0002 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 3435 Comm: bash Not tainted 6.10.0-rc6-00450-g8578ca01f21f #66 RIP: 0010:__folio_undo_large_rmappable+0x70/0xb0 RSP: 0018:ffffb165c98a7b38 EFLAGS: 00000097 RAX: fffffbbc44528090 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffffa30e000a2800 RSI: 0000000000000246 RDI: ffffa3153ffffcc0 RBP: fffffbbc44528000 R08: 0000000000002371 R09: ffffffffbe4e5868 R10: 0000000000000001 R11: 0000000000000001 R12: ffffa3153ffffcc0 R13: fffffbbc44468000 R14: 0000000000000001 R15: 0000000000000001 FS: 00007f5b3a716740(0000) GS:ffffa3151fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000010959a000 CR4: 00000000000006f0 Call Trace: __folio_migrate_mapping+0x59e/0x950 __migrate_folio.constprop.0+0x5f/0x120 move_to_new_folio+0xfd/0x250 migrate_pages+0x383/0xd70 soft_offline_page+0x2ab/0x7f0 soft_offline_page_store+0x52/0x90 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x380/0x540 ksys_write+0x64/0xe0 do_syscall_64+0xb9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5b3a514887 RSP: 002b:00007ffe138fce68 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f5b3a514887 RDX: 000000000000000c RSI: 0000556ab809ee10 RDI: 0000000000000001 RBP: 0000556ab809ee10 R08: 00007f5b3a5d1460 R09: 000000007fffffff R10: 
0000000000000000 R11: 0000000000000246 R12: 000000000000000c R13: 00007f5b3a61b780 R14: 00007f5b3a617600 R15: 00007f5b3a616a00 It's because hugetlb folio is passed to __folio_undo_large_rmappable() unexpectedly. large_rmappable flag is imperceptibly set to hugetlb folio since commit f6a8dd98a2ce ("hugetlb: convert alloc_buddy_hugetlb_folio to use a folio"). Then commit be9581ea8c05 ("mm: fix crashes from deferred split racing folio migration") makes folio_migrate_mapping() call folio_undo_large_rmappable() triggering the bug. Fix this issue by clearing large_rmappable flag for hugetlb folios. They don't need that flag set anyway. Link: https://lkml.kernel.org/r/20240709120433.4136700-1-linmiaohe@huawei.com Fixes: f6a8dd98a2ce ("hugetlb: convert alloc_buddy_hugetlb_folio to use a folio") Fixes: be9581ea8c05 ("mm: fix crashes from deferred split racing folio migration") Signed-off-by: Miaohe Lin Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7715b834da54..79973df6df63 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2116,6 +2116,9 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, nid = numa_mem_id(); retry: folio = __folio_alloc(gfp_mask, order, nid, nmask); + /* Ensure hugetlb folio won't have large_rmappable flag set. */ + if (folio) + folio_clear_large_rmappable(folio); if (folio && !folio_ref_freeze(folio, 1)) { folio_put(folio); -- Gitee From 9ddc66cac18c1c8ea6838ded107d2956f67cb9f1 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Fri, 28 Jun 2024 11:23:27 +0800 Subject: [PATCH 419/484] mm/shmem: fix input and output inconsistencies commit 843a2e24c24c5311831860c6b78ceacdd4627000 upstream Conflicts: none Backport-reason: Shmem mTHP fixes Commit 19eaf44954df ("mm: thp: support allocation of anonymous multi-size THP") added mTHP support for anonymous shmem. We can configure different policies through the multi-size THP sysfs interface for anonymous shmem. But when we configure the "advise" policy of /sys/kernel/mm/transparent_hugepage/hugepages-xxxkB/shmem_enabled, we cannot write the "advise", but write the "madvise", which is unreasonable. We should keep the output and input values consistent, which is more convenient for users. 
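As an illustration of the inconsistency (hypothetical shell session; the 64kB size and the exact option list shown on read depend on the architecture and kernel configuration):

    f=/sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled
    cat $f
    always inherit within_size advise [never]
    echo advise > $f      # rejected with EINVAL before this fix
    echo madvise > $f     # the only spelling accepted before this fix

After this change, writing "advise" succeeds, matching what the file reports on read.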
Link: https://lkml.kernel.org/r/20240628032327.16987-1-libang.li@antgroup.com Fixes: 61a57f1b1da9 ("mm: shmem: add multi-size THP sysfs interface for anonymous shmem") Signed-off-by: Bang Li Reviewed-by: Baolin Wang Cc: Bang Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 82c398081da7..dc065bb4b35c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5252,7 +5252,7 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, clear_bit(order, &huge_shmem_orders_madvise); set_bit(order, &huge_shmem_orders_within_size); spin_unlock(&huge_shmem_orders_lock); - } else if (sysfs_streq(buf, "madvise")) { + } else if (sysfs_streq(buf, "advise")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); -- Gitee From 5f32adf7af0230064e080feafba4168fb0542bb0 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Wed, 15 May 2024 10:47:54 +0800 Subject: [PATCH 420/484] mm/huge_memory: mark racy access onhuge_anon_orders_always commit 7f83bf14603ef41a44dc907594d749a283e22c37 upstream Conflicts: none Backport-reason: THP fixes huge_anon_orders_always is accessed lockless, it is better to use the READ_ONCE() wrapper. This is not fixing any visible bug, hopefully this can cease some KCSAN complains in the future. Also do that for huge_anon_orders_madvise. Link: https://lkml.kernel.org/r/20240515104754889HqrahFPePOIE1UlANHVAh@zte.com.cn Signed-off-by: Ran Xiaokai Acked-by: David Hildenbrand Reviewed-by: Lu Zhongjun Reviewed-by: xu xin Cc: Yang Yang Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 591f5da3618a..4dacc2cd5b34 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -206,8 +206,8 @@ static inline bool hugepage_flags_enabled(void) * So we don't need to look at huge_anon_orders_inherit. */ return hugepage_global_enabled() || - huge_anon_orders_always || - huge_anon_orders_madvise; + READ_ONCE(huge_anon_orders_always) || + READ_ONCE(huge_anon_orders_madvise); } static inline int highest_order(unsigned long orders) -- Gitee From 1878479dc3a3faddd84c9ab7a6851e3320d29d7d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 4 Jul 2024 10:10:50 +0100 Subject: [PATCH 421/484] mm: fix khugepaged activation policy commit 00f58104202c472e487f0866fbd38832523fd4f9 upstream Conflicts: none Backport-reason: THP fixes Since the introduction of mTHP, the docuementation has stated that khugepaged would be enabled when any mTHP size is enabled, and disabled when all mTHP sizes are disabled. There are 2 problems with this; 1. this is not what was implemented by the code and 2. this is not the desirable behavior. Desirable behavior is for khugepaged to be enabled when any PMD-sized THP is enabled, anon or file. (Note that file THP is still controlled by the top-level control so we must always consider that, as well as the PMD-size mTHP control for anon). khugepaged only supports collapsing to PMD-sized THP so there is no value in enabling it when PMD-sized THP is disabled. So let's change the code and documentation to reflect this policy. 
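For instance, on a 4K base-page system where PMD-sized THP corresponds to the hugepages-2048kB control (size and paths here are illustrative):

    echo never  > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
    echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
    # khugepaged should run: PMD-sized THP is enabled

    echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
    echo never  > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
    # khugepaged should not run (assuming file THP is not keeping it alive),
    # since only sizes it cannot collapse to are enabled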
Further, per-size enabled control modification events were not previously forwarded to khugepaged to give it an opportunity to start or stop. Consequently the following was resulting in khugepaged eroneously not being activated: echo never > /sys/kernel/mm/transparent_hugepage/enabled echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled [ryan.roberts@arm.com: v3] Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240705102849.2479686-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240704091051.2411934-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface") Closes: https://lore.kernel.org/linux-mm/7a0bbe69-1e3d-4263-b206-da007791a5c4@redhat.com/ Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Barry Song Cc: Jonathan Corbet Cc: Lance Yang Cc: Yang Shi Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/mm/transhuge.rst | 11 ++++---- include/linux/huge_mm.h | 12 -------- mm/huge_memory.c | 7 +++++ mm/khugepaged.c | 33 +++++++++++++++++----- 4 files changed, 38 insertions(+), 25 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index f526be5b4647..cebbf66f1142 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -202,12 +202,11 @@ PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size -khugepaged will be automatically started when one or more hugepage -sizes are enabled (either by directly setting "always" or "madvise", -or by setting "inherit" while the top-level enabled is set to "always" -or "madvise"), and it'll be automatically shutdown when the last -hugepage size is disabled (either by directly setting "never", or by -setting "inherit" while the top-level enabled is set to "never"). +khugepaged will be automatically started when PMD-sized THP is enabled +(either of the per-size anon control or the top-level control are set +to "always" or "madvise"), and it'll be automatically shutdown when +PMD-sized THP is disabled (when both the per-size anon control and the +top-level control are "never") Khugepaged controls ------------------- diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4dacc2cd5b34..690b27b65cac 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -198,18 +198,6 @@ static inline bool hugepage_global_always(void) (1< 0) { + int err; + + err = start_stop_khugepaged(); + if (err) + ret = err; + } return ret; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0fa355bc9cae..5d9906086ba9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -446,6 +446,26 @@ void khugepaged_enter_exec_vma(struct vm_area_struct *vma, } #endif +static bool hugepage_pmd_enabled(void) +{ + /* + * We cover both the anon and the file-backed case here; file-backed + * hugepages, when configured in, are determined by the global control. + * Anon pmd-sized hugepages are determined by the pmd-size control. 
+ */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + hugepage_global_enabled()) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_always)) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && + hugepage_global_enabled()) + return true; + return false; +} + void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; @@ -482,7 +502,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - hugepage_flags_enabled()) { + hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); @@ -2692,8 +2712,7 @@ static unsigned int khugepaged_scan_exec_mm_slot(unsigned int pages, int *result static int khugepaged_has_work(void) { - return !list_empty(&khugepaged_scan.mm_head) && - hugepage_flags_enabled(); + return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); } static int khugepaged_wait_event(void) @@ -2789,7 +2808,7 @@ static void khugepaged_wait_work(void) return; } - if (hugepage_flags_enabled()) + if (hugepage_pmd_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } @@ -2829,7 +2848,7 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - if (!hugepage_flags_enabled()) { + if (!hugepage_pmd_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } @@ -2879,7 +2898,7 @@ int start_stop_khugepaged(void) int err = 0; mutex_lock(&khugepaged_mutex); - if (hugepage_flags_enabled()) { + if (hugepage_pmd_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -2905,7 +2924,7 @@ int start_stop_khugepaged(void) void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); - if (hugepage_flags_enabled() && khugepaged_thread) + if (hugepage_pmd_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } -- Gitee From 3f1443126675dfd4ecf1841d556f1bb6c53c4a34 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 18 Jul 2024 16:36:07 +0800 Subject: [PATCH 422/484] mm: list_lru: fix UAF for memory cgroup commit 5161b48712dcd08ec427c450399d4d1483e21dea upstream Conflicts: none Backport-reason: ZSWAP & list_lru fix The mem_cgroup_from_slab_obj() is supposed to be called under rcu lock or cgroup_mutex or others which could prevent returned memcg from being freed. Fix it by adding missing rcu read lock. Found by code inspection. [songmuchun@bytedance.com: only grab rcu lock when necessary, per Vlastimil] Link: https://lkml.kernel.org/r/20240801024603.1865-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20240718083607.42068-1-songmuchun@bytedance.com Fixes: 0a97c01cd20b ("list_lru: allow explicit memcg and NUMA node selection") Signed-off-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Nhat Pham Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/list_lru.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 5a0a0ef3378a..953b6e8d8b20 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -85,6 +85,7 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } #endif /* CONFIG_MEMCG_KMEM */ +/* The caller must ensure the memcg lifetime. 
*/ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { @@ -109,14 +110,22 @@ EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { + bool ret; int nid = page_to_nid(virt_to_page(item)); - struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? - mem_cgroup_from_slab_obj(item) : NULL; - return list_lru_add(lru, item, nid, memcg); + if (list_lru_memcg_aware(lru)) { + rcu_read_lock(); + ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); + rcu_read_unlock(); + } else { + ret = list_lru_add(lru, item, nid, NULL); + } + + return ret; } EXPORT_SYMBOL_GPL(list_lru_add_obj); +/* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { @@ -139,11 +148,18 @@ EXPORT_SYMBOL_GPL(list_lru_del); bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { + bool ret; int nid = page_to_nid(virt_to_page(item)); - struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? - mem_cgroup_from_slab_obj(item) : NULL; - return list_lru_del(lru, item, nid, memcg); + if (list_lru_memcg_aware(lru)) { + rcu_read_lock(); + ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item)); + rcu_read_unlock(); + } else { + ret = list_lru_del(lru, item, nid, NULL); + } + + return ret; } EXPORT_SYMBOL_GPL(list_lru_del_obj); -- Gitee From 3e3a4721fc979201f3a039b63bd6def953db8acd Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 31 Jul 2024 13:46:19 +0800 Subject: [PATCH 423/484] mm: shmem: avoid allocating huge pages larger than MAX_PAGECACHE_ORDER for shmem commit b66b1b71d7ff5464d23a0ac6f73fae461b7264fd upstream Conflicts: none Backport-reason: Shmem mTHP swap Similar to commit d659b715e94ac ("mm/huge_memory: avoid PMD-size page cache if needed"), ARM64 can support 512MB PMD-sized THP when the base page size is 64KB, which is larger than the maximum supported page cache size MAX_PAGECACHE_ORDER. This is not expected. To fix this issue, use THP_ORDERS_ALL_FILE_DEFAULT for shmem to filter allowable huge orders. [baolin.wang@linux.alibaba.com: remove comment, per Barry] Link: https://lkml.kernel.org/r/c55d7ef7-78aa-4ed6-b897-c3e03a3f3ab7@linux.alibaba.com [wangkefeng.wang@huawei.com: remove local `orders'] Link: https://lkml.kernel.org/r/87769ae8-b6c6-4454-925d-1864364af9c8@huawei.com Link: https://lkml.kernel.org/r/117121665254442c3c7f585248296495e5e2b45c.1722404078.git.baolin.wang@linux.alibaba.com Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Signed-off-by: Baolin Wang Signed-off-by: Kefeng Wang Reviewed-by: Barry Song Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: Gavin Shan Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index dc065bb4b35c..f89ee091b400 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1681,11 +1681,6 @@ static unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma->vm_flags; - /* - * Check all the (large) orders below HPAGE_PMD_ORDER + 1 that - * are enabled for this vma. 
- */ - unsigned long orders = BIT(PMD_ORDER + 1) - 1; loff_t i_size; int order; @@ -1730,7 +1725,7 @@ static unsigned long shmem_allowable_huge_orders(struct inode *inode, if (global_huge) mask |= READ_ONCE(huge_shmem_orders_inherit); - return orders & mask; + return THP_ORDERS_ALL_FILE_DEFAULT & mask; } static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, -- Gitee From ab5b96d5859b04d3f01ed629307982ff4886ecbc Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 31 Jul 2024 13:46:20 +0800 Subject: [PATCH 424/484] mm: shmem: fix incorrect aligned index when checking conflicts commit 4cbf320b1500fe64fcef8c96ed74dfc1ae2c9e2c upstream Conflicts: none Backport-reason: Shmem mTHP swap In the shmem_suitable_orders() function, xa_find() is used to check for conflicts in the pagecache to select suitable huge orders. However, when checking each huge order in every loop, the aligned index is calculated from the previous iteration, which may cause suitable huge orders to be missed. We should use the original index each time in the loop to calculate a new aligned index for checking conflicts to avoid this issue. Link: https://lkml.kernel.org/r/07433b0f16a152bffb8cee34934a5c040e8e2ad6.1722404078.git.baolin.wang@linux.alibaba.com Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Barry Song <21cnbao@gmail.com> Cc: Gavin Shan Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Zi Yan Cc: Barry Song Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f89ee091b400..b02ad95b0932 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1733,6 +1733,7 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault unsigned long orders) { struct vm_area_struct *vma = vmf->vma; + pgoff_t aligned_index; unsigned long pages; int order; @@ -1744,9 +1745,9 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault order = highest_order(orders); while (orders) { pages = 1UL << order; - index = round_down(index, pages); - if (!xa_find(&mapping->i_pages, &index, - index + pages - 1, XA_PRESENT)) + aligned_index = round_down(index, pages); + if (!xa_find(&mapping->i_pages, &aligned_index, + aligned_index + pages - 1, XA_PRESENT)) break; order = next_order(&orders, order); } -- Gitee From fec25d8104d79f68ebf17b5286bcfc86c968bd45 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Fri, 23 Aug 2024 16:27:06 +0000 Subject: [PATCH 425/484] mm/memcontrol: respect zswap.writeback setting from parent cg too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e399257349098bf7c84343f99efb2bc9c22eb9fd upstream Conflicts: none Backport-reason: ZSWAP Fixes Currently, the behavior of zswap.writeback wrt. the cgroup hierarchy seems a bit odd. Unlike zswap.max, it doesn't honor the value from parent cgroups. This surfaced when people tried to globally disable zswap writeback, i.e. reserve physical swap space only for hibernation [1] - disabling zswap.writeback only for the root cgroup results in subcgroups with zswap.writeback=1 still performing writeback. The inconsistency became more noticeable after I introduced the MemoryZSwapWriteback= systemd unit setting [2] for controlling the knob. The patch assumed that the kernel would enforce the value of parent cgroups. 
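In other words, it assumed hierarchical semantics like the following, which this patch now implements (cgroup paths are hypothetical):

    echo 0 > /sys/fs/cgroup/workload.slice/memory.zswap.writeback
    echo 1 > /sys/fs/cgroup/workload.slice/app.scope/memory.zswap.writeback
    # app.scope still performs no zswap writeback, because an ancestor
    # disabled it, matching how memory.zswap.max applies to descendants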
It could probably be workarounded from systemd's side, by going up the slice unit tree and inheriting the value. Yet I think it's more sensible to make it behave consistently with zswap.max and friends. [1] https://wiki.archlinux.org/title/Power_management/Suspend_and_hibernate#Disable_zswap_writeback_to_use_the_swap_space_only_for_hibernation [2] https://github.com/systemd/systemd/pull/31734 Link: https://lkml.kernel.org/r/20240823162506.12117-1-me@yhndnzj.com Fixes: 501a06fe8e4c ("zswap: memcontrol: implement zswap writeback disabling") Signed-off-by: Mike Yuan Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/admin-guide/cgroup-v2.rst | 7 ++++--- mm/memcontrol.c | 12 +++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c9df14a23707..d3d6227ded39 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1762,9 +1762,10 @@ PAGE_SIZE multiple when read back. entries fault back in or are written out to disk. memory.zswap.writeback - A read-write single value file. The default value is "1". The - initial value of the root cgroup is 1, and when a new cgroup is - created, it inherits the current value of its parent. + A read-write single value file. The default value is "1". + Note that this setting is hierarchical, i.e. the writeback would be + implicitly disabled for child cgroups if the upper hierarchy + does so. When this is set to 0, all swapping attempts to swapping devices are disabled. This included both zswap writebacks, and swapping due diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53ec5de48083..f8568e8d47b7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7342,8 +7342,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg->pagecache_max_ratio = PAGECACHE_MAX_RATIO_MAX; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) memcg->zswap_max = PAGE_COUNTER_MAX; - WRITE_ONCE(memcg->zswap_writeback, - !parent || READ_ONCE(parent->zswap_writeback)); + WRITE_ONCE(memcg->zswap_writeback, true); #endif #ifdef CONFIG_MEMCG_ZRAM memcg->zram_max = PAGE_COUNTER_MAX; @@ -10311,7 +10310,14 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ - return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); + if (!zswap_is_enabled()) + return true; + + for (; memcg; memcg = parent_mem_cgroup(memcg)) + if (!READ_ONCE(memcg->zswap_writeback)) + return false; + + return true; } static u64 zswap_current_read(struct cgroup_subsys_state *css, -- Gitee From 650bed85be9b171c62f755ea91d4a0baaf75fa31 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Tue, 24 Sep 2024 22:00:53 +0900 Subject: [PATCH 426/484] mm: migrate: annotate data-race in migrate_folio_unmap() commit 8001070cfbec5cd4ea00b8b48ea51df91122f265 upstream Conflicts: none Backport-reason: mTHP fixes I found a report from syzbot [1] This report shows that the value can be changed, but in reality, the value of __folio_set_movable() cannot be changed because it holds the folio refcount. Therefore, it is appropriate to add an annotate to make KCSAN ignore that data-race. 
[1] ================================================================== BUG: KCSAN: data-race in __filemap_remove_folio / migrate_pages_batch write to 0xffffea0004b81dd8 of 8 bytes by task 6348 on cpu 0: page_cache_delete mm/filemap.c:153 [inline] __filemap_remove_folio+0x1ac/0x2c0 mm/filemap.c:233 filemap_remove_folio+0x6b/0x1f0 mm/filemap.c:265 truncate_inode_folio+0x42/0x50 mm/truncate.c:178 shmem_undo_range+0x25b/0xa70 mm/shmem.c:1028 shmem_truncate_range mm/shmem.c:1144 [inline] shmem_evict_inode+0x14d/0x530 mm/shmem.c:1272 evict+0x2f0/0x580 fs/inode.c:731 iput_final fs/inode.c:1883 [inline] iput+0x42a/0x5b0 fs/inode.c:1909 dentry_unlink_inode+0x24f/0x260 fs/dcache.c:412 __dentry_kill+0x18b/0x4c0 fs/dcache.c:615 dput+0x5c/0xd0 fs/dcache.c:857 __fput+0x3fb/0x6d0 fs/file_table.c:439 ____fput+0x1c/0x30 fs/file_table.c:459 task_work_run+0x13a/0x1a0 kernel/task_work.c:228 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop kernel/entry/common.c:114 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0xbe/0x130 kernel/entry/common.c:218 do_syscall_64+0xd6/0x1c0 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f read to 0xffffea0004b81dd8 of 8 bytes by task 6342 on cpu 1: __folio_test_movable include/linux/page-flags.h:699 [inline] migrate_folio_unmap mm/migrate.c:1199 [inline] migrate_pages_batch+0x24c/0x1940 mm/migrate.c:1797 migrate_pages_sync mm/migrate.c:1963 [inline] migrate_pages+0xff1/0x1820 mm/migrate.c:2072 do_mbind mm/mempolicy.c:1390 [inline] kernel_mbind mm/mempolicy.c:1533 [inline] __do_sys_mbind mm/mempolicy.c:1607 [inline] __se_sys_mbind+0xf76/0x1160 mm/mempolicy.c:1603 __x64_sys_mbind+0x78/0x90 mm/mempolicy.c:1603 x64_sys_call+0x2b4d/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:238 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0xffff888127601078 -> 0x0000000000000000 Link: https://lkml.kernel.org/r/20240924130053.107490-1-aha310510@gmail.com Fixes: 7e2a5e5ab217 ("mm: migrate: use __folio_test_movable()") Signed-off-by: Jeongjun Park Reported-by: syzbot Acked-by: David Hildenbrand Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index dc0af1856bff..7c49ea21b245 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1126,7 +1126,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, int rc = -EAGAIN; int old_page_state = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = !__folio_test_movable(src); + bool is_lru = data_race(!__folio_test_movable(src)); bool locked = false; bool dst_locked = false; -- Gitee From 19a33e6586a83bb6b1a853f0781de8f1e3b970fa Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 8 Oct 2024 22:13:29 +0300 Subject: [PATCH 427/484] mm: remove unused stub for can_swapin_thp() commit a5e8eb25135a48d400e5a695ba9329bc632c3bb4 upstream Conflicts: none Backport-reason: THP fixes When can_swapin_thp() is unused, it prevents kernel builds with clang, `make W=1` and CONFIG_WERROR=y: mm/memory.c:4184:20: error: unused function 'can_swapin_thp' [-Werror,-Wunused-function] Fix this by removing the unused stub. 
See also commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build"). Link: https://lkml.kernel.org/r/20241008191329.2332346-1-andriy.shevchenko@linux.intel.com Fixes: 242d12c98174 ("mm: support large folios swap-in for sync io devices") Signed-off-by: Andy Shevchenko Acked-by: Barry Song Cc: Bill Wendling Cc: Chuanhua Han Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 374e7c13f228..fec5ec77c319 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4064,11 +4064,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) return __alloc_swap_folio(vmf); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) -{ - return false; -} - static struct folio *alloc_swap_folio(struct vm_fault *vmf) { return __alloc_swap_folio(vmf); -- Gitee From a2ecda4b83a65d43a25de8398ac6de6e56454be8 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 25 Oct 2024 06:09:42 +0000 Subject: [PATCH 428/484] mm: shrinker: avoid memleak in alloc_shrinker_info commit 15e8156713cc38031642fafc8baf7d53f19f2e83 upstream Conflicts: none Backport-reason: Shrinker fixes A memleak was found as below: unreferenced object 0xffff8881010d2a80 (size 32): comm "mkdir", pid 1559, jiffies 4294932666 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 40 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 @............... backtrace (crc 2e7ef6fa): [] __kmalloc_node_noprof+0x394/0x470 [] alloc_shrinker_info+0x7b/0x1a0 [] mem_cgroup_css_online+0x11a/0x3b0 [] online_css+0x29/0xa0 [] cgroup_apply_control_enable+0x20d/0x360 [] cgroup_mkdir+0x168/0x5f0 [] kernfs_iop_mkdir+0x5e/0x90 [] vfs_mkdir+0x144/0x220 [] do_mkdirat+0x87/0x130 [] __x64_sys_mkdir+0x49/0x70 [] do_syscall_64+0x68/0x140 [] entry_SYSCALL_64_after_hwframe+0x76/0x7e alloc_shrinker_info(), when shrinker_unit_alloc() returns an errer, the info won't be freed. Just fix it. Link: https://lkml.kernel.org/r/20241025060942.1049263-1-chenridong@huaweicloud.com Fixes: 307bececcd12 ("mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}") Signed-off-by: Chen Ridong Acked-by: Qi Zheng Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Acked-by: Kirill A. 
Shutemov Reviewed-by: Dave Chinner Cc: Anshuman Khandual Cc: Muchun Song Cc: Wang Weiyang Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shrinker.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index dc5d2a6fcfc4..4a93fd433689 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -76,19 +76,21 @@ void free_shrinker_info(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg) { - struct shrinker_info *info; int nid, ret = 0; int array_size = 0; mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); + struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size, + GFP_KERNEL, nid); if (!info) goto err; info->map_nr_max = shrinker_nr_max; - if (shrinker_unit_alloc(info, NULL, nid)) + if (shrinker_unit_alloc(info, NULL, nid)) { + kvfree(info); goto err; + } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } mutex_unlock(&shrinker_mutex); -- Gitee From 9bf7c5318ddb93999402f09c7b2077ed43f88544 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 10 Oct 2024 14:15:56 +0800 Subject: [PATCH 429/484] mm: remove unused hugepage for vma_alloc_folio() commit 6359c39c9de66dede8ff5ff257c9e117483dbc7c upstream Conflicts: minor Backport-reason: a nice cleanup to kill redundant param. The hugepage parameter was deprecated since commit ddc1a5cbc05d ("mempolicy: alloc_pages_mpol() for NUMA policy without vma"), for PMD-sized THP, it still tries only preferred node if possible in vma_alloc_folio() by checking the order of the folio allocation. Link: https://lkml.kernel.org/r/20241010061556.1846751-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- arch/alpha/include/asm/page.h | 2 +- arch/arm64/mm/fault.c | 2 +- arch/m68k/include/asm/page_no.h | 2 +- arch/s390/include/asm/page.h | 2 +- arch/x86/include/asm/page.h | 2 +- include/linux/gfp.h | 6 +++--- include/linux/highmem.h | 2 +- mm/huge_memory.c | 2 +- mm/ksm.c | 2 +- mm/memory.c | 10 ++++------ mm/mempolicy.c | 3 +-- mm/userfaultfd.c | 2 +- 12 files changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 4db1ebc0ed99..6484c6382ed6 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -18,7 +18,7 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index b5600b9e58c9..4c97af26e49a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -1911,7 +1911,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, if (vma->vm_flags & VM_MTE) flags |= __GFP_ZEROTAGS; - return vma_alloc_folio(flags, 0, vma, vaddr, false); + return vma_alloc_folio(flags, 0, vma, vaddr); } void tag_clear_highpage(struct page *page) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index af3a10973233..63c0e706084b 
100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -14,7 +14,7 @@ extern unsigned long memory_end; #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index cfec0743314e..be2f69d0e85d 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -74,7 +74,7 @@ static inline void copy_page(void *to, void *from) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) /* * These are used to make use of C type-checking.. diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 1b93ff80b43b..c9fe207916f4 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -35,7 +35,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b20f51dbc3cc..4c7856785661 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -269,7 +269,7 @@ struct folio *folio_alloc(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage); + unsigned long addr); #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -289,14 +289,14 @@ static inline struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order, { return folio_alloc(gfp, order); } -#define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ +#define vma_alloc_folio(gfp, order, vma, addr) \ folio_alloc(gfp, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) static inline struct page *alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false); + struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr); return &folio->page; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 714966d5701e..532ea9a9214d 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -226,7 +226,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, { struct folio *folio; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio) clear_user_highpage(&folio->page, vaddr); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2cd0038607aa..7657ebf369a5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1326,7 +1326,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; } gfp = vma_thp_gfp_mask(vma); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + folio = 
vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); diff --git a/mm/ksm.c b/mm/ksm.c index 9aafdc73efa2..45b74100ba83 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2815,7 +2815,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio, if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); diff --git a/mm/memory.c b/mm/memory.c index fec5ec77c319..5c2d6594e58e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1047,8 +1047,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm, if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - addr, false); + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; @@ -3913,8 +3912,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) struct folio *folio; swp_entry_t entry; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - vmf->address, false); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; @@ -4050,7 +4048,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr, true); + folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) @@ -4571,7 +4569,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr, true); + folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 357463cc3b17..2d85f722c103 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2154,7 +2154,6 @@ struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order, * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: Unused (was: For hugepages try only preferred node if possible). * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the @@ -2165,7 +2164,7 @@ struct folio *folio_alloc_mpol(gfp_t gfp, unsigned int order, * Return: The folio on success or NULL if allocation fails. 
*/ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) + unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index dcf24184f985..f6d533adca55 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -151,7 +151,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, if (!*foliop) { ret = -ENOMEM; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, - dst_addr, false); + dst_addr); if (!folio) goto out; -- Gitee From 6a5dbfcf5accfb446b8662c78f7fe736ac77da5c Mon Sep 17 00:00:00 2001 From: Luoxi Li Date: Fri, 18 Oct 2024 17:22:35 +0800 Subject: [PATCH 430/484] mm: remove unused has_isolate_pageblock commit b7f058f827392022d8c689329f88c7b324d71dad upstream Conflicts: none Backport-reason: a nice cleanup has_isolate_pageblock() has been unused since commit 55612e80e722 ("mm: page_alloc: close migratetype race between freeing and stealing") Remove it. Link: https://lkml.kernel.org/r/20241018092235.2764859-1-kaixa@kiloview.com Signed-off-by: Luoxi Li Acked-by: David Hildenbrand Reviewed-by: Muhammad Usama Anjum Acked-by: Johannes Weiner Reviewed-by: Anshuman Khandual Cc: Baolin Wang Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/page-isolation.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index c16db0067090..73dc2c1841ec 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -3,10 +3,6 @@ #define __LINUX_PAGEISOLATION_H #ifdef CONFIG_MEMORY_ISOLATION -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return zone->nr_isolate_pageblock; -} static inline bool is_migrate_isolate_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; @@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype) return migratetype == MIGRATE_ISOLATE; } #else -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return false; -} static inline bool is_migrate_isolate_page(struct page *page) { return false; -- Gitee From 3ddc52002c09cf258de79953ac79c74feedeaa8e Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 28 Oct 2024 12:26:53 -0600 Subject: [PATCH 431/484] mm/page_alloc: keep track of free highatomic commit c928807f6f6b6d595a7e199591ae297c81de3aeb upstream Conflicts: none Backport-reason: mm fixes OOM kills due to vastly overestimated free highatomic reserves were observed: ... invoked oom-killer: gfp_mask=0x100cca(GFP_HIGHUSER_MOVABLE), order=0 ... Node 0 Normal free:1482936kB boost:0kB min:410416kB low:739404kB high:1068392kB reserved_highatomic:1073152KB ... Node 0 Normal: 1292*4kB (ME) 1920*8kB (E) 383*16kB (UE) 220*32kB (ME) 340*64kB (E) 2155*128kB (UE) 3243*256kB (UE) 615*512kB (U) 1*1024kB (M) 0*2048kB 0*4096kB = 1477408kB The second line above shows that the OOM kill was due to the following condition: free (1482936kB) - reserved_highatomic (1073152kB) = 409784KB < min (410416kB) And the third line shows there were no free pages in any MIGRATE_HIGHATOMIC pageblocks, which otherwise would show up as type 'H'. Therefore __zone_watermark_unusable_free() underestimated the usable free memory by over 1GB, which resulted in the unnecessary OOM kill above. The comments in __zone_watermark_unusable_free() warns about the potential risk, i.e., If the caller does not have rights to reserves below the min watermark then subtract the high-atomic reserves. 
This will over-estimate the size of the atomic reserve but it avoids a search. However, it is possible to keep track of free pages in reserved highatomic pageblocks with a new per-zone counter nr_free_highatomic protected by the zone lock, to avoid a search when calculating the usable free memory. And the cost would be minimal, i.e., simple arithmetics in the highatomic alloc/free/move paths. Note that since nr_free_highatomic can be relatively small, using a per-cpu counter might cause too much drift and defeat its purpose, in addition to the extra memory overhead. Dependson e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting") - see [1] [akpm@linux-foundation.org: s/if/else if/, per Johannes, stealth whitespace tweak] Link: https://lkml.kernel.org/r/20241028182653.3420139-1-yuzhao@google.com Link: https://lkml.kernel.org/r/0d0ddb33-fcdc-43e2-801f-0c1df2031afb@suse.cz [1] Fixes: 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") Signed-off-by: Yu Zhao Reported-by: Link Lin Acked-by: David Rientjes Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4ff96faa356a..67c7e48c424b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -866,6 +866,7 @@ struct zone { unsigned long watermark_boost; unsigned long nr_reserved_highatomic; + unsigned long nr_free_highatomic; /* * We don't know if the memory that we're going to allocate will be diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa23203a64e0..374b683c3302 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -633,6 +633,8 @@ compaction_capture(struct capture_control *capc, struct page *page, static inline void account_freepages(struct zone *zone, int nr_pages, int migratetype) { + lockdep_assert_held(&zone->lock); + if (is_migrate_isolate(migratetype)) return; @@ -640,6 +642,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages, if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); + else if (is_migrate_highatomic(migratetype)) + WRITE_ONCE(zone->nr_free_highatomic, + zone->nr_free_highatomic + nr_pages); } /* Used for pages not on another list */ @@ -3117,11 +3122,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z, /* * If the caller does not have rights to reserves below the min - * watermark then subtract the high-atomic reserves. This will - * over-estimate the size of the atomic reserve but it avoids a search. + * watermark then subtract the free pages reserved for highatomic. */ if (likely(!(alloc_flags & ALLOC_RESERVES))) - unusable_free += z->nr_reserved_highatomic; + unusable_free += READ_ONCE(z->nr_free_highatomic); #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ -- Gitee From e3da1be227e585bf50e0120e0140f65f7fd11e0f Mon Sep 17 00:00:00 2001 From: gaoxu Date: Sat, 12 Apr 2025 09:27:24 +0000 Subject: [PATCH 432/484] mm: add nr_free_highatomic in show_free_areas commit 4f219913c1368a372b07832451cb86ea3a1df31c upstream Conflicts: none Backport-reason: More mm statistic commit c928807f6f6b6("mm/page_alloc: keep track of free highatomic") adds a new variable nr_free_highatomic, which is useful for analyzing low mem issues. add nr_free_highatomic in show_free_areas. 
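With the extra field, the per-zone line of the dump becomes something like the following (numbers borrowed from the OOM report quoted in the changelog of commit c928807f6f6b6; the free_highatomic value itself is illustrative):

    Node 0 Normal free:1482936kB boost:0kB min:410416kB low:739404kB high:1068392kB reserved_highatomic:1073152KB free_highatomic:0KB (followed by the existing counters)

In that report, seeing free_highatomic:0KB next to the large reserved_highatomic value would have made the overestimated reserve obvious at a glance.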
Signed-off-by: gao xu Link: https://lkml.kernel.org/r/d92eeff74f7a4578a14ac777cfe3603a@honor.com Acked-by: David Hildenbrand Reviewed-by: Barry Song Acked-by: David Rientjes Reviewed-by: Anshuman Khandual Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/show_mem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/show_mem.c b/mm/show_mem.c index 807a297f9240..f80fb6d30118 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -316,6 +316,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " low:%lukB" " high:%lukB" " reserved_highatomic:%luKB" + " free_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" @@ -337,6 +338,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone->nr_reserved_highatomic), + K(zone->nr_free_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), -- Gitee From 25346db344238fbdbc653c1d9eebb8b799ceb4dd Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 27 Oct 2024 12:33:21 +0000 Subject: [PATCH 433/484] mm/mlock: set the correct prev on failure commit faa242b1d2a97143150bdc50d5b61fd70fcd17cd upstream Conflicts: none Backport-reason: mm fixes After commit 94d7d9233951 ("mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al."), if vma_modify_flags() return error, the vma is set to an error code. This will lead to an invalid prev be returned. Generally this shouldn't matter as the caller should treat an error as indicating state is now invalidated, however unfortunately apply_mlockall_flags() does not check for errors and assumes that mlock_fixup() correctly maintains prev even if an error were to occur. This patch fixes that assumption. [lorenzo.stoakes@oracle.com: provide a better fix and rephrase the log] Link: https://lkml.kernel.org/r/20241027123321.19511-1-richard.weiyang@gmail.com Fixes: 94d7d9233951 ("mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al.") Signed-off-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Vlastimil Babka Cc: Jann Horn Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/mlock.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 826a93c4af2c..d374444c8273 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -736,14 +736,17 @@ static int apply_mlockall_flags(int flags) } for_each_vma(vmi, vma) { + int error; vm_flags_t newflags; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; - /* Ignore errors */ - mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, - newflags); + error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, + newflags); + /* Ignore errors, but prev needs fixing up. */ + if (error) + prev = vma; cond_resched(); } out: -- Gitee From 0b49e42662f46eb9299bc5b3d3924bd9af68f363 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 4 Dec 2024 22:50:06 -0800 Subject: [PATCH 434/484] mm: shmem: fix ShmemHugePages at swapout commit dad2dc9c92e0f93f33cebcb0595b8daa3d57473f upstream Conflicts: none Backport-reason: Shmem mTHP swap /proc/meminfo ShmemHugePages has been showing overlarge amounts (more than Shmem) after swapping out THPs: we forgot to update NR_SHMEM_THPS. 
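The symptom is visible directly in /proc/meminfo after swapping out shmem THPs, e.g. (illustrative values):

    Shmem:            262144 kB
    ShmemHugePages:  1048576 kB

i.e. ShmemHugePages larger than Shmem, which can never be correct since huge shmem pages are a subset of shmem pages.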
Add shmem_update_stats(), to avoid repetition, and risk of making that mistake again: the call from shmem_delete_from_page_cache() is the bugfix; the call from shmem_replace_folio() is reassuring, but not really a bugfix (replace corrects misplaced swapin readahead, but huge swapin readahead would be a mistake). Link: https://lkml.kernel.org/r/5ba477c8-a569-70b5-923e-09ab221af45b@google.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Hugh Dickins Reviewed-by: Shakeel Butt Reviewed-by: Yosry Ahmed Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b02ad95b0932..3fd652d966d6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -781,6 +781,14 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ @@ -828,10 +836,7 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); @@ -859,8 +864,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); @@ -2062,10 +2066,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); - __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); - __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); -- Gitee From 63091f0c60df07d573f457a610016ac7c54a2e61 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 10 Dec 2024 10:34:37 +0100 Subject: [PATCH 435/484] mm/page_alloc: don't call pfn_to_page() on possibly non-existent PFN in split_large_buddy() commit faeec8e23c10bd30e8aa759a2eb3018dae00f924 upstream Conflicts: major, due to lack of e98337d11bbd Backport-reason: mTHP & mm fixes In split_large_buddy(), we might call pfn_to_page() on a PFN that might not exist. In corner cases, such as when freeing the highest pageblock in the last memory section, this could result with CONFIG_SPARSEMEM && !CONFIG_SPARSEMEM_EXTREME in __pfn_to_section() returning NULL and and __section_mem_map_addr() dereferencing that NULL pointer. Let's fix it, and avoid doing a pfn_to_page() call for the first iteration, where we already have the page. 
So far this was found by code inspection, but let's just CC stable as the fix is easy. Link: https://lkml.kernel.org/r/20241210093437.174413-1-david@redhat.com Fixes: fd919a85cd55 ("mm: page_isolation: prepare for hygienic freelists") Signed-off-by: David Hildenbrand Reported-by: Vlastimil Babka Closes: https://lkml.kernel.org/r/e1a898ba-a717-4d20-9144-29df1a6c8813@suse.cz Reviewed-by: Vlastimil Babka Reviewed-by: Zi Yan Acked-by: Johannes Weiner Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 374b683c3302..225506e19165 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1701,7 +1701,7 @@ static unsigned long find_large_buddy(unsigned long start_pfn) static void split_large_buddy(struct zone *zone, struct page *page, unsigned long pfn, int order) { - unsigned long end_pfn = pfn + (1 << order); + unsigned long end = pfn + (1 << order); VM_WARN_ON_ONCE(order <= pageblock_order); VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); @@ -1709,13 +1709,15 @@ static void split_large_buddy(struct zone *zone, struct page *page, /* Caller removed page from freelist, buddy info cleared! */ VM_WARN_ON_ONCE(PageBuddy(page)); - while (pfn != end_pfn) { + do { int mt = get_pfnblock_migratetype(page, pfn); __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); pfn += pageblock_nr_pages; + if (pfn == end) + break; page = pfn_to_page(pfn); - } + } while (1); } /** -- Gitee From 12a7b25deba15ce6ee82f5aa90b7a1451c2f8cbd Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Wed, 26 Feb 2025 11:23:41 -0500 Subject: [PATCH 436/484] mm: fix finish_fault() handling for large folios commit 34b82f33cf3f03bc39e9a205a913d790e1520ade upstream Conflicts: none Backport-reason: THP & mm fixes When handling faults for anon shmem finish_fault() will attempt to install ptes for the entire folio. Unfortunately if it encounters a single non-pte_none entry in that range it will bail, even if the pte that triggered the fault is still pte_none. When this situation happens the fault will be retried endlessly never making forward progress. This patch fixes this behavior and if it detects that a pte in the range is not pte_none it will fall back to setting a single pte. [bgeffon@google.com: tweak whitespace] Link: https://lkml.kernel.org/r/20250227133236.1296853-1-bgeffon@google.com Link: https://lkml.kernel.org/r/20250226162341.915535-1-bgeffon@google.com Fixes: 43e027e41423 ("mm: memory: extend finish_fault() to support large folio") Signed-off-by: Brian Geffon Suggested-by: Baolin Wang Reported-by: Marek Maslanka Cc: Hugh Dickins Cc: David Hildenbrand Cc: Hugh Dickens Cc: Kefeng Wang Cc: Matthew Wilcow (Oracle) Cc: Suren Baghdasaryan Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5c2d6594e58e..cd7e8375803e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4941,7 +4941,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf) bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; - unsigned long addr = vmf->address; + unsigned long addr; + bool needs_fallback = false; + +fallback: + addr = vmf->address; /* Did we COW the page? 
*/ if (is_cow) @@ -4980,7 +4984,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) { + if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || + unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); @@ -5016,9 +5021,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { - update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); - ret = VM_FAULT_NOPAGE; - goto unlock; + needs_fallback = true; + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto fallback; } folio_ref_add(folio, nr_pages - 1); -- Gitee From 422d1d59d0cb8baeef64031cbdc405dea7375a08 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 28 Feb 2024 16:42:36 +0000 Subject: [PATCH 437/484] mm: use folio more widely in __split_huge_page commit 435a75548109f19e5b5b14ae35b9acb063c084e9 upstream Conflicts: none Backport-reason: More folios We already have a folio; use it instead of the head page where reasonable. Saves a couple of calls to compound_head() and elimimnates a few references to page->mapping. Link: https://lkml.kernel.org/r/20240228164326.1355045-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Sidhartha Kumar Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7657ebf369a5..ebdb5aefc0e8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3061,7 +3061,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (head[i].index >= end) { struct folio *tail = page_folio(head + i); - if (shmem_mapping(head->mapping)) + if (shmem_mapping(folio->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, @@ -3069,7 +3069,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, __filemap_remove_folio(tail, NULL); folio_put(tail); } else if (!PageAnon(page)) { - __xa_store(&head->mapping->i_pages, head[i].index, + __xa_store(&folio->mapping->i_pages, head[i].index, head + i, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, @@ -3090,23 +3090,23 @@ static void __split_huge_page(struct page *page, struct list_head *list, split_page_owner(head, order, new_order); /* See comment in __split_huge_page_tail() */ - if (PageAnon(head)) { + if (folio_test_anon(folio)) { /* Additional pin to swap cache */ - if (PageSwapCache(head)) { - page_ref_add(head, 1 + new_nr); + if (folio_test_swapcache(folio)) { + folio_ref_add(folio, 1 + new_nr); xa_unlock(&swap_cache->i_pages); } else { - page_ref_inc(head); + folio_ref_inc(folio); } } else { /* Additional pin to page cache */ - page_ref_add(head, 1 + new_nr); - xa_unlock(&head->mapping->i_pages); + folio_ref_add(folio, 1 + new_nr); + xa_unlock(&folio->mapping->i_pages); } local_irq_enable(); if (nr_dropped) - shmem_uncharge(head->mapping->host, nr_dropped); + shmem_uncharge(folio->mapping->host, nr_dropped); remap_page(folio, nr); /* @@ -3119,9 +3119,10 @@ static void __split_huge_page(struct page *page, struct list_head *list, for (i = 0; i < nr; i += new_nr) { struct page *subpage = head + i; + struct folio *new_folio = page_folio(subpage); if (subpage == page) continue; - 
unlock_page(subpage); + folio_unlock(new_folio); /* * Subpages may be freed if there wasn't any mapping -- Gitee From 881757bd81eef0ef90e34cdf2ad1a2fae61fe401 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:17 +0100 Subject: [PATCH 438/484] mm: remove references to page->index in huge_memory.c commit 544ec0ed376486fae387c023390add32e68b58dd upstream Conflicts: none Backport-reason: A good standlone cleanup to solve conflict We already have folios in all these places; it's just a matter of using them instead of the pages. Link: https://lkml.kernel.org/r/20241005200121.3231142-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ebdb5aefc0e8..9f9a52cd5601 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2979,8 +2979,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail, /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); - page_tail->mapping = head->mapping; - page_tail->index = head->index + tail; + new_folio->mapping = folio->mapping; + new_folio->index = folio->index + tail; /* * page->private should not be set in tail pages. Fix up and warn once @@ -3056,11 +3056,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { + struct folio *tail; __split_huge_page_tail(folio, i, lruvec, list, new_order); + tail = page_folio(head + i); /* Some pages can be beyond EOF: drop them from page cache */ - if (head[i].index >= end) { - struct folio *tail = page_folio(head + i); - + if (tail->index >= end) { if (shmem_mapping(folio->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) @@ -3068,12 +3068,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); folio_put(tail); - } else if (!PageAnon(page)) { - __xa_store(&folio->mapping->i_pages, head[i].index, - head + i, 0); + } else if (!folio_test_anon(folio)) { + __xa_store(&folio->mapping->i_pages, tail->index, + tail, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, - head + i, 0); + tail, 0); } } -- Gitee From 09752f7411fac67a732da1ab07cdb0328c3ac507 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 10 Mar 2025 11:57:27 -0400 Subject: [PATCH 439/484] mm/huge_memory: drop beyond-EOF folios with the right number of refs commit 14efb4793519d73fb2902bb0ece319b886e4b4b9 upstream Conflicts: none Backport-reason: THP & mm fixes When an after-split folio is large and needs to be dropped due to EOF, folio_put_refs(folio, folio_nr_pages(folio)) should be used to drop all page cache refs. Otherwise, the folio will not be freed, causing memory leak. This leak would happen on a filesystem with blocksize > page_size and a truncate is performed, where the blocksize makes folios split to >0 order ones, causing truncated folios not being freed. 
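For illustration, the refcount arithmetic behind the one-line change below (the order-2 folio and its 4 references are an assumed example, not numbers taken from the report):

    /*
     * Assumed example: 'tail' is a beyond-EOF folio of order 2 left over
     * from the split, so it still carries folio_nr_pages(tail) == 4
     * page cache references when it is dropped.
     *
     *	folio_put(tail);				// old: 4 -> 3, never freed, leak
     *	folio_put_refs(tail, folio_nr_pages(tail));	// new: 4 -> 0, folio is freed
     */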
Link: https://lkml.kernel.org/r/20250310155727.472846-1-ziy@nvidia.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Zi Yan Reported-by: Hugh Dickins Closes: https://lore.kernel.org/all/fcbadb7f-dd3e-21df-f9a7-2853b53183c4@google.com/ Cc: Baolin Wang Cc: David Hildenbrand Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. Shuemov Cc: Luis Chamberalin Cc: Matthew Wilcow (Oracle) Cc: Miaohe Lin Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9f9a52cd5601..0910e4493523 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3067,7 +3067,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); - folio_put(tail); + folio_put_refs(tail, folio_nr_pages(tail)); } else if (!folio_test_anon(folio)) { __xa_store(&folio->mapping->i_pages, tail->index, tail, 0); -- Gitee From 7ed7fc921e9509eed5c25fbb0aa5b489d863c3b2 Mon Sep 17 00:00:00 2001 From: Huan Yang Date: Mon, 26 Aug 2024 14:40:48 +0800 Subject: [PATCH 440/484] mm: page_alloc: simpify page del and expand commit 94deaf69dcd33462c61fa8cabb0883e3085a1046 upstream Conflicts: none Backport-reason: THP & mm fixes When page del from buddy and need expand, it will account free_pages in zone's migratetype. The current way is to subtract the page number of the current order when deleting, and then add it back when expanding. This is unnecessary, as when migrating the same type, we can directly record the difference between the high-order pages and the expand added, and then subtract it directly. This patch merge that, only when del and expand done, then account free_pages. 
Link: https://lkml.kernel.org/r/20240826064048.187790-1-link@vivo.com Signed-off-by: Huan Yang Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 225506e19165..1517243105b8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1342,11 +1342,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, * * -- nyc */ -static inline void expand(struct zone *zone, struct page *page, - int low, int high, int migratetype) +static inline unsigned int expand(struct zone *zone, struct page *page, int low, + int high, int migratetype) { - unsigned long size = 1 << high; - unsigned long nr_added = 0; + unsigned int size = 1 << high; + unsigned int nr_added = 0; while (high > low) { high--; @@ -1366,7 +1366,19 @@ static inline void expand(struct zone *zone, struct page *page, set_buddy_order(&page[size], high); nr_added += size; } - account_freepages(zone, nr_added, migratetype); + + return nr_added; +} + +static __always_inline void page_del_and_expand(struct zone *zone, + struct page *page, int low, + int high, int migratetype) +{ + int nr_pages = 1 << high; + + __del_page_from_free_list(page, zone, high, migratetype); + nr_pages -= expand(zone, page, low, high, migratetype); + account_freepages(zone, -nr_pages, migratetype); } static void check_new_page_bad(struct page *page) @@ -1534,8 +1546,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order, migratetype); - expand(zone, page, order, current_order, migratetype); + + page_del_and_expand(zone, page, order, current_order, + migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && migratetype < MIGRATE_PCPTYPES); @@ -1891,9 +1904,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page, /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { + unsigned int nr_added; + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); - expand(zone, page, order, current_order, start_type); + nr_added = expand(zone, page, order, current_order, start_type); + account_freepages(zone, nr_added, start_type); return page; } @@ -1946,8 +1962,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, } single_page: - del_page_from_free_list(page, zone, current_order, block_type); - expand(zone, page, order, current_order, block_type); + page_del_and_expand(zone, page, order, current_order, block_type); return page; } -- Gitee From f184bf6dd26a3efcaeedabb893703996e5e7bea5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 25 Apr 2024 20:56:04 -0700 Subject: [PATCH 441/484] mm: page_alloc: change move_freepages() to __move_freepages_block() commit e1f42a577f63647dadf1abe4583053c03d6be045 upstream Conflicts: none Backport-reason: THP & mm fixes The function is now supposed to be called only on a single pageblock and checks start_pfn and end_pfn accordingly. Rename it to make this more obvious and drop the end_pfn parameter which can be determined trivially and none of the callers use it for anything else. Also make the (now internal) end_pfn exclusive, which is more common. 
Link: https://lkml.kernel.org/r/81b1d642-2ec0-49f5-89fc-19a3828419ff@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Zi Yan Acked-by: Johannes Weiner Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1517243105b8..6deca5becc51 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1586,18 +1586,18 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Change the type of a block and move all its free pages to that * type's freelist. */ -static int move_freepages(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn, int old_mt, int new_mt) +static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, + int old_mt, int new_mt) { struct page *page; - unsigned long pfn; + unsigned long pfn, end_pfn; unsigned int order; int pages_moved = 0; VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); - VM_WARN_ON(start_pfn + pageblock_nr_pages - 1 != end_pfn); + end_pfn = pageblock_end_pfn(start_pfn); - for (pfn = start_pfn; pfn <= end_pfn;) { + for (pfn = start_pfn; pfn < end_pfn;) { page = pfn_to_page(pfn); if (!PageBuddy(page)) { pfn++; @@ -1623,14 +1623,13 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn, static bool prep_move_freepages_block(struct zone *zone, struct page *page, unsigned long *start_pfn, - unsigned long *end_pfn, int *num_free, int *num_movable) { unsigned long pfn, start, end; pfn = page_to_pfn(page); start = pageblock_start_pfn(pfn); - end = pageblock_end_pfn(pfn) - 1; + end = pageblock_end_pfn(pfn); /* * The caller only has the lock for @zone, don't touch ranges @@ -1641,16 +1640,15 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, */ if (!zone_spans_pfn(zone, start)) return false; - if (!zone_spans_pfn(zone, end)) + if (!zone_spans_pfn(zone, end - 1)) return false; *start_pfn = start; - *end_pfn = end; if (num_free) { *num_free = 0; *num_movable = 0; - for (pfn = start; pfn <= end;) { + for (pfn = start; pfn < end;) { page = pfn_to_page(pfn); if (PageBuddy(page)) { int nr = 1 << buddy_order(page); @@ -1676,13 +1674,12 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page, static int move_freepages_block(struct zone *zone, struct page *page, int old_mt, int new_mt) { - unsigned long start_pfn, end_pfn; + unsigned long start_pfn; - if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, - NULL, NULL)) + if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) return -1; - return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt); + return __move_freepages_block(zone, start_pfn, old_mt, new_mt); } #ifdef CONFIG_MEMORY_ISOLATION @@ -1755,10 +1752,9 @@ static void split_large_buddy(struct zone *zone, struct page *page, bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype) { - unsigned long start_pfn, end_pfn, pfn; + unsigned long start_pfn, pfn; - if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, - NULL, NULL)) + if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) return false; /* No splits needed if buddies can't span multiple blocks */ @@ -1789,8 +1785,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, return true; } move: - move_freepages(zone, start_pfn, end_pfn, - get_pfnblock_migratetype(page, start_pfn), 
migratetype); + __move_freepages_block(zone, start_pfn, + get_pfnblock_migratetype(page, start_pfn), + migratetype); return true; } #endif /* CONFIG_MEMORY_ISOLATION */ @@ -1890,7 +1887,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, unsigned int alloc_flags, bool whole_block) { int free_pages, movable_pages, alike_pages; - unsigned long start_pfn, end_pfn; + unsigned long start_pfn; int block_type; block_type = get_pageblock_migratetype(page); @@ -1926,8 +1923,8 @@ steal_suitable_fallback(struct zone *zone, struct page *page, goto single_page; /* moving whole block can fail due to zone boundary conditions */ - if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn, - &free_pages, &movable_pages)) + if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, + &movable_pages)) goto single_page; /* @@ -1957,7 +1954,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, */ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) { - move_freepages(zone, start_pfn, end_pfn, block_type, start_type); + __move_freepages_block(zone, start_pfn, block_type, start_type); return __rmqueue_smallest(zone, order, start_type); } -- Gitee From d3a16b62cd1d5449decc9c08a2d0499132962eb3 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 24 Sep 2025 17:48:30 +0800 Subject: [PATCH 442/484] mm, huge_memory: minor fix for HUGETEXT Upstream: no At least we should return supported orders, rather than a bool. Signed-off-by: Kairui Song --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0910e4493523..0cd639eec47f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -155,9 +155,9 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, #ifdef CONFIG_HUGETEXT /* Enable hugetext does not require THP settings */ if (hugetext_anon_enabled() && vma_is_hugetext_anon(vma, vm_flags)) - return true; + return orders; if (!in_pf && hugetext_vma_enabled(vma, vm_flags)) - return true; + return orders; #endif if (!vma_is_anonymous(vma)) { -- Gitee From 9b82b69ddc8545e00cf7c3547fd0b7a2bbed4449 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 22 Jul 2024 13:43:17 +0800 Subject: [PATCH 443/484] mm: shmem: simplify the suitable huge orders validation for tmpfs commit 0bedf001e359368badbdefe722162ed4dd33e296 upstream Conflicts: none Backport-reason: Shmem mTHP cleanup Patch series "Some cleanups for shmem", v3. This series does some cleanups to reuse code, rename functions and simplify logic to make code more clear. No functional changes are expected. This patch (of 3): Move the suitable huge orders validation into shmem_suitable_orders() for tmpfs, which can reuse some code to simplify the logic. In addition, we don't have special handling for the error code -E2BIG when checking for conflicts with PMD sized THP in the pagecache for tmpfs, instead, it will just fallback to order-0 allocations like this patch does, so this simplification will not add functional changes. 
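The resulting tmpfs path looks roughly like this (call-flow sketch for the case where no fault is involved, e.g. write(2), so vmf is NULL):

    /*
     *	shmem_alloc_and_add_folio(vmf = NULL, ...)
     *	  -> shmem_suitable_orders(inode, NULL, mapping, index, orders)
     *	       vma == NULL, so thp_vma_suitable_orders() is skipped and only
     *	       the xa_find() conflict check trims the allowable orders; on
     *	       conflict, lower orders are tried and we ultimately fall back
     *	       to order 0 instead of returning -E2BIG.
     */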
Link: https://lkml.kernel.org/r/cover.1721626645.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/965985dd6d322929d78a0beee0dafa1c2a1b81e2.1721626645.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Barry Song <21cnbao@gmail.com> Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3fd652d966d6..bcdc2c9f7cb3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1736,20 +1736,30 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault struct address_space *mapping, pgoff_t index, unsigned long orders) { - struct vm_area_struct *vma = vmf->vma; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; pgoff_t aligned_index; unsigned long pages; int order; - orders = thp_vma_suitable_orders(vma, vmf->address, orders); - if (!orders) - return 0; + if (vma) { + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + } /* Find the highest order that can add into the page cache */ order = highest_order(orders); while (orders) { pages = 1UL << order; aligned_index = round_down(index, pages); + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ if (!xa_find(&mapping->i_pages, &aligned_index, aligned_index + pages - 1, XA_PRESENT)) break; @@ -1794,7 +1804,6 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct vm_area_struct *vma = vmf ? vmf->vma : NULL; unsigned long suitable_orders = 0; struct folio *folio = NULL; long pages; @@ -1804,26 +1813,8 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, orders = 0; if (orders > 0) { - if (vma && vma_is_anon_shmem(vma)) { - suitable_orders = shmem_suitable_orders(inode, vmf, + suitable_orders = shmem_suitable_orders(inode, vmf, mapping, index, orders); - } else if (orders & BIT(HPAGE_PMD_ORDER)) { - pages = HPAGE_PMD_NR; - suitable_orders = BIT(HPAGE_PMD_ORDER); - index = round_down(index, HPAGE_PMD_NR); - - /* - * Check for conflict before waiting on a huge allocation. - * Conflict might be that a huge page has just been allocated - * and added to page cache by a racing thread, or that there - * is already at least one small page in the huge extent. - * Be careful to retry when appropriate, but not forever! - * Elsewhere -EEXIST would be the right code, but not here. 
- */ - if (xa_find(&mapping->i_pages, &index, - index + HPAGE_PMD_NR - 1, XA_PRESENT)) - return ERR_PTR(-E2BIG); - } order = highest_order(suitable_orders); while (suitable_orders) { -- Gitee From 940aea7fb59d01c1ce1f294c4d5a83aba9e9a5f8 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Fri, 5 Jul 2024 11:23:09 +0800 Subject: [PATCH 444/484] mm: thp: support "THPeligible" semantics for mTHP with anonymous shmem commit 26c7d8413aaf113a54b54f63e151416a5c5c2a88 upstream Conflicts: trivial, indention mismatch in an existing backport Backport-reason: Shmem mTHP cleanup After the commit 7fb1b252afb5 ("mm: shmem: add mTHP support for anonymous shmem"), we can configure different policies through the multi-size THP sysfs interface for anonymous shmem. But currently "THPeligible" indicates only whether the mapping is eligible for allocating THP-pages as well as the THP is PMD mappable or not for anonymous shmem, we need to support semantics for mTHP with anonymous shmem similar to those for mTHP with anonymous memory. Link: https://lkml.kernel.org/r/20240705032309.24933-1-libang.li@antgroup.com Signed-off-by: Bang Li Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shmem_fs.h | 9 +++++++++ mm/huge_memory.c | 13 +++++++++---- mm/shmem.c | 9 +-------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f0c6bf982832..41aa4e0d6dbc 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -117,12 +117,21 @@ int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge); #else static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags) { return false; } +static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + bool global_huge) +{ + return 0; +} #endif #ifdef CONFIG_SHMEM diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0cd639eec47f..aad258c851bd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -147,10 +147,15 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * Must be done before hugepage flags check since shmem has its * own flags. */ - if (!in_pf && shmem_file(vma->vm_file)) - return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, - !enforce_sysfs, vma->vm_mm, vm_flags) - ? orders : 0; + if (!in_pf && shmem_file(vma->vm_file)) { + bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, + !enforce_sysfs, vma->vm_mm, vm_flags); + + if (!vma_is_anon_shmem(vma)) + return global_huge ? 
orders : 0; + return shmem_allowable_huge_orders(file_inode(vma->vm_file), + vma, vma->vm_pgoff, global_huge); + } #ifdef CONFIG_HUGETEXT /* Enable hugetext does not require THP settings */ diff --git a/mm/shmem.c b/mm/shmem.c index bcdc2c9f7cb3..a067b9065b88 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1678,7 +1678,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static unsigned long shmem_allowable_huge_orders(struct inode *inode, +unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, bool global_huge) { @@ -1769,13 +1769,6 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault return orders; } #else -static unsigned long shmem_allowable_huge_orders(struct inode *inode, - struct vm_area_struct *vma, pgoff_t index, - bool global_huge) -{ - return 0; -} - static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) -- Gitee From e73263c15edf298ddc3d07285eabd23b05bd03e7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 22 Jul 2024 13:43:18 +0800 Subject: [PATCH 445/484] mm: shmem: rename shmem_is_huge() to shmem_huge_global_enabled() commit d58a2a581f132529eefac5377676011562b631b8 upstream Conflicts: none Backport-reason: Shmem mTHP cleanup shmem_is_huge() is now used to check if the top-level huge page is enabled, thus rename it to reflect its usage. Link: https://lkml.kernel.org/r/da53296e0ab6359aa083561d9dc01e4223d60fbe.1721626645.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Barry Song <21cnbao@gmail.com> Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shmem_fs.h | 9 +++++---- mm/huge_memory.c | 5 +++-- mm/shmem.c | 15 ++++++++------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 41aa4e0d6dbc..83a4fd53df8c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -115,14 +115,15 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags); +extern bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags); unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, bool global_huge); #else -static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) +static __always_inline bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + bool shmem_huge_force, struct mm_struct *mm, + unsigned long vm_flags) { return false; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aad258c851bd..1ac2eb5be5f3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -148,8 +148,9 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * own flags. 
*/ if (!in_pf && shmem_file(vma->vm_file)) { - bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, - !enforce_sysfs, vma->vm_mm, vm_flags); + bool global_huge = shmem_huge_global_enabled(file_inode(vma->vm_file), + vma->vm_pgoff, !enforce_sysfs, + vma->vm_mm, vm_flags); if (!vma_is_anon_shmem(vma)) return global_huge ? orders : 0; diff --git a/mm/shmem.c b/mm/shmem.c index a067b9065b88..1f7ede36422d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -556,9 +556,9 @@ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool __shmem_is_huge(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, - unsigned long vm_flags) +static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + bool shmem_huge_force, struct mm_struct *mm, + unsigned long vm_flags) { loff_t i_size; @@ -589,14 +589,15 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index, } } -bool shmem_is_huge(struct inode *inode, pgoff_t index, +bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags) { if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) return false; - return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags); + return __shmem_huge_global_enabled(inode, index, shmem_huge_force, + mm, vm_flags); } #if defined(CONFIG_SYSFS) @@ -1187,7 +1188,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_is_huge(inode, 0, false, NULL, 0)) + if (shmem_huge_global_enabled(inode, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -2463,7 +2464,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return 0; } - huge = shmem_is_huge(inode, index, false, fault_mm, + huge = shmem_huge_global_enabled(inode, index, false, fault_mm, vma ? vma->vm_flags : 0); /* Find hugepage orders that are allowed for anonymous shmem. */ if (vma && vma_is_anon_shmem(vma)) -- Gitee From 9cce51032260b3cbb482013f2879b5f9b370b1b6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 22 Jul 2024 13:43:19 +0800 Subject: [PATCH 446/484] mm: shmem: move shmem_huge_global_enabled() into shmem_allowable_huge_orders() commit 6beeab870e70b2d4f49baf6c6be9da1b61c169f8 upstream Conflicts: none Backport-reason: Shmem mTHP cleanup Move shmem_huge_global_enabled() into shmem_allowable_huge_orders(), so that shmem_allowable_huge_orders() can also help to find the allowable huge orders for tmpfs. Moreover the shmem_huge_global_enabled() can become static. While we are at it, passing the vma instead of mm for shmem_huge_global_enabled() makes code cleaner. No functional changes. 
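After the move, the THP-eligibility call flow becomes (sketch):

    /*
     *	__thp_vma_allowable_orders()				// mm/huge_memory.c
     *	  -> shmem_allowable_huge_orders(inode, vma, index, !enforce_sysfs)
     *	       -> shmem_huge_global_enabled(...)		// now static in mm/shmem.c
     *	       anon shmem:  per-size mTHP orders from the sysfs knobs
     *	       tmpfs:       BIT(HPAGE_PMD_ORDER) or 0, from the global knob
     */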
Link: https://lkml.kernel.org/r/8e825146bb29ee1a1c7bd64d2968ff3e19be7815.1721626645.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Barry Song <21cnbao@gmail.com> Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/shmem_fs.h | 12 ++-------- mm/huge_memory.c | 12 +++------- mm/shmem.c | 47 +++++++++++++++++++++++++--------------- 3 files changed, 35 insertions(+), 36 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 83a4fd53df8c..57e8a6689439 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -115,21 +115,13 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags); unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge); + bool shmem_huge_force); #else -static __always_inline bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, - unsigned long vm_flags) -{ - return false; -} static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge) + bool shmem_huge_force) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ac2eb5be5f3..6e897362d453 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -147,16 +147,10 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * Must be done before hugepage flags check since shmem has its * own flags. */ - if (!in_pf && shmem_file(vma->vm_file)) { - bool global_huge = shmem_huge_global_enabled(file_inode(vma->vm_file), - vma->vm_pgoff, !enforce_sysfs, - vma->vm_mm, vm_flags); - - if (!vma_is_anon_shmem(vma)) - return global_huge ? orders : 0; + if (!in_pf && shmem_file(vma->vm_file)) return shmem_allowable_huge_orders(file_inode(vma->vm_file), - vma, vma->vm_pgoff, global_huge); - } + vma, vma->vm_pgoff, + !enforce_sysfs); #ifdef CONFIG_HUGETEXT /* Enable hugetext does not require THP settings */ diff --git a/mm/shmem.c b/mm/shmem.c index 1f7ede36422d..4b78fb420269 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -557,9 +557,10 @@ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, + bool shmem_huge_force, struct vm_area_struct *vma, unsigned long vm_flags) { + struct mm_struct *mm = vma ? 
vma->vm_mm : NULL; loff_t i_size; if (!S_ISREG(inode->i_mode)) @@ -589,15 +590,15 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, } } -bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, +static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + bool shmem_huge_force, struct vm_area_struct *vma, unsigned long vm_flags) { if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) return false; return __shmem_huge_global_enabled(inode, index, shmem_huge_force, - mm, vm_flags); + vma, vm_flags); } #if defined(CONFIG_SYSFS) @@ -780,6 +781,13 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, { return 0; } + +static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + bool shmem_huge_force, struct vm_area_struct *vma, + unsigned long vm_flags) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void shmem_update_stats(struct folio *folio, int nr_pages) @@ -1681,22 +1689,33 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge) + bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); - unsigned long vm_flags = vma->vm_flags; + unsigned long vm_flags = vma ? vma->vm_flags : 0; + bool global_huge; loff_t i_size; int order; - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (vma && ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return 0; /* If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) return 0; + global_huge = shmem_huge_global_enabled(inode, index, shmem_huge_force, + vma, vm_flags); + if (!vma || !vma_is_anon_shmem(vma)) { + /* + * For tmpfs, we now only support PMD sized THP if huge page + * is enabled, otherwise fallback to order 0. + */ + return global_huge ? BIT(HPAGE_PMD_ORDER) : 0; + } + /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. @@ -2394,7 +2413,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct mm_struct *fault_mm; struct folio *folio; int error; - bool alloced, huge; + bool alloced; unsigned long orders = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) @@ -2464,14 +2483,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return 0; } - huge = shmem_huge_global_enabled(inode, index, false, fault_mm, - vma ? vma->vm_flags : 0); - /* Find hugepage orders that are allowed for anonymous shmem. */ - if (vma && vma_is_anon_shmem(vma)) - orders = shmem_allowable_huge_orders(inode, vma, index, huge); - else if (huge) - orders = BIT(HPAGE_PMD_ORDER); - + /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. 
*/ + orders = shmem_allowable_huge_orders(inode, vma, index, false); if (orders > 0) { gfp_t huge_gfp; -- Gitee From 51ddcd0fec7caa63327ddcc37950666516d104c4 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 19 Dec 2024 15:30:08 +0800 Subject: [PATCH 447/484] mm: shmem: fix incorrect index alignment for within_size policy commit d0e6983a6d1719738cf8d13982a68094f0a1872a upstream Conflicts: none Backport-reason: Shmem mTHP fixes With enabling the shmem per-size within_size policy, using an incorrect 'order' size to round_up() the index can lead to incorrect i_size checks, resulting in an inappropriate large orders being returned. Changing to use '1 << order' to round_up() the index to fix this issue. Additionally, adding an 'aligned_index' variable to avoid affecting the index checks. Link: https://lkml.kernel.org/r/77d8ef76a7d3d646e9225e9af88a76549a68aab1.1734593154.git.baolin.wang@linux.alibaba.com Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4b78fb420269..16553bbcbe7e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1694,6 +1694,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; + pgoff_t aligned_index; bool global_huge; loff_t i_size; int order; @@ -1733,9 +1734,9 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, /* Allow mTHP that will be fully within i_size. */ order = highest_order(within_size_orders); while (within_size_orders) { - index = round_up(index + 1, order); + aligned_index = round_up(index + 1, 1 << order); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) { + if (i_size >> PAGE_SHIFT >= aligned_index) { mask |= within_size_orders; break; } -- Gitee From 744936eb361241d6306dda2462249e9510f13418 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 13 Jun 2025 17:12:19 +0800 Subject: [PATCH 448/484] mm: huge_memory: fix the check for allowed huge orders in shmem commit fa493f50df3adaddf9707d14b2f972ddd388f452 upstream Conflicts: trivial, due to missing e1e4cfd01a6e Backport-reason: Shmem mTHP fixes Shmem already supports mTHP, and shmem_allowable_huge_orders() will return the huge orders allowed by shmem. However, there is no check against the 'orders' parameter passed by __thp_vma_allowable_orders(), which can lead to incorrect check results for __thp_vma_allowable_orders(). For example, when a user wants to check if shmem supports PMD-sized THP by thp_vma_allowable_order(), if shmem only enables 64K mTHP, the current logic would cause thp_vma_allowable_order() to return true, implying that shmem allows PMD-sized THP allocation, which it actually does not. I don't think this will cause a significant impact on users, and this will only have some impact on the shmem THP collapse. That is to say, even though the shmem sysfs setting does not enable the PMD-sized THP, the thp_vma_allowable_order() still indicates that shmem allows PMD-sized collapse, meaning it might successfully collapse into THP, or it might not (for example, thp_vma_suitable_order() check failed in the collapse process). 
However, this still does not align with the shmem sysfs configuration, fix it. Link: https://lkml.kernel.org/r/529affb3220153d0d5a542960b535cdfc33f51d7.1749804835.git.baolin.wang@linux.alibaba.com Fixes: 26c7d8413aaf ("mm: thp: support "THPeligible" semantics for mTHP with anonymous shmem") Signed-off-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Acked-by: Zi Yan Acked-by: David Hildenbrand Cc: Barry Song Cc: Dev Jain Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6e897362d453..9f029e1d32fd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -148,7 +148,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_allowable_huge_orders(file_inode(vma->vm_file), + return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, !enforce_sysfs); -- Gitee From 2687e9d292aee2d446f73731229d9ee0f9e0bc63 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 19 Dec 2024 15:30:09 +0800 Subject: [PATCH 449/484] mm: shmem: fix the update of 'shmem_falloc->nr_unswapped' commit d77b90d2b2642655b5f60953c36ad887257e1802 upstream Conflicts: none Backport-reason: Shmem mTHP fixes The 'shmem_falloc->nr_unswapped' is used to record how many writepage refused to swap out because fallocate() is allocating, but after shmem supports large folio swap out, the update of 'shmem_falloc->nr_unswapped' does not use the correct number of pages in the large folio, which may lead to fallocate() not exiting as soon as possible. Anyway, this is found through code inspection, and I am not sure whether it would actually cause serious issues. Link: https://lkml.kernel.org/r/f66a0119d0564c2c37c84f045835b870d1b2196f.1734593154.git.baolin.wang@linux.alibaba.com Fixes: 809bc86517cc ("mm: shmem: support large folio swap out") Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 16553bbcbe7e..8080a0abc793 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1565,7 +1565,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); -- Gitee From bfc43218c3ee312e77ea5afccb4c158a9281706d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 22 Feb 2024 21:11:34 +1300 Subject: [PATCH 450/484] crypto: introduce: acomp_is_async to expose if comp drivers might sleep commit 6c303f1af356f5f6847146ab0199dc3fe61d8f46 upstream Conflicts: none Backport-reason: ZSWAP crypto update acomp's users might want to know if acomp is really async to optimize themselves. One typical user which can benefit from exposed async stat is zswap. In zswap, zsmalloc is the most commonly used allocator for (and perhaps the only one). For zsmalloc, we cannot sleep while we map the compressed memory, so we copy it to a temporary buffer. By knowing the alg won't sleep can help zswap to avoid the need for a buffer. This shows noticeable improvement in load/store latency of zswap. 
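A hypothetical caller-side use of the new helper, mirroring what zswap does in the following patch (the "lzo" algorithm name is only an example):

    bool sleepable;
    struct crypto_acomp *tfm = crypto_alloc_acomp("lzo", 0, 0);

    if (!IS_ERR(tfm)) {
    	/*
    	 * false for CPU-based (scomp-backed) algorithms: the caller may
    	 * then skip copying data into a temporary sleep-safe buffer.
    	 */
    	sleepable = acomp_is_async(tfm);
    	crypto_free_acomp(tfm);
    }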
Link: https://lkml.kernel.org/r/20240222081135.173040-2-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: Herbert Xu Acked-by: Chris Li Cc: Chengming Zhou Cc: Dan Streetman Cc: David S. Miller Cc: Johannes Weiner Cc: Nhat Pham Cc: Seth Jennings Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/crypto/acompress.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h index 574cffc90730..80e243611fe2 100644 --- a/include/crypto/acompress.h +++ b/include/crypto/acompress.h @@ -160,6 +160,12 @@ static inline void acomp_request_set_tfm(struct acomp_req *req, req->base.tfm = crypto_acomp_tfm(tfm); } +static inline bool acomp_is_async(struct crypto_acomp *tfm) +{ + return crypto_comp_alg_common(tfm)->base.cra_flags & + CRYPTO_ALG_ASYNC; +} + static inline struct crypto_acomp *crypto_acomp_reqtfm(struct acomp_req *req) { return __crypto_acomp_tfm(req->base.tfm); -- Gitee From 818cfb02f090b017e085645c180e7a51022dd0af Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 22 Feb 2024 21:11:35 +1300 Subject: [PATCH 451/484] mm/zswap: remove the memcpy if acomp is not sleepable commit 270700dd06ca41a4779c19eb46608f076bb7d40e upstream Conflicts: none Backport-reason: ZSWAP crypto update Most compressors are actually CPU-based and won't sleep during compression and decompression. We should remove the redundant memcpy for them. This patch checks if the algorithm is sleepable by testing the CRYPTO_ALG_ASYNC algorithm flag. Generally speaking, async and sleepable are semantically similar but not equal. But for compress drivers, they are basically equal at least due to the below facts. Firstly, scompress drivers - crypto/deflate.c, lz4.c, zstd.c, lzo.c etc have no sleep. Secondly, zRAM has been using these scompress drivers for years in atomic contexts, and never worried those drivers going to sleep. One exception is that an async driver can sometimes still return synchronously per Herbert's clarification. In this case, we are still having a redundant memcpy. But we can't know if one particular acomp request will sleep or not unless crypto can expose more details for each specific request from offload drivers. Link: https://lkml.kernel.org/r/20240222081135.173040-3-21cnbao@gmail.com Signed-off-by: Barry Song Tested-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Chris Li Acked-by: Johannes Weiner Cc: Dan Streetman Cc: David S. 
Miller Cc: Herbert Xu Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5fbea06912be..2c65caa4acdf 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -149,6 +149,7 @@ struct crypto_acomp_ctx { struct crypto_wait wait; u8 *buffer; struct mutex mutex; + bool is_sleepable; }; /* @@ -841,6 +842,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) goto acomp_fail; } acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); req = acomp_request_alloc(acomp_ctx->acomp); if (!req) { @@ -996,7 +998,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { + if (acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); @@ -1009,7 +1011,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (zpool_can_sleep_mapped(zpool)) + if (!acomp_ctx->is_sleepable || zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); acomp_ctx_put_unlock(acomp_ctx); -- Gitee From 2a3e510d0539383cbf2121390598aed1051525d8 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 19 Mar 2024 12:47:06 +1300 Subject: [PATCH 452/484] mm: zswap: fix kernel BUG in sg_init_one commit 9c500835f279e636722bbcafdfe62cc0153ec292 upstream Conflicts: none Backport-reason: ZSWAP Fixes sg_init_one() relies on linearly mapped low memory for the safe utilization of virt_to_page(). Otherwise, we trigger a kernel BUG, kernel BUG at include/linux/scatterlist.h:187! 
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM Modules linked in: CPU: 0 PID: 2997 Comm: syz-executor198 Not tainted 6.8.0-syzkaller #0 Hardware name: ARM-Versatile Express PC is at sg_set_buf include/linux/scatterlist.h:187 [inline] PC is at sg_init_one+0x9c/0xa8 lib/scatterlist.c:143 LR is at sg_init_table+0x2c/0x40 lib/scatterlist.c:128 Backtrace: [<807e16ac>] (sg_init_one) from [<804c1824>] (zswap_decompress+0xbc/0x208 mm/zswap.c:1089) r7:83471c80 r6:def6d08c r5:844847d0 r4:ff7e7ef4 [<804c1768>] (zswap_decompress) from [<804c4468>] (zswap_load+0x15c/0x198 mm/zswap.c:1637) r9:8446eb80 r8:8446eb80 r7:8446eb84 r6:def6d08c r5:00000001 r4:844847d0 [<804c430c>] (zswap_load) from [<804b9644>] (swap_read_folio+0xa8/0x498 mm/page_io.c:518) r9:844ac800 r8:835e6c00 r7:00000000 r6:df955d4c r5:00000001 r4:def6d08c [<804b959c>] (swap_read_folio) from [<804bb064>] (swap_cluster_readahead+0x1c4/0x34c mm/swap_state.c:684) r10:00000000 r9:00000007 r8:df955d4b r7:00000000 r6:00000000 r5:00100cca r4:00000001 [<804baea0>] (swap_cluster_readahead) from [<804bb3b8>] (swapin_readahead+0x68/0x4a8 mm/swap_state.c:904) r10:df955eb8 r9:00000000 r8:00100cca r7:84476480 r6:00000001 r5:00000000 r4:00000001 [<804bb350>] (swapin_readahead) from [<8047cde0>] (do_swap_page+0x200/0xcc4 mm/memory.c:4046) r10:00000040 r9:00000000 r8:844ac800 r7:84476480 r6:00000001 r5:00000000 r4:df955eb8 [<8047cbe0>] (do_swap_page) from [<8047e6c4>] (handle_pte_fault mm/memory.c:5301 [inline]) [<8047cbe0>] (do_swap_page) from [<8047e6c4>] (__handle_mm_fault mm/memory.c:5439 [inline]) [<8047cbe0>] (do_swap_page) from [<8047e6c4>] (handle_mm_fault+0x3d8/0x12b8 mm/memory.c:5604) r10:00000040 r9:842b3900 r8:7eb0d000 r7:84476480 r6:7eb0d000 r5:835e6c00 r4:00000254 [<8047e2ec>] (handle_mm_fault) from [<80215d28>] (do_page_fault+0x148/0x3a8 arch/arm/mm/fault.c:326) r10:00000007 r9:842b3900 r8:7eb0d000 r7:00000207 r6:00000254 r5:7eb0d9b4 r4:df955fb0 [<80215be0>] (do_page_fault) from [<80216170>] (do_DataAbort+0x38/0xa8 arch/arm/mm/fault.c:558) r10:7eb0da7c r9:00000000 r8:80215be0 r7:df955fb0 r6:7eb0d9b4 r5:00000207 r4:8261d0e0 [<80216138>] (do_DataAbort) from [<80200e3c>] (__dabt_usr+0x5c/0x60 arch/arm/kernel/entry-armv.S:427) Exception stack(0xdf955fb0 to 0xdf955ff8) 5fa0: 00000000 00000000 22d5f800 0008d158 5fc0: 00000000 7eb0d9a4 00000000 00000109 00000000 00000000 7eb0da7c 7eb0da3c 5fe0: 00000000 7eb0d9a0 00000001 00066bd4 00000010 ffffffff r8:824a9044 r7:835e6c00 r6:ffffffff r5:00000010 r4:00066bd4 Code: 1a000004 e1822003 e8860094 e89da8f0 (e7f001f2) ---[ end trace 0000000000000000 ]--- ---------------- Code disassembly (best guess): 0: 1a000004 bne 0x18 4: e1822003 orr r2, r2, r3 8: e8860094 stm r6, {r2, r4, r7} c: e89da8f0 ldm sp, {r4, r5, r6, r7, fp, sp, pc} * 10: e7f001f2 udf #18 <-- trapping instruction Consequently, we have two choices: either employ kmap_to_page() alongside sg_set_page(), or resort to copying high memory contents to a temporary buffer residing in low memory. However, considering the introduction of the WARN_ON_ONCE in commit ef6e06b2ef870 ("highmem: fix kmap_to_page() for kmap_local_page() addresses"), which specifically addresses high memory concerns, it appears that memcpy remains the sole viable option. 
Link: https://lkml.kernel.org/r/20240318234706.95347-1-21cnbao@gmail.com Fixes: 270700dd06ca ("mm/zswap: remove the memcpy if acomp is not sleepable") Signed-off-by: Barry Song Reported-by: syzbot+adbc983a1588b7805de3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000bbb3d80613f243a6@google.com/ Tested-by: syzbot+adbc983a1588b7805de3@syzkaller.appspotmail.com Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: Chris Li Cc: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 2c65caa4acdf..60b1389983ae 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -998,7 +998,17 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) { + /* + * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer + * to do crypto_acomp_decompress() which might sleep. In such cases, we must + * resort to copying the buffer to a temporary one. + * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer, + * such as a kmap address of high memory or even ever a vmap address. + * However, sg_init_one is only equipped to handle linearly mapped low memory. + * In such cases, we also must copy the buffer to a temporary and lowmem one. + */ + if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) || + !virt_addr_valid(src)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); @@ -1011,7 +1021,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (!acomp_ctx->is_sleepable || zpool_can_sleep_mapped(zpool)) + if (src != acomp_ctx->buffer) zpool_unmap_handle(zpool, entry->handle); acomp_ctx_put_unlock(acomp_ctx); -- Gitee From dc2533df18fabe3e477cd949501fc7c6091da2d0 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Mon, 13 Jan 2025 21:44:58 +0000 Subject: [PATCH 453/484] mm: zswap: move allocations during CPU init outside the lock commit 779b9955f64327c339a16f68055af98252fd3315 upstream Conflicts: none Backport-reason: ZSWAP Fixes In zswap_cpu_comp_prepare(), allocations are made and assigned to various members of acomp_ctx under acomp_ctx->mutex. However, allocations may recurse into zswap through reclaim, trying to acquire the same mutex and deadlocking. Move the allocations before the mutex critical section. Only the initialization of acomp_ctx needs to be done with the mutex held. 
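The recursion being avoided, as a sketch of the call chain (the reclaim-path names are shown only for illustration):

    /*
     *	zswap_cpu_comp_prepare()
     *	  mutex_lock(&acomp_ctx->mutex)
     *	  kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu))	// may enter direct reclaim
     *	    -> swap writeback -> zswap_store()
     *	         -> mutex_lock(&acomp_ctx->mutex)	// same per-CPU ctx: deadlock
     *
     * Hence: perform all allocations first, then take the mutex only to
     * publish the buffer/acomp/req pointers into acomp_ctx.
     */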
Link: https://lkml.kernel.org/r/20250113214458.2123410-1-yosryahmed@google.com Fixes: 12dcb0ef5406 ("mm: zswap: properly synchronize freeing resources during CPU hotunplug") Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Johannes Weiner Cc: Nhat Pham Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 60b1389983ae..35a196d7772e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -823,15 +823,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; + struct crypto_acomp *acomp = NULL; + struct acomp_req *req = NULL; + u8 *buffer = NULL; int ret; - mutex_lock(&acomp_ctx->mutex); - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) { + buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!buffer) { ret = -ENOMEM; - goto buffer_fail; + goto fail; } acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); @@ -839,21 +839,25 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); ret = PTR_ERR(acomp); - goto acomp_fail; + goto fail; } - acomp_ctx->acomp = acomp; - acomp_ctx->is_sleepable = acomp_is_async(acomp); - req = acomp_request_alloc(acomp_ctx->acomp); + req = acomp_request_alloc(acomp); if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); ret = -ENOMEM; - goto req_fail; + goto fail; } - acomp_ctx->req = req; + /* + * Only hold the mutex after completing allocations, otherwise we may + * recurse into zswap through reclaim and attempt to hold the mutex + * again resulting in a deadlock. + */ + mutex_lock(&acomp_ctx->mutex); crypto_init_wait(&acomp_ctx->wait); + /* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback @@ -862,15 +866,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); + acomp_ctx->buffer = buffer; + acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); + acomp_ctx->req = req; mutex_unlock(&acomp_ctx->mutex); return 0; -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); -buffer_fail: - mutex_unlock(&acomp_ctx->mutex); +fail: + if (acomp) + crypto_free_acomp(acomp); + kfree(buffer); return ret; } -- Gitee From ff4cd578592300304eb8067cab66b0e639482256 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 23 Jan 2025 10:10:29 +0800 Subject: [PATCH 454/484] mm/compaction: fix UBSAN shift-out-of-bounds warning commit d1366e74342e75555af2648a2964deb2d5c92200 upstream Conflicts: none Backport-reason: THP & mm fixes syzkaller reported a UBSAN shift-out-of-bounds warning of (1UL << order) in isolate_freepages_block(). The bogus compound_order can be any value because it is union with flags. Add back the MAX_PAGE_ORDER check to fix the warning. 
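Roughly what goes wrong without the check (the value 90 is an assumed example of garbage read from the flags union):

    /*
     *	order = compound_order(page);			// bogus on a racing page, e.g. 90
     *	if (blockpfn + (1UL << order) <= end_pfn)	// 1UL << 90: UBSAN shift-out-of-bounds
     *
     * With the fix, (order <= MAX_PAGE_ORDER) is evaluated first, so the
     * shift is short-circuited away for such bogus values.
     */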
Link: https://lkml.kernel.org/r/20250123021029.2826736-1-liushixin2@huawei.com Fixes: 3da0272a4c7d ("mm/compaction: correctly return failure with bogus compound_order in strict mode") Signed-off-by: Liu Shixin Reviewed-by: Kemeng Shi Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Baolin Wang Cc: David Hildenbrand Cc: Kemeng Shi Cc: Matthew Wilcox Cc: Mel Gorman Cc: Nanyong Sun Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/compaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index a8b118a89a62..ef5efdc893c4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -649,7 +649,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (blockpfn + (1UL << order) <= end_pfn) { + if ((order <= MAX_PAGE_ORDER) && + (blockpfn + (1UL << order) <= end_pfn)) { blockpfn += (1UL << order) - 1; page += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; -- Gitee From 06284276f17a725d7c49f077f9c175d6a05dcc1a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 11 Feb 2025 18:13:39 +0800 Subject: [PATCH 455/484] mm/mmu_gather: remove unused __tlb_remove_page() commit 1d23b9403aedea2d8eb4175e8eeb05fcd2967219 upstream Conflicts: none Backport-reason: mm cleanup Nobody is using __tlb_remove_page() now, clean it up. And also remove the code comment above tlb_remove_page() because it's not meaningful any more. Link: https://lkml.kernel.org/r/Z6si0/A/zzEF/bFJ@MiWiFi-R3L-srv Signed-off-by: Baoquan He Reviewed-by: Qi Zheng Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/asm-generic/tlb.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 709830274b75..5976e878be4a 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -473,16 +473,6 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, tlb_flush_mmu(tlb); } -static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, - struct page *page, bool delay_rmap) -{ - return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); -} - -/* tlb_remove_page - * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when - * required. - */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); -- Gitee From 20db2cae20bde140c98d2971abd434c8d10f778b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 11 Feb 2025 11:43:48 +0800 Subject: [PATCH 456/484] mm/mmu_gather: clean up the stale code comment commit 7bd1fa0d5624e78628ad7ce70d7e69082c34c634 upstream Conflicts: none Backport-reason: mm cleanup In commit d7f861b9c43a ("mm/mmu_gather: add __tlb_remove_folio_pages()"), helper function __tlb_remove_folio_pages_size() was added. And based on the helper, wrapper functions __tlb_remove_folio_pages() and __tlb_remove_page_size() are created and used by upper level functions. So let's update the code comment to reflect the current code about tlb_remove_page()/tlb_remove_page_size(), etc. 
Link: https://lkml.kernel.org/r/20250211034348.39531-2-bhe@redhat.com Signed-off-by: Baoquan He Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Qi Zheng Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/asm-generic/tlb.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 5976e878be4a..91a9c362e57b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -67,22 +67,21 @@ * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * - * - tlb_remove_page() / __tlb_remove_page() - * - tlb_remove_page_size() / __tlb_remove_page_size() - * - __tlb_remove_folio_pages() + * - tlb_remove_page() / tlb_remove_page_size() + * - __tlb_remove_folio_pages() / __tlb_remove_page_size() + * - __tlb_remove_folio_pages_size() * - * __tlb_remove_page_size() is the basic primitive that queues a page for - * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a - * boolean indicating if the queue is (now) full and a call to - * tlb_flush_mmu() is required. + * __tlb_remove_folio_pages_size() is the basic primitive that queues pages + * for freeing. It will return a boolean indicating if the queue is (now) + * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * - * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however, - * instead of removing a single page, remove the given number of consecutive - * pages that are all part of the same (large) folio: just like calling - * __tlb_remove_page() on each page individually. + * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), + * however, instead of removing a single page, assume PAGE_SIZE and remove + * the given number of consecutive pages that are all part of the + * same (large) folio. * * - tlb_change_page_size() * -- Gitee From 84052a5860d18104fc3206d51b0d62d9c4217899 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 26 Feb 2025 18:56:25 +0000 Subject: [PATCH 457/484] mm: zswap: fix crypto_free_acomp() deadlock in zswap_cpu_comp_dead() commit c11bcbc0a517acf69282c8225059b2a8ac5fe628 upstream Conflicts: none Backport-reason: ZSWAP Fixes Currently, zswap_cpu_comp_dead() calls crypto_free_acomp() while holding the per-CPU acomp_ctx mutex. crypto_free_acomp() then holds scomp_lock (through crypto_exit_scomp_ops_async()). On the other hand, crypto_alloc_acomp_node() holds the scomp_lock (through crypto_scomp_init_tfm()), and then allocates memory. If the allocation results in reclaim, we may attempt to hold the per-CPU acomp_ctx mutex. The above dependencies can cause an ABBA deadlock. For example in the following scenario: (1) Task A running on CPU #1: crypto_alloc_acomp_node() Holds scomp_lock Enters reclaim Reads per_cpu_ptr(pool->acomp_ctx, 1) (2) Task A is descheduled (3) CPU #1 goes offline zswap_cpu_comp_dead(CPU #1) Holds per_cpu_ptr(pool->acomp_ctx, 1)) Calls crypto_free_acomp() Waits for scomp_lock (4) Task A running on CPU #2: Waits for per_cpu_ptr(pool->acomp_ctx, 1) // Read on CPU #1 DEADLOCK Since there is no requirement to call crypto_free_acomp() with the per-CPU acomp_ctx mutex held in zswap_cpu_comp_dead(), move it after the mutex is unlocked. Also move the acomp_request_free() and kfree() calls for consistency and to avoid any potential sublte locking dependencies in the future. 
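In outline, the teardown ordering then looks like this (a simplified sketch of the hunk below; the error-pointer checks and return value are omitted):

	mutex_lock(&acomp_ctx->mutex);
	req = acomp_ctx->req;		/* snapshot the resources under the lock */
	acomp = acomp_ctx->acomp;
	buffer = acomp_ctx->buffer;
	acomp_ctx->req = NULL;
	acomp_ctx->acomp = NULL;
	acomp_ctx->buffer = NULL;
	mutex_unlock(&acomp_ctx->mutex);

	/* free only after unlocking, so scomp_lock never nests inside the mutex */
	acomp_request_free(req);
	crypto_free_acomp(acomp);
	kfree(buffer);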
With this, only setting acomp_ctx fields to NULL occurs with the mutex held. This is similar to how zswap_cpu_comp_prepare() only initializes acomp_ctx fields with the mutex held, after performing all allocations before holding the mutex. Opportunistically, move the NULL check on acomp_ctx so that it takes place before the mutex dereference. Link: https://lkml.kernel.org/r/20250226185625.2672936-1-yosry.ahmed@linux.dev Fixes: 12dcb0ef5406 ("mm: zswap: properly synchronize freeing resources during CPU hotunplug") Signed-off-by: Herbert Xu Co-developed-by: Herbert Xu Signed-off-by: Yosry Ahmed Reported-by: syzbot+1a517ccfcbc6a7ab0f82@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67bcea51.050a0220.bbfd1.0096.GAE@google.com/ Acked-by: Herbert Xu Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Tested-by: Nhat Pham Cc: David S. Miller Cc: Eric Biggers Cc: Johannes Weiner Cc: Chris Murphy Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/zswap.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 35a196d7772e..b1161b6e929d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -884,18 +884,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct acomp_req *req; + struct crypto_acomp *acomp; + u8 *buffer; + + if (IS_ERR_OR_NULL(acomp_ctx)) + return 0; mutex_lock(&acomp_ctx->mutex); - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - acomp_ctx->req = NULL; - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } + req = acomp_ctx->req; + acomp = acomp_ctx->acomp; + buffer = acomp_ctx->buffer; + acomp_ctx->req = NULL; + acomp_ctx->acomp = NULL; + acomp_ctx->buffer = NULL; mutex_unlock(&acomp_ctx->mutex); + /* + * Do the actual freeing after releasing the mutex to avoid subtle + * locking dependencies causing deadlocks. + */ + if (!IS_ERR_OR_NULL(req)) + acomp_request_free(req); + if (!IS_ERR_OR_NULL(acomp)) + crypto_free_acomp(acomp); + kfree(buffer); + return 0; } -- Gitee From 107f3cd1f8a24c5b024eac2cebc742074f166f6f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 Feb 2025 19:08:24 -0500 Subject: [PATCH 458/484] mm: page_alloc: don't steal single pages from biggest buddy commit c2f6ea38fc1b640aa7a2e155cc1c0410ff91afa2 upstream Conflicts: none Backport-reason: THP & mm fixes The fallback code searches for the biggest buddy first in an attempt to steal the whole block and encourage type grouping down the line. The approach used to be this: - Non-movable requests will split the largest buddy and steal the remainder. This splits up contiguity, but it allows subsequent requests of this type to fall back into adjacent space. - Movable requests go and look for the smallest buddy instead. The thinking is that movable requests can be compacted, so grouping is less important than retaining contiguity. c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion") enforces freelist type hygiene, which restricts stealing to either claiming the whole block or just taking the requested chunk; no additional pages or buddy remainders can be stolen any more. The patch mishandled when to switch to finding the smallest buddy in that new reality. 
As a result, it may steal the exact request size, but from the biggest buddy. This causes fracturing for no good reason. Fix this by committing to the new behavior: either steal the whole block, or fall back to the smallest buddy. Remove single-page stealing from steal_suitable_fallback(). Rename it to try_to_steal_block() to make the intentions clear. If this fails, always fall back to the smallest buddy. The following is from 4 runs of mmtest's thpchallenge. "Pollute" is single page fallback, "steal" is conversion of a partially used block. The numbers for free block conversions (omitted) are comparable. vanilla patched @pollute[unmovable from reclaimable]: 27 106 @pollute[unmovable from movable]: 82 46 @pollute[reclaimable from unmovable]: 256 83 @pollute[reclaimable from movable]: 46 8 @pollute[movable from unmovable]: 4841 868 @pollute[movable from reclaimable]: 5278 12568 @steal[unmovable from reclaimable]: 11 12 @steal[unmovable from movable]: 113 49 @steal[reclaimable from unmovable]: 19 34 @steal[reclaimable from movable]: 47 21 @steal[movable from unmovable]: 250 183 @steal[movable from reclaimable]: 81 93 The allocator appears to do a better job at keeping stealing and polluting to the first fallback preference. As a result, the numbers for "from movable" - the least preferred fallback option, and most detrimental to compactability - are down across the board. Link: https://lkml.kernel.org/r/20250225001023.1494422-2-hannes@cmpxchg.org Fixes: c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion") Signed-off-by: Johannes Weiner Suggested-by: Vlastimil Babka Reviewed-by: Brendan Jackman Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 80 +++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6deca5becc51..310300a07910 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1878,13 +1878,12 @@ static inline bool boost_watermark(struct zone *zone) * can claim the whole pageblock for the requested migratetype. If not, we check * the pageblock for constituent pages; if at least half of the pages are free * or compatible, we can still claim the whole block, so pages freed in the - * future will be put on the correct free list. Otherwise, we isolate exactly - * the order we need from the fallback block and leave its migratetype alone. + * future will be put on the correct free list. */ static struct page * -steal_suitable_fallback(struct zone *zone, struct page *page, - int current_order, int order, int start_type, - unsigned int alloc_flags, bool whole_block) +try_to_steal_block(struct zone *zone, struct page *page, + int current_order, int order, int start_type, + unsigned int alloc_flags) { int free_pages, movable_pages, alike_pages; unsigned long start_pfn; @@ -1897,7 +1896,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, * highatomic accounting. 
*/ if (is_migrate_highatomic(block_type)) - goto single_page; + return NULL; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { @@ -1918,14 +1917,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page, if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); - /* We are not allowed to try stealing from the whole block */ - if (!whole_block) - goto single_page; - /* moving whole block can fail due to zone boundary conditions */ if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages, &movable_pages)) - goto single_page; + return NULL; /* * Determine how many pages are compatible with our allocation. @@ -1958,9 +1953,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, return __rmqueue_smallest(zone, order, start_type); } -single_page: - page_del_and_expand(zone, page, order, current_order, block_type); - return page; + return NULL; } /* @@ -2138,14 +2131,19 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, } /* - * Try finding a free buddy page on the fallback list and put it on the free - * list of requested migratetype, possibly along with other pages from the same - * block, depending on fragmentation avoidance heuristics. Returns true if - * fallback was found so that __rmqueue_smallest() can grab it. + * Try finding a free buddy page on the fallback list. + * + * This will attempt to steal a whole pageblock for the requested type + * to ensure grouping of such requests in the future. + * + * If a whole block cannot be stolen, regress to __rmqueue_smallest() + * logic to at least break up as little contiguity as possible. * * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. + * + * Return the stolen page, or NULL if none can be found. */ static __always_inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, @@ -2179,45 +2177,35 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, if (fallback_mt == -1) continue; - /* - * We cannot steal all free pages from the pageblock and the - * requested migratetype is movable. In that case it's better to - * steal and split the smallest available page instead of the - * largest available page, because even if the next movable - * allocation falls back into a different pageblock than this - * one, it won't cause permanent fragmentation. - */ - if (!can_steal && start_migratetype == MIGRATE_MOVABLE - && current_order > order) - goto find_smallest; + if (!can_steal) + break; - goto do_steal; + page = get_page_from_free_area(area, fallback_mt); + page = try_to_steal_block(zone, page, current_order, order, + start_migratetype, alloc_flags); + if (page) + goto got_one; } - return NULL; + if (alloc_flags & ALLOC_NOFRAGMENT) + return NULL; -find_smallest: + /* No luck stealing blocks. Find the smallest fallback page */ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, start_migratetype, false, &can_steal); - if (fallback_mt != -1) - break; - } - - /* - * This should not happen - we already found a suitable fallback - * when looking for the largest page. 
- */ - VM_BUG_ON(current_order > MAX_PAGE_ORDER); + if (fallback_mt == -1) + continue; -do_steal: - page = get_page_from_free_area(area, fallback_mt); + page = get_page_from_free_area(area, fallback_mt); + page_del_and_expand(zone, page, order, current_order, fallback_mt); + goto got_one; + } - /* take off list, maybe claim block, expand remainder */ - page = steal_suitable_fallback(zone, page, current_order, order, - start_migratetype, alloc_flags, can_steal); + return NULL; +got_one: trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); -- Gitee From 4c0b869973d11c1eabea560f1089b9880a0ad541 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 Feb 2025 19:08:25 -0500 Subject: [PATCH 459/484] mm: page_alloc: remove remnants of unlocked migratetype updates commit 020396a581dc69be2d30939fabde6c029d847034 upstream Conflicts: none Backport-reason: mm & mTHP fixes The freelist hygiene patches made migratetype accesses fully protected under the zone->lock. Remove remnants of handling the race conditions that existed before from the MIGRATE_HIGHATOMIC code. Link: https://lkml.kernel.org/r/20250225001023.1494422-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Brendan Jackman Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 50 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 310300a07910..913ea7ab163d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1883,20 +1883,10 @@ static inline bool boost_watermark(struct zone *zone) static struct page * try_to_steal_block(struct zone *zone, struct page *page, int current_order, int order, int start_type, - unsigned int alloc_flags) + int block_type, unsigned int alloc_flags) { int free_pages, movable_pages, alike_pages; unsigned long start_pfn; - int block_type; - - block_type = get_pageblock_migratetype(page); - - /* - * This can happen due to races and we want to prevent broken - * highatomic accounting. - */ - if (is_migrate_highatomic(block_type)) - return NULL; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { @@ -2067,33 +2057,22 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); - int mt; + unsigned long size; page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; - mt = get_pageblock_migratetype(page); /* - * In page freeing path, migratetype change is racy so - * we can counter several free pages in a pageblock - * in this loop although we changed the pageblock type - * from highatomic to ac->migratetype. So we should - * adjust the count once. + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. */ - if (is_migrate_highatomic(mt)) { - unsigned long size; - /* - * It should never happen but changes to - * locking could inadvertently allow a per-cpu - * drain to add pages to MIGRATE_HIGHATOMIC - * while unreserving so be safe and watch for - * underflows. 
- */ - size = max(pageblock_nr_pages, 1UL << order); - size = min(size, zone->nr_reserved_highatomic); - zone->nr_reserved_highatomic -= size; - } + size = max(pageblock_nr_pages, 1UL << order); + size = min(size, zone->nr_reserved_highatomic); + zone->nr_reserved_highatomic -= size; /* * Convert to ac->migratetype and avoid the normal @@ -2105,10 +2084,12 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. */ if (order < pageblock_order) - ret = move_freepages_block(zone, page, mt, + ret = move_freepages_block(zone, page, + MIGRATE_HIGHATOMIC, ac->migratetype); else { - move_to_free_list(page, zone, order, mt, + move_to_free_list(page, zone, order, + MIGRATE_HIGHATOMIC, ac->migratetype); change_pageblock_range(page, order, ac->migratetype); @@ -2182,7 +2163,8 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, page = get_page_from_free_area(area, fallback_mt); page = try_to_steal_block(zone, page, current_order, order, - start_migratetype, alloc_flags); + start_migratetype, fallback_mt, + alloc_flags); if (page) goto got_one; } -- Gitee From 937e7f74b1a9be26666e08f1eaf32624f2422cb0 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 24 Nov 2023 16:35:52 +0530 Subject: [PATCH 460/484] mm: page_alloc: correct high atomic reserve calculations commit d68e39fc45f70e35eb74df2128d315c1d91e4dc4 upstream Conflicts: none Backport-reason: mm & mTHP allocation updates Patch series "mm: page_alloc: fixes for high atomic reserve caluculations", v3. The state of the system where the issue exposed shown in oom kill logs: [ 295.998653] Normal free:7728kB boost:0kB min:804kB low:1004kB high:1204kB reserved_highatomic:8192KB active_anon:4kB inactive_anon:0kB active_file:24kB inactive_file:24kB unevictable:1220kB writepending:0kB present:70732kB managed:49224kB mlocked:0kB bounce:0kB free_pcp:688kBlocal_pcp:492kB free_cma:0kB [ 295.998656] lowmem_reserve[]: 0 32 [ 295.998659] Normal: 508*4kB (UMEH) 241*8kB (UMEH) 143*16kB (UMEH) 33*32kB (UH) 7*64kB (UH) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 7752kB From the above, it is seen that ~16MB of memory reserved for high atomic reserves against the expectation of 1% reserves which is fixed in the 1st patch. Don't reserve the high atomic page blocks if 1% of zone memory size is below a pageblock size. This patch (of 2): reserve_highatomic_pageblock() aims to reserve the 1% of the managed pages of a zone, which is used for the high order atomic allocations. It uses the below calculation to reserve: static void reserve_highatomic_pageblock(struct page *page, ....) { ....... max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; if (zone->nr_reserved_highatomic >= max_managed) goto out; zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); out: .... } Since we are always appending the 1% of zone managed pages count to pageblock_nr_pages, the minimum it is turning into 2 pageblocks as the nr_reserved_highatomic is incremented/decremented in pageblock sizes. Encountered a system(actually a VM running on the Linux kernel) with the below zone configuration: Normal free:7728kB boost:0kB min:804kB low:1004kB high:1204kB reserved_highatomic:8192KB managed:49224kB The existing calculations making it to reserve the 8MB(with pageblock size of 4MB) i.e. 16% of the zone managed memory. 
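Working through the quoted zone makes the blow-up obvious (approximate figures, assuming 4kB pages and the 4MB pageblock size mentioned above):

	zone_managed_pages()       = 49224kB / 4kB  ~= 12306 pages
	1% of the zone             ~=   123 pages     (~492kB)
	pageblock_nr_pages         =  4096kB / 4kB   =  1024 pages
	max_managed = 123 + 1024   =  1147 pages      (~4.5MB)

Because nr_reserved_highatomic only grows in whole pageblocks, the first reserved block (1024 pages) is still below max_managed, so a second block gets reserved before the check trips, and the reserve settles at 8192kB, roughly 16% of the zone instead of the intended ~1%.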
Reserving such high amount of memory can easily exert memory pressure in the system thus may lead into unnecessary reclaims till unreserving of high atomic reserves. Since high atomic reserves are managed in pageblock size granules, as MIGRATE_HIGHATOMIC is set for such pageblock, fix the calculations for high atomic reserves as, minimum is pageblock size , maximum is approximately 1% of the zone managed pages. Link: https://lkml.kernel.org/r/cover.1700821416.git.quic_charante@quicinc.com Link: https://lkml.kernel.org/r/1660034138397b82a0a8b6ae51cbe96bd583d89e.1700821416.git.quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Acked-by: Mel Gorman Acked-by: David Rientjes Cc: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 913ea7ab163d..46dbf14ad7c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1992,10 +1992,11 @@ static void reserve_highatomic_pageblock(struct page *page, int order, unsigned long max_managed, flags; /* - * Limit the number reserved to 1 pageblock or roughly 1% of a zone. + * The number reserved as: minimum is 1 pageblock, maximum is + * roughly 1% of a zone. * Check is race-prone but harmless. */ - max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; + max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); if (zone->nr_reserved_highatomic >= max_managed) return; -- Gitee From 7b0a96ff4411e5ba4b2d2a759fb75a00af677d21 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 24 Nov 2023 16:35:53 +0530 Subject: [PATCH 461/484] mm: page_alloc: enforce minimum zone size to do high atomic reserves commit 9cd20f3fe045af95a8fe7a12328b21bfd2f3b8bf upstream Conflicts: none Backport-reason: mm & mTHP allocation updates Highatomic reserves are set to roughly 1% of zone for maximum and a pageblock size for minimum. Encountered a system with the below configuration: Normal free:7728kB boost:0kB min:804kB low:1004kB high:1204kB reserved_highatomic:8192KB managed:49224kB On such systems, even a single pageblock makes highatomic reserves are set to ~8% of the zone memory. This high value can easily exert pressure on the zone. Per discussion with Michal and Mel, it is not much useful to reserve the memory for highatomic allocations on such small systems[1]. Since the minimum size for high atomic reserves is always going to be a pageblock size and if 1% of zone managed pages is going to be below pageblock size, don't reserve memory for high atomic allocations. Thanks Michal for this suggestion[2]. Since no memory is being reserved for high atomic allocations and if respective allocation failures are seen, this patch can be reverted. 
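For the same quoted zone, the new cutoff indeed disables the reserve entirely (a rough check under the same 4kB page / 4MB pageblock assumptions):

	zone_managed_pages() / 100  ~=  123 pages  (~492kB)
	pageblock_nr_pages           = 1024 pages  (4096kB)

	123 < 1024, so reserve_highatomic_pageblock() now returns before
	reserving anything. A zone needs roughly 400MB of managed memory
	before 1% of it covers a single 4MB pageblock.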
[1] https://lore.kernel.org/linux-mm/20231117161956.d3yjdxhhm4rhl7h2@techsingularity.net/ [2] https://lore.kernel.org/linux-mm/ZVYRJMUitykepLRy@tiehlicka/ Link: https://lkml.kernel.org/r/c3a2a48e2cfe08176a80eaf01c110deb9e918055.1700821416.git.quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Acked-by: David Rientjes Cc: David Hildenbrand Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Pavankumar Kondeti Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 46dbf14ad7c0..f0621785e973 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1993,9 +1993,12 @@ static void reserve_highatomic_pageblock(struct page *page, int order, /* * The number reserved as: minimum is 1 pageblock, maximum is - * roughly 1% of a zone. + * roughly 1% of a zone. But if 1% of a zone falls below a + * pageblock size, then don't reserve any pageblocks. * Check is race-prone but harmless. */ + if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) + return; max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); if (zone->nr_reserved_highatomic >= max_managed) return; -- Gitee From 4a65b84f092731192cec7834532c7d4290b12387 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 Feb 2025 19:08:26 -0500 Subject: [PATCH 462/484] mm: page_alloc: group fallback functions together commit a4138a2702a4428317ecdb115934554df4b788b4 upstream Conflicts: trivial Backport-reason: mm & mTHP allocation fixes The way the fallback rules are spread out makes them hard to follow. Move the functions next to each other at least. Link: https://lkml.kernel.org/r/20250225001023.1494422-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Brendan Jackman Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 394 ++++++++++++++++++++++++------------------------ 1 file changed, 197 insertions(+), 197 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f0621785e973..b39794f1f334 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1803,6 +1803,43 @@ static void change_pageblock_range(struct page *pageblock_page, } } +static inline bool boost_watermark(struct zone *zone) +{ + unsigned long max_boost; + + if (!watermark_boost_factor) + return false; + /* + * Don't bother in zones that are unlikely to produce results. + * On small machines, including kdump capture kernels running + * in a small area, boosting the watermark can cause an out of + * memory situation immediately. + */ + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) + return false; + + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); + + /* + * high watermark may be uninitialised if fragmentation occurs + * very early in boot so do not boost. We do not fall + * through and boost by pageblock_nr_pages as failing + * allocations that early means that reclaim is not going + * to help and it may even be impossible to reclaim the + * boosted watermark resulting in a hang. 
+ */ + if (!max_boost) + return false; + + max_boost = max(pageblock_nr_pages, max_boost); + + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, + max_boost); + + return true; +} + /* * When we are falling back to another migratetype during allocation, try to * steal extra free pages from the same pageblocks to satisfy further @@ -1836,41 +1873,38 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } -static inline bool boost_watermark(struct zone *zone) +/* + * Check whether there is a suitable fallback freepage with requested order. + * If only_stealable is true, this function returns fallback_mt only if + * we can steal other freepages all together. This would help to reduce + * fragmentation due to mixed migratetype pages in one pageblock. + */ +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal) { - unsigned long max_boost; + int i; + int fallback_mt; - if (!watermark_boost_factor) - return false; - /* - * Don't bother in zones that are unlikely to produce results. - * On small machines, including kdump capture kernels running - * in a small area, boosting the watermark can cause an out of - * memory situation immediately. - */ - if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) - return false; + if (area->nr_free == 0) + return -1; - max_boost = mult_frac(zone->_watermark[WMARK_HIGH], - watermark_boost_factor, 10000); + *can_steal = false; + for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { + fallback_mt = fallbacks[migratetype][i]; + if (free_area_empty(area, fallback_mt)) + continue; - /* - * high watermark may be uninitialised if fragmentation occurs - * very early in boot so do not boost. We do not fall - * through and boost by pageblock_nr_pages as failing - * allocations that early means that reclaim is not going - * to help and it may even be impossible to reclaim the - * boosted watermark resulting in a hang. - */ - if (!max_boost) - return false; + if (can_steal_fallback(order, migratetype)) + *can_steal = true; - max_boost = max(pageblock_nr_pages, max_boost); + if (!only_stealable) + return fallback_mt; - zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, - max_boost); + if (*can_steal) + return fallback_mt; + } - return true; + return -1; } /* @@ -1946,175 +1980,6 @@ try_to_steal_block(struct zone *zone, struct page *page, return NULL; } -/* - * Check whether there is a suitable fallback freepage with requested order. - * If only_stealable is true, this function returns fallback_mt only if - * we can steal other freepages all together. This would help to reduce - * fragmentation due to mixed migratetype pages in one pageblock. 
- */ -int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal) -{ - int i; - int fallback_mt; - - if (area->nr_free == 0) - return -1; - - *can_steal = false; - for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { - fallback_mt = fallbacks[migratetype][i]; - if (free_area_empty(area, fallback_mt)) - continue; - - if (can_steal_fallback(order, migratetype)) - *can_steal = true; - - if (!only_stealable) - return fallback_mt; - - if (*can_steal) - return fallback_mt; - } - - return -1; -} - -/* - * Reserve the pageblock(s) surrounding an allocation request for - * exclusive use of high-order atomic allocations if there are no - * empty page blocks that contain a page with a suitable order - */ -static void reserve_highatomic_pageblock(struct page *page, int order, - struct zone *zone) -{ - int mt; - unsigned long max_managed, flags; - - /* - * The number reserved as: minimum is 1 pageblock, maximum is - * roughly 1% of a zone. But if 1% of a zone falls below a - * pageblock size, then don't reserve any pageblocks. - * Check is race-prone but harmless. - */ - if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) - return; - max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); - if (zone->nr_reserved_highatomic >= max_managed) - return; - - spin_lock_irqsave(&zone->lock, flags); - - /* Recheck the nr_reserved_highatomic limit under the lock */ - if (zone->nr_reserved_highatomic >= max_managed) - goto out_unlock; - - /* Yoink! */ - mt = get_pageblock_migratetype(page); - /* Only reserve normal pageblocks (i.e., they can merge with others) */ - if (!migratetype_is_mergeable(mt)) - goto out_unlock; - - if (order < pageblock_order) { - if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) - goto out_unlock; - zone->nr_reserved_highatomic += pageblock_nr_pages; - } else { - change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); - zone->nr_reserved_highatomic += 1 << order; - } - -out_unlock: - spin_unlock_irqrestore(&zone->lock, flags); -} - -/* - * Used when an allocation is about to fail under memory pressure. This - * potentially hurts the reliability of high-order allocations when under - * intense memory pressure but failed atomic allocations should be easier - * to recover from than an OOM. - * - * If @force is true, try to unreserve pageblocks even though highatomic - * pageblock is exhausted. - */ -static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, - bool force) -{ - struct zonelist *zonelist = ac->zonelist; - unsigned long flags; - struct zoneref *z; - struct zone *zone; - struct page *page; - int order; - int ret; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, - ac->nodemask) { - /* - * Preserve at least one pageblock unless memory pressure - * is really high. - */ - if (!force && zone->nr_reserved_highatomic <= - pageblock_nr_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < NR_PAGE_ORDERS; order++) { - struct free_area *area = &(zone->free_area[order]); - unsigned long size; - - page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); - if (!page) - continue; - - /* - * It should never happen but changes to - * locking could inadvertently allow a per-cpu - * drain to add pages to MIGRATE_HIGHATOMIC - * while unreserving so be safe and watch for - * underflows. 
- */ - size = max(pageblock_nr_pages, 1UL << order); - size = min(size, zone->nr_reserved_highatomic); - zone->nr_reserved_highatomic -= size; - - /* - * Convert to ac->migratetype and avoid the normal - * pageblock stealing heuristics. Minimally, the caller - * is doing the work and needs the pages. More - * importantly, if the block was always converted to - * MIGRATE_UNMOVABLE or another type then the number - * of pageblocks that cannot be completely freed - * may increase. - */ - if (order < pageblock_order) - ret = move_freepages_block(zone, page, - MIGRATE_HIGHATOMIC, - ac->migratetype); - else { - move_to_free_list(page, zone, order, - MIGRATE_HIGHATOMIC, - ac->migratetype); - change_pageblock_range(page, order, - ac->migratetype); - ret = 1; - } - /* - * Reserving the block(s) already succeeded, - * so this should not fail on zone boundaries. - */ - WARN_ON_ONCE(ret == -1); - if (ret > 0) { - spin_unlock_irqrestore(&zone->lock, flags); - return ret; - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } - - return false; -} - /* * Try finding a free buddy page on the fallback list. * @@ -3103,6 +2968,141 @@ noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) } ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); +/* + * Reserve the pageblock(s) surrounding an allocation request for + * exclusive use of high-order atomic allocations if there are no + * empty page blocks that contain a page with a suitable order + */ +static void reserve_highatomic_pageblock(struct page *page, int order, + struct zone *zone) +{ + int mt; + unsigned long max_managed, flags; + + /* + * The number reserved as: minimum is 1 pageblock, maximum is + * roughly 1% of a zone. But if 1% of a zone falls below a + * pageblock size, then don't reserve any pageblocks. + * Check is race-prone but harmless. + */ + if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) + return; + max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); + if (zone->nr_reserved_highatomic >= max_managed) + return; + + spin_lock_irqsave(&zone->lock, flags); + + /* Recheck the nr_reserved_highatomic limit under the lock */ + if (zone->nr_reserved_highatomic >= max_managed) + goto out_unlock; + + /* Yoink! */ + mt = get_pageblock_migratetype(page); + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (!migratetype_is_mergeable(mt)) + goto out_unlock; + + if (order < pageblock_order) { + if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1) + goto out_unlock; + zone->nr_reserved_highatomic += pageblock_nr_pages; + } else { + change_pageblock_range(page, order, MIGRATE_HIGHATOMIC); + zone->nr_reserved_highatomic += 1 << order; + } + +out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Used when an allocation is about to fail under memory pressure. This + * potentially hurts the reliability of high-order allocations when under + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + * + * If @force is true, try to unreserve pageblocks even though highatomic + * pageblock is exhausted. 
+ */ +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + bool force) +{ + struct zonelist *zonelist = ac->zonelist; + unsigned long flags; + struct zoneref *z; + struct zone *zone; + struct page *page; + int order; + int ret; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, + ac->nodemask) { + /* + * Preserve at least one pageblock unless memory pressure + * is really high. + */ + if (!force && zone->nr_reserved_highatomic <= + pageblock_nr_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct free_area *area = &(zone->free_area[order]); + unsigned long size; + + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); + if (!page) + continue; + + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + size = max(pageblock_nr_pages, 1UL << order); + size = min(size, zone->nr_reserved_highatomic); + zone->nr_reserved_highatomic -= size; + + /* + * Convert to ac->migratetype and avoid the normal + * pageblock stealing heuristics. Minimally, the caller + * is doing the work and needs the pages. More + * importantly, if the block was always converted to + * MIGRATE_UNMOVABLE or another type then the number + * of pageblocks that cannot be completely freed + * may increase. + */ + if (order < pageblock_order) + ret = move_freepages_block(zone, page, + MIGRATE_HIGHATOMIC, + ac->migratetype); + else { + move_to_free_list(page, zone, order, + MIGRATE_HIGHATOMIC, + ac->migratetype); + change_pageblock_range(page, order, + ac->migratetype); + ret = 1; + } + /* + * Reserving the block(s) already succeeded, + * so this should not fail on zone boundaries. + */ + WARN_ON_ONCE(ret == -1); + if (ret > 0) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + } + + return false; +} + static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { -- Gitee From 1c2d385e517ac2faeb070a980704eaf20f4b3aa9 Mon Sep 17 00:00:00 2001 From: gaoxiang17 Date: Fri, 20 Sep 2024 20:20:30 +0800 Subject: [PATCH 463/484] mm/page_alloc: add some detailed comments in can_steal_fallback commit 6025ea5abbe5d813d6a41c78e6ea14259fb503f4 upstream Conflicts: none Backport-reason: mm allocation cleanup, only name changes for easier backport [akpm@linux-foundation.org: tweak grammar, fit to 80 cols] Link: https://lkml.kernel.org/r/20240920122030.159751-1-gxxa03070307@gmail.com Signed-off-by: gaoxiang17 Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b39794f1f334..ca2c69399b44 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1864,6 +1864,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt) if (order >= pageblock_order) return true; + /* + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, you just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. Unmovable and + * reclaimable allocations need to actually steal pages. 
+ */ if (order >= pageblock_order / 2 || start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE || -- Gitee From aa00a2a35fd3bb28f3948f146d307db5c5ca5f35 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 28 Feb 2025 09:52:17 +0000 Subject: [PATCH 464/484] mm/page_alloc: clarify terminology in migratetype fallback code commit e47f1f56dd82cc6d91f5c4d914a534aa03cd12ca upstream Conflicts: none Backport-reason: mm allocation cleanup, only name changes for easier backport Patch series "mm/page_alloc: Some clarifications for migratetype fallback", v4. A couple of patches to try and make the code easier to follow. This patch (of 2): This code is rather confusing because: 1. "Steal" is sometimes used to refer to the general concept of allocating from a from a block of a fallback migratetype (steal_suitable_fallback()) but sometimes it refers specifically to converting a whole block's migratetype (can_steal_fallback()). 2. can_steal_fallback() sounds as though it's answering the question "am I functionally permitted to allocate from that other type" but in fact it is encoding a heuristic preference. 3. The same piece of data has different names in different places: can_steal vs whole_block. This reinforces point 2 because it looks like the different names reflect a shift in intent from "am I allowed to steal" to "do I want to steal", but no such shift exists. Fix 1. by avoiding the term "steal" in ambiguous contexts. Start using the term "claim" to refer to the special case of stealing the entire block. Fix 2. by using "should" instead of "can", and also rename its parameters and add some commentary to make it more explicit what they mean. Fix 3. by adopting the new "claim" terminology universally for this set of variables. Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-0-cb2ef1a4e610@google.com Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-1-cb2ef1a4e610@google.com Signed-off-by: Brendan Jackman Reviewed-by: Vlastimil Babka Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/compaction.c | 4 +-- mm/internal.h | 2 +- mm/page_alloc.c | 72 ++++++++++++++++++++++++------------------------- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index ef5efdc893c4..7714b2353902 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2326,7 +2326,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) ret = COMPACT_NO_SUITABLE_PAGE; for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; - bool can_steal; + bool claim_block; /* Job done if page is free of the right migratetype */ if (!free_area_empty(area, migratetype)) @@ -2343,7 +2343,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) + true, &claim_block) != -1) /* * Movable pages are OK in any pageblock. 
If we are * stealing for a non-movable allocation, make sure diff --git a/mm/internal.h b/mm/internal.h index 9badc16c69b8..8dd29e7f1c39 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -808,7 +808,7 @@ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal); + int migratetype, bool claim_only, bool *claim_block); static inline bool free_area_empty(struct free_area *area, int migratetype) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca2c69399b44..3efa1e59f136 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1842,22 +1842,22 @@ static inline bool boost_watermark(struct zone *zone) /* * When we are falling back to another migratetype during allocation, try to - * steal extra free pages from the same pageblocks to satisfy further - * allocations, instead of polluting multiple pageblocks. + * claim entire blocks to satisfy further allocations, instead of polluting + * multiple pageblocks. * - * If we are stealing a relatively large buddy page, it is likely there will - * be more free pages in the pageblock, so try to steal them all. For - * reclaimable and unmovable allocations, we steal regardless of page size, - * as fragmentation caused by those allocations polluting movable pageblocks - * is worse than movable allocations stealing from unmovable and reclaimable - * pageblocks. + * If we are stealing a relatively large buddy page, it is likely there will be + * more free pages in the pageblock, so try to claim the whole block. For + * reclaimable and unmovable allocations, we try to claim the whole block + * regardless of page size, as fragmentation caused by those allocations + * polluting movable pageblocks is worse than movable allocations stealing from + * unmovable and reclaimable pageblocks. */ -static bool can_steal_fallback(unsigned int order, int start_mt) +static bool should_try_claim_block(unsigned int order, int start_mt) { /* * Leaving this order check is intended, although there is * relaxed order check in next check. The reason is that - * we can actually steal whole pageblock if this condition met, + * we can actually claim the whole pageblock if this condition met, * but, below check doesn't guarantee it and that is just heuristic * so could be changed anytime. */ @@ -1870,7 +1870,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) * reclaimable pages that are closest to the request size. After a * while, memory compaction may occur to form large contiguous pages, * and the next movable allocation may not need to steal. Unmovable and - * reclaimable allocations need to actually steal pages. + * reclaimable allocations need to actually claim the whole block. */ if (order >= pageblock_order / 2 || start_mt == MIGRATE_RECLAIMABLE || @@ -1883,12 +1883,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * Check whether there is a suitable fallback freepage with requested order. - * If only_stealable is true, this function returns fallback_mt only if - * we can steal other freepages all together. This would help to reduce + * Sets *claim_block to instruct the caller whether it should convert a whole + * pageblock to the returned migratetype. + * If only_claim is true, this function returns fallback_mt only if + * we would do this whole-block claiming. This would help to reduce * fragmentation due to mixed migratetype pages in one pageblock. 
*/ int find_suitable_fallback(struct free_area *area, unsigned int order, - int migratetype, bool only_stealable, bool *can_steal) + int migratetype, bool only_claim, bool *claim_block) { int i; int fallback_mt; @@ -1896,19 +1898,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, if (area->nr_free == 0) return -1; - *can_steal = false; + *claim_block = false; for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { fallback_mt = fallbacks[migratetype][i]; if (free_area_empty(area, fallback_mt)) continue; - if (can_steal_fallback(order, migratetype)) - *can_steal = true; + if (should_try_claim_block(order, migratetype)) + *claim_block = true; - if (!only_stealable) - return fallback_mt; - - if (*can_steal) + if (*claim_block || !only_claim) return fallback_mt; } @@ -1916,14 +1915,14 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, } /* - * This function implements actual steal behaviour. If order is large enough, we - * can claim the whole pageblock for the requested migratetype. If not, we check - * the pageblock for constituent pages; if at least half of the pages are free - * or compatible, we can still claim the whole block, so pages freed in the - * future will be put on the correct free list. + * This function implements actual block claiming behaviour. If order is large + * enough, we can claim the whole pageblock for the requested migratetype. If + * not, we check the pageblock for constituent pages; if at least half of the + * pages are free or compatible, we can still claim the whole block, so pages + * freed in the future will be put on the correct free list. */ static struct page * -try_to_steal_block(struct zone *zone, struct page *page, +try_to_claim_block(struct zone *zone, struct page *page, int current_order, int order, int start_type, int block_type, unsigned int alloc_flags) { @@ -1991,11 +1990,12 @@ try_to_steal_block(struct zone *zone, struct page *page, /* * Try finding a free buddy page on the fallback list. * - * This will attempt to steal a whole pageblock for the requested type + * This will attempt to claim a whole pageblock for the requested type * to ensure grouping of such requests in the future. * - * If a whole block cannot be stolen, regress to __rmqueue_smallest() - * logic to at least break up as little contiguity as possible. + * If a whole block cannot be claimed, steal an individual page, regressing to + * __rmqueue_smallest() logic to at least break up as little contiguity as + * possible. 
* * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop @@ -2012,7 +2012,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, int min_order = order; struct page *page; int fallback_mt; - bool can_steal; + bool claim_block; /* * Do not steal pages from freelists belonging to other pageblocks @@ -2031,15 +2031,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &can_steal); + start_migratetype, false, &claim_block); if (fallback_mt == -1) continue; - if (!can_steal) + if (!claim_block) break; page = get_page_from_free_area(area, fallback_mt); - page = try_to_steal_block(zone, page, current_order, order, + page = try_to_claim_block(zone, page, current_order, order, start_migratetype, fallback_mt, alloc_flags); if (page) @@ -2049,11 +2049,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, if (alloc_flags & ALLOC_NOFRAGMENT) return NULL; - /* No luck stealing blocks. Find the smallest fallback page */ + /* No luck claiming pageblock. Find the smallest fallback page */ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, - start_migratetype, false, &can_steal); + start_migratetype, false, &claim_block); if (fallback_mt == -1) continue; -- Gitee From 2c1e53bd480a58aaa889be93593f94bc8c6e34be Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 28 Feb 2025 09:52:18 +0000 Subject: [PATCH 465/484] mm/page_alloc: clarify should_claim_block() commentary commit a14efee04796dd3f614eaf5348ca1ac099c21349 upstream Conflicts: none Backport-reason: mm allocation cleanup, only name changes for easier backport There's lots of text here but it's a little hard to follow, this is an attempt to break it up and align its structure more closely with the code. Reword the top-level function comment to just explain what question the function answers from the point of view of the caller. Break up the internal logic into different sections that can have their own commentary describing why that part of the rationale is present. Note the page_group_by_mobility_disabled logic is not explained in the commentary, that is outside the scope of this patch... Link: https://lkml.kernel.org/r/20250228-clarify-steal-v4-2-cb2ef1a4e610@google.com Signed-off-by: Brendan Jackman Reviewed-by: Vlastimil Babka Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Yosry Ahmed Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3efa1e59f136..33064d113d4f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1841,16 +1841,9 @@ static inline bool boost_watermark(struct zone *zone) } /* - * When we are falling back to another migratetype during allocation, try to - * claim entire blocks to satisfy further allocations, instead of polluting - * multiple pageblocks. - * - * If we are stealing a relatively large buddy page, it is likely there will be - * more free pages in the pageblock, so try to claim the whole block. 
For - * reclaimable and unmovable allocations, we try to claim the whole block - * regardless of page size, as fragmentation caused by those allocations - * polluting movable pageblocks is worse than movable allocations stealing from - * unmovable and reclaimable pageblocks. + * When we are falling back to another migratetype during allocation, should we + * try to claim an entire block to satisfy further allocations, instead of + * polluting multiple pageblocks? */ static bool should_try_claim_block(unsigned int order, int start_mt) { @@ -1865,19 +1858,32 @@ static bool should_try_claim_block(unsigned int order, int start_mt) return true; /* - * Movable pages won't cause permanent fragmentation, so when you alloc - * small pages, you just need to temporarily steal unmovable or - * reclaimable pages that are closest to the request size. After a - * while, memory compaction may occur to form large contiguous pages, - * and the next movable allocation may not need to steal. Unmovable and - * reclaimable allocations need to actually claim the whole block. + * Above a certain threshold, always try to claim, as it's likely there + * will be more free pages in the pageblock. + */ + if (order >= pageblock_order / 2) + return true; + + /* + * Unmovable/reclaimable allocations would cause permanent + * fragmentations if they fell back to allocating from a movable block + * (polluting it), so we try to claim the whole block regardless of the + * allocation size. Later movable allocations can always steal from this + * block, which is less problematic. */ - if (order >= pageblock_order / 2 || - start_mt == MIGRATE_RECLAIMABLE || - start_mt == MIGRATE_UNMOVABLE || - page_group_by_mobility_disabled) + if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE) return true; + if (page_group_by_mobility_disabled) + return true; + + /* + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, we just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. + */ return false; } -- Gitee From 6923acdcfe30944e881bbd5030238bc4b21c6cdc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2025 14:01:53 -0400 Subject: [PATCH 466/484] mm: page_alloc: speed up fallbacks in rmqueue_bulk() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 90abee6d7895d5eef18c91d870d8168be4e76e9d upstream Conflicts: none Backport-reason: mm & mTHP allocation fixes The test robot identified c2f6ea38fc1b ("mm: page_alloc: don't steal single pages from biggest buddy") as the root cause of a 56.4% regression in vm-scalability::lru-file-mmap-read. Carlos reports an earlier patch, c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion"), as the root cause for a regression in worst-case zone->lock+irqoff hold times. Both of these patches modify the page allocator's fallback path to be less greedy in an effort to stave off fragmentation. The flip side of this is that fallbacks are also less productive each time around, which means the fallback search can run much more frequently. Carlos' traces point to rmqueue_bulk() specifically, which tries to refill the percpu cache by allocating a large batch of pages in a loop. 
It highlights how once the native freelists are exhausted, the fallback code first scans orders top-down for whole blocks to claim, then falls back to a bottom-up search for the smallest buddy to steal. For the next batch page, it goes through the same thing again. This can be made more efficient. Since rmqueue_bulk() holds the zone->lock over the entire batch, the freelists are not subject to outside changes; when the search for a block to claim has already failed, there is no point in trying again for the next page. Modify __rmqueue() to remember the last successful fallback mode, and restart directly from there on the next rmqueue_bulk() iteration. Oliver confirms that this improves beyond the regression that the test robot reported against c2f6ea38fc1b: commit: f3b92176f4 ("tools/selftests: add guard region test for /proc/$pid/pagemap") c2f6ea38fc ("mm: page_alloc: don't steal single pages from biggest buddy") acc4d5ff0b ("Merge tag 'net-6.15-rc0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net") 2c847f27c3 ("mm: page_alloc: speed up fallbacks in rmqueue_bulk()") <--- your patch f3b92176f4f7100f c2f6ea38fc1b640aa7a2e155cc1 acc4d5ff0b61eb1715c498b6536 2c847f27c37da65a93d23c237c5 ---------------- --------------------------- --------------------------- --------------------------- %stddev %change %stddev %change %stddev %change %stddev \ | \ | \ | \ 25525364 ± 3% -56.4% 11135467 -57.8% 10779336 +31.6% 33581409 vm-scalability.throughput Carlos confirms that worst-case times are almost fully recovered compared to before the earlier culprit patch: 2dd482ba627d (before freelist hygiene): 1ms c0cd6f557b90 (after freelist hygiene): 90ms next-20250319 (steal smallest buddy): 280ms this patch : 8ms [jackmanb@google.com: comment updates] Link: https://lkml.kernel.org/r/D92AC0P9594X.3BML64MUKTF8Z@google.com [hannes@cmpxchg.org: reset rmqueue_mode in rmqueue_buddy() error loop, per Yunsheng Lin] Link: https://lkml.kernel.org/r/20250409140023.GA2313@cmpxchg.org Link: https://lkml.kernel.org/r/20250407180154.63348-1-hannes@cmpxchg.org Fixes: c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion") Fixes: c2f6ea38fc1b ("mm: page_alloc: don't steal single pages from biggest buddy") Signed-off-by: Johannes Weiner Signed-off-by: Brendan Jackman Reported-by: kernel test robot Reported-by: Carlos Song Tested-by: Carlos Song Tested-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202503271547.fc08b188-lkp@intel.com Reviewed-by: Brendan Jackman Tested-by: Shivank Garg Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: [6.10+] Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/page_alloc.c | 113 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 79 insertions(+), 34 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 33064d113d4f..9d269118a37e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1994,23 +1994,15 @@ try_to_claim_block(struct zone *zone, struct page *page, } /* - * Try finding a free buddy page on the fallback list. - * - * This will attempt to claim a whole pageblock for the requested type - * to ensure grouping of such requests in the future. - * - * If a whole block cannot be claimed, steal an individual page, regressing to - * __rmqueue_smallest() logic to at least break up as little contiguity as - * possible. + * Try to allocate from some fallback migratetype by claiming the entire block, + * i.e. converting it to the allocation's start migratetype. 
* * The use of signed ints for order and current_order is a deliberate * deviation from the rest of this file, to make the for loop * condition simpler. - * - * Return the stolen page, or NULL if none can be found. */ static __always_inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, +__rmqueue_claim(struct zone *zone, int order, int start_migratetype, unsigned int alloc_flags) { struct free_area *area; @@ -2048,14 +2040,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, page = try_to_claim_block(zone, page, current_order, order, start_migratetype, fallback_mt, alloc_flags); - if (page) - goto got_one; + if (page) { + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; + } } - if (alloc_flags & ALLOC_NOFRAGMENT) - return NULL; + return NULL; +} + +/* + * Try to steal a single page from some fallback migratetype. Leave the rest of + * the block as its current migratetype, potentially causing fragmentation. + */ +static __always_inline struct page * +__rmqueue_steal(struct zone *zone, int order, int start_migratetype) +{ + struct free_area *area; + int current_order; + struct page *page; + int fallback_mt; + bool claim_block; - /* No luck claiming pageblock. Find the smallest fallback page */ for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2065,25 +2072,28 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, page = get_page_from_free_area(area, fallback_mt); page_del_and_expand(zone, page, order, current_order, fallback_mt); - goto got_one; + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + return page; } return NULL; - -got_one: - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); - - return page; } +enum rmqueue_mode { + RMQUEUE_NORMAL, + RMQUEUE_CMA, + RMQUEUE_CLAIM, + RMQUEUE_STEAL, +}; + /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ static __always_inline struct page * __rmqueue(struct zone *zone, unsigned int order, int migratetype, - unsigned int alloc_flags) + unsigned int alloc_flags, enum rmqueue_mode *mode) { struct page *page; @@ -2102,16 +2112,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, } } - page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) { - if (alloc_flags & ALLOC_CMA) + /* + * First try the freelists of the requested migratetype, then try + * fallbacks modes with increasing levels of fragmentation risk. + * + * The fallback logic is expensive and rmqueue_bulk() calls in + * a loop with the zone->lock held, meaning the freelists are + * not subject to any outside changes. Remember in *mode where + * we found pay dirt, to save us the search on the next call. 
+ */ + switch (*mode) { + case RMQUEUE_NORMAL: + page = __rmqueue_smallest(zone, order, migratetype); + if (page) + return page; + fallthrough; + case RMQUEUE_CMA: + if (alloc_flags & ALLOC_CMA) { page = __rmqueue_cma_fallback(zone, order); - - if (!page) - page = __rmqueue_fallback(zone, order, migratetype, - alloc_flags); + if (page) { + *mode = RMQUEUE_CMA; + return page; + } + } + fallthrough; + case RMQUEUE_CLAIM: + page = __rmqueue_claim(zone, order, migratetype, alloc_flags); + if (page) { + /* Replenished preferred freelist, back to normal mode. */ + *mode = RMQUEUE_NORMAL; + return page; + } + fallthrough; + case RMQUEUE_STEAL: + if (!(alloc_flags & ALLOC_NOFRAGMENT)) { + page = __rmqueue_steal(zone, order, migratetype); + if (page) { + *mode = RMQUEUE_STEAL; + return page; + } + } } - return page; + return NULL; } /* @@ -2123,13 +2165,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; unsigned long flags; int i; spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, - alloc_flags); + alloc_flags, &rmqm); if (unlikely(page == NULL)) break; @@ -2784,7 +2827,9 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); + enum rmqueue_mode rmqm = RMQUEUE_NORMAL; + + page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm); /* * If the allocation fails, allow OOM handling and -- Gitee From faa56eafdf014ac513defd795924bb586aa8fcee Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 21 Aug 2025 13:29:47 +0000 Subject: [PATCH 467/484] mm: remove is_migrate_highatomic() commit ce32123b9bc02bb4cd343fa03d1b8bb7f9ce9c51 upstream Conflicts: none Backport-reason: mm allocation cleanup, only name changes for easier backport There are 3 potential reasons for is_migrate_*() helpers: 1. They represent higher-level attributes of migratetypes, like is_migrate_movable() 2. They are ifdef'd, like is_migrate_isolate(). 3. For consistency with an is_migrate_*_page() helper, also like is_migrate_isolate(). It looks like is_migrate_highatomic() was for case 3, but that was removed in commit e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting"). So remove the indirection and go back to a simple comparison. 
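To make reason 2 above concrete, here is a minimal standalone sketch (plain userspace C, not the kernel's definitions; the enum values and the CONFIG_MEMORY_ISOLATION guard only loosely mirror the kernel): an ifdef'd helper such as is_migrate_isolate() hides a config dependency from its callers, whereas MIGRATE_HIGHATOMIC always exists, so the open-coded comparison carries the same information with no indirection.

  #include <stdbool.h>
  #include <stdio.h>

  enum migratetype {
  	MIGRATE_UNMOVABLE,
  	MIGRATE_MOVABLE,
  	MIGRATE_HIGHATOMIC,
  	MIGRATE_ISOLATE,
  };

  #ifdef CONFIG_MEMORY_ISOLATION
  /* Reason 2: the helper keeps #ifdefs out of every call site. */
  static bool is_migrate_isolate(enum migratetype mt)
  {
  	return mt == MIGRATE_ISOLATE;
  }
  #else
  static bool is_migrate_isolate(enum migratetype mt)
  {
  	(void)mt;
  	return false;	/* compiles away when isolation is not built in */
  }
  #endif

  int main(void)
  {
  	enum migratetype mt = MIGRATE_HIGHATOMIC;

  	/* The removed helper added nothing over this direct comparison. */
  	printf("highatomic=%d isolate=%d\n",
  	       mt == MIGRATE_HIGHATOMIC, is_migrate_isolate(mt));
  	return 0;
  }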
Link: https://lkml.kernel.org/r/20250821-is-migrate-highatomic-v1-1-ddb6e5d7c566@google.com Signed-off-by: Brendan Jackman Reviewed-by: Zi Yan Acked-by: David Hildenbrand Acked-by: Johannes Weiner Reviewed-by: Lorenzo Stoakes Acked-by: SeongJae Park Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/internal.h | 5 ----- mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 8dd29e7f1c39..cbd279069fd0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1202,11 +1202,6 @@ extern const struct trace_print_flags pagetype_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; -static inline bool is_migrate_highatomic(enum migratetype migratetype) -{ - return migratetype == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d269118a37e..f334d5b8d5d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -642,7 +642,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages, if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); - else if (is_migrate_highatomic(migratetype)) + else if (migratetype == MIGRATE_HIGHATOMIC) WRITE_ONCE(zone->nr_free_highatomic, zone->nr_free_highatomic + nr_pages); } -- Gitee From 5f2edf5a3fb287ce05b9992066af82648e326524 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= Date: Fri, 2 May 2025 23:50:19 +0200 Subject: [PATCH 468/484] mm: fix folio_pte_batch() on XEN PV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7b08b74f3d99f6b801250683c751d391128799ec upstream Conflicts: trivial Backport-reason: mm & mTHP fixes On XEN PV, folio_pte_batch() can incorrectly batch beyond the end of a folio due to a corner case in pte_advance_pfn(). Specifically, when the PFN following the folio maps to an invalidated MFN, expected_pte = pte_advance_pfn(expected_pte, nr); produces a pte_none(). If the actual next PTE in memory is also pte_none(), the pte_same() succeeds, if (!pte_same(pte, expected_pte)) break; the loop is not broken, and batching continues into unrelated memory. For example, with a 4-page folio, the PTE layout might look like this: [ 53.465673] [ T2552] folio_pte_batch: printing PTE values at addr=0x7f1ac9dc5000 [ 53.465674] [ T2552] PTE[453] = 000000010085c125 [ 53.465679] [ T2552] PTE[454] = 000000010085d125 [ 53.465682] [ T2552] PTE[455] = 000000010085e125 [ 53.465684] [ T2552] PTE[456] = 000000010085f125 [ 53.465686] [ T2552] PTE[457] = 0000000000000000 <-- not present [ 53.465689] [ T2552] PTE[458] = 0000000101da7125 pte_advance_pfn(PTE[456]) returns a pte_none() due to invalid PFN->MFN mapping. The next actual PTE (PTE[457]) is also pte_none(), so the loop continues and includes PTE[457] in the batch, resulting in 5 batched entries for a 4-page folio. 
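The failure mode can be reproduced with a small standalone model (plain C with hypothetical PTE values loosely echoing the dump above, not kernel code): once the "expected" PTE degenerates to a none value past the folio, it compares equal to the real none PTE that follows, so the batch grows to 5 entries for a 4-page folio; clamping the scan length to the folio, as this fix does, stops it at 4.

  #include <stdio.h>

  #define FOLIO_PAGES 4

  /* 0 models pte_none(); nonzero values model present PTEs. */
  static const unsigned long ptes[] = {
  	0x85c125, 0x85d125, 0x85e125, 0x85f125, 0, 0x1da7125,
  };

  /* Models pte_advance_pfn() on XEN PV: past the folio the PFN has no
     MFN, so the "advanced" PTE collapses to a none value. */
  static unsigned long advance(unsigned long pte)
  {
  	if (pte == 0 || pte == 0x85f125)
  		return 0;
  	return pte + 0x1000;
  }

  static int batch(int max_nr)
  {
  	unsigned long expected = advance(ptes[0]);
  	int nr = 1;

  	while (nr < max_nr && ptes[nr] == expected) {
  		expected = advance(expected);
  		nr++;
  	}
  	return nr;
  }

  int main(void)
  {
  	printf("unclamped: %d pages batched\n", batch(6));           /* 5 */
  	printf("clamped:   %d pages batched\n", batch(FOLIO_PAGES)); /* 4 */
  	return 0;
  }

In the kernel, the same over-batching is what trips the VM_WARN_ON_FOLIO() sanity check quoted next.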
This triggers the following warning: [ 53.465751] [ T2552] page: refcount:85 mapcount:20 mapping:ffff88813ff4f6a8 index:0x110 pfn:0x10085c [ 53.465754] [ T2552] head: order:2 mapcount:80 entire_mapcount:0 nr_pages_mapped:4 pincount:0 [ 53.465756] [ T2552] memcg:ffff888003573000 [ 53.465758] [ T2552] aops:0xffffffff8226fd20 ino:82467c dentry name(?):"libc.so.6" [ 53.465761] [ T2552] flags: 0x2000000000416c(referenced|uptodate|lru|active|private|head|node=0|zone=2) [ 53.465764] [ T2552] raw: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8 [ 53.465767] [ T2552] raw: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000 [ 53.465768] [ T2552] head: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8 [ 53.465770] [ T2552] head: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000 [ 53.465772] [ T2552] head: 0020000000000202 ffffea0004021701 000000040000004f 00000000ffffffff [ 53.465774] [ T2552] head: 0000000300000003 8000000300000002 0000000000000013 0000000000000004 [ 53.465775] [ T2552] page dumped because: VM_WARN_ON_FOLIO((_Generic((page + nr_pages - 1), const struct page *: (const struct folio *)_compound_head(page + nr_pages - 1), struct page *: (struct folio *)_compound_head(page + nr_pages - 1))) != folio) Original code works as expected everywhere, except on XEN PV, where pte_advance_pfn() can yield a pte_none() after balloon inflation due to MFNs invalidation. In XEN, pte_advance_pfn() ends up calling __pte()->xen_make_pte()->pte_pfn_to_mfn(), which returns pte_none() when mfn == INVALID_P2M_ENTRY. The pte_pfn_to_mfn() documents that nastiness: If there's no mfn for the pfn, then just create an empty non-present pte. Unfortunately this loses information about the original pfn, so pte_mfn_to_pfn is asymmetric. While such hacks should certainly be removed, we can do better in folio_pte_batch() and simply check ahead of time how many PTEs we can possibly batch in our folio. This way, we can not only fix the issue but cleanup the code: removing the pte_pfn() check inside the loop body and avoiding end_ptr comparison + arithmetic. Link: https://lkml.kernel.org/r/20250502215019.822-2-arkamar@atlas.cz Fixes: f8d937761d65 ("mm/memory: optimize fork() with PTE-mapped THP") Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Petr Vaněk Cc: Ryan Roberts Cc: Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/internal.h | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index cbd279069fd0..5b7085bf46da 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -194,11 +194,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, bool *any_writable, bool *any_young) { - unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); - const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte, *ptep; bool writable, young; - int nr; + int nr, cur_nr; if (any_writable) *any_writable = false; @@ -209,11 +207,15 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + /* Limit max_nr to the actual remaining PFNs in the folio we could batch. 
*/ + max_nr = min_t(unsigned long, max_nr, + folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); + nr = pte_batch_hint(start_ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); ptep = start_ptep + nr; - while (ptep < end_ptep) { + while (nr < max_nr) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); @@ -224,25 +226,18 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (!pte_same(pte, expected_pte)) break; - /* - * Stop immediately once we reached the end of the folio. In - * corner cases the next PFN might fall into a different - * folio. - */ - if (pte_pfn(pte) >= folio_end_pfn) - break; - if (any_writable) *any_writable |= writable; if (any_young) *any_young |= young; - nr = pte_batch_hint(ptep, pte); - expected_pte = pte_advance_pfn(expected_pte, nr); - ptep += nr; + cur_nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, cur_nr); + ptep += cur_nr; + nr += cur_nr; } - return min(ptep - start_ptep, max_nr); + return min(nr, max_nr); } /** -- Gitee From 005100446e7df2cf534ac73e1ad49ad14ba9848b Mon Sep 17 00:00:00 2001 From: Chris Li Date: Wed, 17 Sep 2025 00:00:46 +0800 Subject: [PATCH 469/484] docs/mm: add document for swap table commit 87cc51571a7790f50151c4fdca10c43aa18a29ba upstream Conflicts: resolved Backport-reason: Swap Table: phase 1 Patch series "mm, swap: introduce swap table as swap cache (phase I)", v4. This is the first phase of the bigger series implementing basic infrastructures for the Swap Table idea proposed at the LSF/MM/BPF topic "Integrate swap cache, swap maps with swap allocator" [1]. To give credit where it is due, this is based on Chris Li's idea and a prototype of using cluster size atomic arrays to implement swap cache. This phase I contains 15 patches, introduces the swap table infrastructure and uses it as the swap cache backend. By doing so, we have up to ~5-20% performance gain in throughput, RPS or build time for benchmark and workload tests. The speed up is due to less contention on the swap cache access and shallower swap cache lookup path. The cluster size is much finer-grained than the 64M address space split, which is removed in this phase I. It also unifies and cleans up the swap code base. Each swap cluster will dynamically allocate the swap table, which is an atomic array to cover every swap slot in the cluster. It replaces the swap cache backed by XArray. In phase I, the static allocated swap_map still co-exists with the swap table. The memory usage is about the same as the original on average. A few exception test cases show about 1% higher in memory usage. In the following phases of the series, swap_map will merge into the swap table without additional memory allocation. It will result in net memory reduction compared to the original swap cache. Testing has shown that phase I has a significant performance improvement from 8c/1G ARM machine to 48c96t/128G x86_64 servers in many practical workloads. The full picture with a summary can be found at [2]. An older bigger series of 28 patches is posted at [3]. 
vm-scalability test:
====================
Test with:
usemem --init-time -O -y -x -n 31 1G (4G memcg, PMEM as swap)

                            Before:          After:
System time:                219.12s          158.16s        (-27.82%)
Sum Throughput:             4767.13 MB/s     6128.59 MB/s   (+28.55%)
Single process Throughput:  150.21 MB/s      196.52 MB/s    (+30.83%)
Free latency:               175047.58 us     131411.87 us   (-24.92%)

usemem --init-time -O -y -x -n 32 1536M (16G memory, global pressure, PMEM as swap)

                            Before:          After:
System time:                356.16s          284.68s        (-20.06%)
Sum Throughput:             4648.35 MB/s     5453.52 MB/s   (+17.32%)
Single process Throughput:  141.63 MB/s      168.35 MB/s    (+18.86%)
Free latency:               499907.71 us     484977.03 us   (-2.99%)

This shows an improvement of more than 20% in most readings.

Build kernel test:
==================
The following result matrix is from building the kernel with defconfig on
tmpfs with ZSWAP / ZRAM, using different memory pressure and setups.
Measuring sys and real time in seconds, less is better (user time is
almost identical as expected):

-j / Mem  | Sys before / after      | Real before / after
Using 16G ZRAM with memcg limit:
6 / 192M  |  9686 /  9472  -2.21%   | 2130 / 2096  -1.59%
12 / 256M |  6610 /  6451  -2.41%   |  827 /  812  -1.81%
24 / 384M |  5938 /  5701  -3.37%   |  414 /  405  -2.17%
48 / 768M |  4696 /  4409  -6.11%   |  188 /  182  -3.19%
With 64k folio:
24 / 512M |  4222 /  4162  -1.42%   |  326 /  321  -1.53%
48 / 1G   |  3688 /  3622  -1.79%   |  151 /  149  -1.32%
With ZSWAP with 3G memcg (using higher limit due to kmem account):
48 / 3G   |   603 /   581  -3.65%   |   81 /   80  -1.23%

Testing extremely high global memory and schedule pressure: Using ZSWAP
with 32G NVMEs in a 48c VM that has 4G memory, no memcg limit, system
components take up about 1.5G already, using make -j48 to build defconfig:

Before: sys time: 2069.53s  real time: 135.76s
After:  sys time: 2021.13s (-2.34%)  real time: 134.23s (-1.12%)

On another 48c 4G memory VM, using 16G ZRAM as swap, testing make -j48
with the same config:

Before: sys time: 1756.96s  real time: 111.01s
After:  sys time: 1715.90s (-2.34%)  real time: 109.51s (-1.35%)

All cases are more or less faster, and there is no regression even under
extremely heavy global memory pressure.

Redis / Valkey bench:
=====================
The test machine is an ARM64 VM with 1536M memory and 12 cores, Redis is
set to use 2500M memory, and ZRAM swap size is set to 5G:

Testing with: redis-benchmark -r 2000000 -n 2000000 -d 1024 -c 12 -P 32 -t get

            no BGSAVE                 with BGSAVE
Before:     487576.06 RPS             280016.02 RPS
After:      487541.76 RPS (-0.01%)    300155.32 RPS (+7.19%)

Testing with: redis-benchmark -r 2500000 -n 2500000 -d 1024 -c 12 -P 32 -t get

            no BGSAVE                 with BGSAVE
Before:     466789.59 RPS             281213.92 RPS
After:      466402.89 RPS (-0.08%)    298411.84 RPS (+6.12%)

With BGSAVE enabled, most Redis memory will have a swap count > 1, so the
swap cache is heavily in use. We can see about a 6% performance gain. No
BGSAVE is very slightly slower (<0.1%) due to the higher memory pressure
of the co-existence of swap_map and swap table. This will be optimized
into a net gain, with up to a 20% gain in the BGSAVE case, in the
following phases.

HDD swap is also ~40% faster with usemem because we removed an old
contention workaround.

This patch (of 15):

Swap table is the new swap cache.
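The core data-structure change can be pictured with a few lines of standalone C (illustrative only: the names, the cluster size and the flat cluster array are simplifications, and the real lookup runs under RCU and the cluster lock): the swap cache becomes a per-cluster array indexed by the offset within the cluster, so a lookup is two array indexings instead of a tree walk.

  #include <stdio.h>

  #define CLUSTER_SLOTS 512	/* swap slots per cluster, illustrative */

  struct cluster {
  	void *table[CLUSTER_SLOTS];	/* NULL, a folio pointer, or a shadow */
  };

  struct swap_device {
  	struct cluster *clusters;
  };

  /* Model of a swap cache lookup: offset -> cluster -> slot. */
  static void *swap_cache_lookup(struct swap_device *si, unsigned long offset)
  {
  	struct cluster *ci = &si->clusters[offset / CLUSTER_SLOTS];

  	return ci->table[offset % CLUSTER_SLOTS];
  }

  int main(void)
  {
  	static struct cluster clusters[4];
  	struct swap_device si = { .clusters = clusters };
  	int folio = 42;			/* stand-in for a struct folio */

  	clusters[1].table[7] = &folio;	/* "add to swap cache" at offset 519 */
  	printf("offset 519: %s\n", swap_cache_lookup(&si, 519) ? "folio" : "empty");
  	printf("offset 520: %s\n", swap_cache_lookup(&si, 520) ? "folio" : "empty");
  	return 0;
  }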
[chrisl@kernel.org: move swap table document, redo swap table size sentence] Link: https://lkml.kernel.org/r/CACePvbXjaUyzB_9RSSSgR6BNvz+L9anvn0vcNf_J0jD7-4Yy6Q@mail.gmail.com Link: https://lkml.kernel.org/r/20250916160100.31545-1-ryncsn@gmail.com Link: https://lore.kernel.org/linux-mm/20250514201729.48420-1-ryncsn@gmail.com/ [3] Link: https://lkml.kernel.org/r/20250916160100.31545-2-ryncsn@gmail.com Link: https://lore.kernel.org/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com [1] Link: https://github.com/ryncsn/linux/tree/kasong/devel/swap-table [2] Signed-off-by: Chris Li Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: kernel test robot Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- Documentation/mm/index.rst | 1 + Documentation/mm/swap-table.rst | 69 +++++++++++++++++++++++++++++++++ MAINTAINERS | 19 +++++++++ 3 files changed, 89 insertions(+) create mode 100644 Documentation/mm/swap-table.rst diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst index 31d2ac306438..e7ac730468d6 100644 --- a/Documentation/mm/index.rst +++ b/Documentation/mm/index.rst @@ -23,6 +23,7 @@ see the :doc:`admin guide <../admin-guide/mm/index>`. highmem page_reclaim swap + swap-table page_cache shmfs oom diff --git a/Documentation/mm/swap-table.rst b/Documentation/mm/swap-table.rst new file mode 100644 index 000000000000..da10bb7a0dc3 --- /dev/null +++ b/Documentation/mm/swap-table.rst @@ -0,0 +1,69 @@ +.. SPDX-License-Identifier: GPL-2.0 + +:Author: Chris Li , Kairui Song + +========== +Swap Table +========== + +Swap table implements swap cache as a per-cluster swap cache value array. + +Swap Entry +---------- + +A swap entry contains the information required to serve the anonymous page +fault. + +Swap entry is encoded as two parts: swap type and swap offset. + +The swap type indicates which swap device to use. +The swap offset is the offset of the swap file to read the page data from. + +Swap Cache +---------- + +Swap cache is a map to look up folios using swap entry as the key. The result +value can have three possible types depending on which stage of this swap entry +was in. + +1. NULL: This swap entry is not used. + +2. folio: A folio has been allocated and bound to this swap entry. This is + the transient state of swap out or swap in. The folio data can be in + the folio or swap file, or both. + +3. shadow: The shadow contains the working set information of the swapped + out folio. This is the normal state for a swapped out page. + +Swap Table Internals +-------------------- + +The previous swap cache is implemented by XArray. The XArray is a tree +structure. Each lookup will go through multiple nodes. Can we do better? + +Notice that most of the time when we look up the swap cache, we are either +in a swap in or swap out path. We should already have the swap cluster, +which contains the swap entry. + +If we have a per-cluster array to store swap cache value in the cluster. +Swap cache lookup within the cluster can be a very simple array lookup. + +We give such a per-cluster swap cache value array a name: the swap table. + +A swap table is an array of pointers. Each pointer is the same size as a +PTE. 
The size of a swap table for one swap cluster typically matches a PTE +page table, which is one page on modern 64-bit systems. + +With swap table, swap cache lookup can achieve great locality, simpler, +and faster. + +Locking +------- + +Swap table modification requires taking the cluster lock. If a folio +is being added to or removed from the swap table, the folio must be +locked prior to the cluster lock. After adding or removing is done, the +folio shall be unlocked. + +Swap table lookup is protected by RCU and atomic read. If the lookup +returns a folio, the user must lock the folio before use. diff --git a/MAINTAINERS b/MAINTAINERS index 1809d6d70d1e..4bee80d14c1f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13829,6 +13829,25 @@ F: mm/ F: tools/mm/ F: tools/testing/selftests/mm/ +MEMORY MANAGEMENT - SWAP +M: Andrew Morton +R: Kemeng Shi +R: Kairui Song +R: Nhat Pham +R: Baoquan He +R: Barry Song +R: Chris Li +L: linux-mm@kvack.org +S: Maintained +F: include/linux/swap.h +F: include/linux/swapfile.h +F: include/linux/swapops.h +F: mm/page_io.c +F: mm/swap.c +F: mm/swap.h +F: mm/swap_state.c +F: mm/swapfile.c + MEMORY TECHNOLOGY DEVICES (MTD) M: Miquel Raynal M: Richard Weinberger -- Gitee From bc40c05a566f3fc10c177de04240639c96f30a62 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:47 +0800 Subject: [PATCH 470/484] mm, swap: use unified helper for swap cache look up commit f28124617f34153b34d7716eaff084e25a2d71fa upstream Conflicts: minor, also convert memcg-v1 migration to use new API. Backport-reason: Swap Table: phase 1 The swap cache lookup helper swap_cache_get_folio currently does readahead updates as well, so callers that are not doing swapin from any VMA or mapping are forced to reuse filemap helpers instead, and have to access the swap cache space directly. So decouple readahead update with swap cache lookup. Move the readahead update part into a standalone helper. Let the caller call the readahead update helper if they do readahead. And convert all swap cache lookups to use swap_cache_get_folio. After this commit, there are only three special cases for accessing swap cache space now: huge memory splitting, migration, and shmem replacing, because they need to lock the XArray. The following commits will wrap their accesses to the swap cache too, with special helpers. And worth noting, currently dropbehind is not supported for anon folio, and we will never see a dropbehind folio in swap cache. The unified helper can be updated later to handle that. While at it, add proper kernedoc for touched helpers. No functional change. 
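The new calling convention can be illustrated with a self-contained model (dummy types and a toy cache, not the kernel implementation): swap_cache_get_folio() only looks up, and the readahead statistics update becomes a separate, opt-in call made only by the swapin paths.

  #include <stdio.h>
  #include <stddef.h>

  struct folio { int readahead; };
  struct vm_area_struct { long readahead_hits; };

  static struct folio swap_cache[16];	/* toy swap cache, always populated */

  /* Lookup only: no statistics side effects. */
  static struct folio *swap_cache_get_folio(unsigned long offset)
  {
  	return offset < 16 ? &swap_cache[offset] : NULL;
  }

  /* Separate, explicit readahead accounting for the swapin paths. */
  static void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma)
  {
  	if (folio->readahead) {
  		folio->readahead = 0;
  		if (vma)
  			vma->readahead_hits++;
  	}
  }

  int main(void)
  {
  	struct vm_area_struct vma = { 0 };
  	struct folio *folio;

  	swap_cache[3].readahead = 1;

  	/* Fault path: look up, then account the readahead hit. */
  	folio = swap_cache_get_folio(3);
  	if (folio)
  		swap_update_readahead(folio, &vma);

  	/* Reclaim/swapoff-style path: plain lookup, no accounting. */
  	folio = swap_cache_get_folio(5);
  	printf("hits=%ld lookup2=%s\n", vma.readahead_hits, folio ? "found" : "none");
  	return 0;
  }

The real helpers take the swap entry and the faulting address, but the shape of the split is the same.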
Link: https://lkml.kernel.org/r/20250916160100.31545-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Reviewed-by: Barry Song Acked-by: David Hildenbrand Acked-by: Chris Li Acked-by: Nhat Pham Suggested-by: Chris Li Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memcontrol.c | 9 +--- mm/memory.c | 6 ++- mm/mincore.c | 3 +- mm/shmem.c | 4 +- mm/swap.h | 13 ++++-- mm/swap_state.c | 109 +++++++++++++++++++++++++----------------------- mm/swapfile.c | 11 +++-- 7 files changed, 81 insertions(+), 74 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f8568e8d47b7..1ccbde08bff8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7786,11 +7786,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, if (non_swap_entry(ent)) return NULL; - /* - * Because swap_cache_get_folio() updates some statistics counter, - * we call find_get_page() with swapper_space directly. - */ - page = find_get_page(swap_address_space(ent), swap_cache_index(ent)); + page = folio_file_page(swap_cache_get_folio(ent), swp_offset(ent)); entry->val = ent.val; return page; @@ -7830,8 +7826,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, if (IS_ENABLED(CONFIG_SWAP) && shmem_mapping(mapping)) { si = get_swap_device(entry); if (si) { - folio = filemap_get_folio(swap_address_space(entry), - swap_cache_index(entry)); + folio = swap_cache_get_folio(entry); put_swap_device(si); } } diff --git a/mm/memory.c b/mm/memory.c index cd7e8375803e..09f26c92dc07 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4148,9 +4148,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - folio = swap_cache_get_folio(entry, vma, vmf->address); - if (folio) + folio = swap_cache_get_folio(entry); + if (folio) { + swap_update_readahead(folio, vma, vmf->address); page = folio_file_page(folio, swp_offset(entry)); + } swapcache = folio; if (!folio) { diff --git a/mm/mincore.c b/mm/mincore.c index c99e5c645ea0..0e9031fe4927 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -73,8 +73,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem) if (!si) return 0; } - folio = filemap_get_entry(swap_address_space(entry), - swap_cache_index(entry)); + folio = swap_cache_get_folio(entry); if (shmem) put_swap_device(si); /* The swap cache space contains either folio, shadow or NULL */ diff --git a/mm/shmem.c b/mm/shmem.c index 8080a0abc793..fc3d7bed86bb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2260,7 +2260,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* Look it up and read it in.. 
*/ - folio = swap_cache_get_folio(swap, NULL, 0); + folio = swap_cache_get_folio(swap); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ @@ -2291,6 +2291,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + } else { + swap_update_readahead(folio, NULL, 0); } if (order > folio_order(folio)) { diff --git a/mm/swap.h b/mm/swap.h index 60f4cfd7cbe4..47350cfed300 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -59,8 +59,7 @@ void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); -struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr); +struct folio *swap_cache_get_folio(swp_entry_t entry); struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); @@ -71,6 +70,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); +void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr); static inline unsigned int folio_swap_flags(struct folio *folio) { @@ -161,12 +162,16 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } +static inline void swap_update_readahead(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr) +{ +} + static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { } -static inline struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr) +static inline struct folio *swap_cache_get_folio(swp_entry_t entry) { return NULL; } diff --git a/mm/swap_state.c b/mm/swap_state.c index fbd37375b66a..658629799efa 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -70,6 +70,27 @@ void show_swap_cache_info(void) printk("Total swap = %lukB\n", K(total_swap_pages)); } +/** + * swap_cache_get_folio - Looks up a folio in the swap cache. + * @entry: swap entry used for the lookup. + * + * A found folio will be returned unlocked and with its refcount increased. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * Return: Returns the found folio on success, NULL otherwise. The caller + * must lock and check if the folio still matches the swap entry before + * use. + */ +struct folio *swap_cache_get_folio(swp_entry_t entry) +{ + struct folio *folio = filemap_get_folio(swap_address_space(entry), + swap_cache_index(entry)); + if (IS_ERR(folio)) + return NULL; + return folio; +} + void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); @@ -278,55 +299,43 @@ static inline bool swap_use_vma_readahead(void) return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } -/* - * Lookup a swap entry in the swap cache. A found folio will be returned - * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the folio - * lock before returning. - * - * Caller must lock the swap device or hold a reference to keep it valid. 
+/** + * swap_update_readahead - Update the readahead statistics of VMA or globally. + * @folio: the swap cache folio that just got hit. + * @vma: the VMA that should be updated, could be NULL for global update. + * @addr: the addr that triggered the swapin, ignored if @vma is NULL. */ -struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr) +void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr) { - struct folio *folio; - - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); - if (!IS_ERR(folio)) { - bool vma_ra = swap_use_vma_readahead(); - bool readahead; + bool readahead, vma_ra = swap_use_vma_readahead(); - /* - * At the moment, we don't support PG_readahead for anon THP - * so let's bail out rather than confusing the readahead stat. - */ - if (unlikely(folio_test_large(folio))) - return folio; - - readahead = folio_test_clear_readahead(folio); - if (vma && vma_ra) { - unsigned long ra_val; - int win, hits; - - ra_val = GET_SWAP_RA_VAL(vma); - win = SWAP_RA_WIN(ra_val); - hits = SWAP_RA_HITS(ra_val); - if (readahead) - hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); - atomic_long_set(&vma->swap_readahead_info, - SWAP_RA_VAL(addr, win, hits)); - } - - if (readahead) { - count_vm_event(SWAP_RA_HIT); - if (!vma || !vma_ra) - atomic_inc(&swapin_readahead_hits); - } - } else { - folio = NULL; + /* + * At the moment, we don't support PG_readahead for anon THP + * so let's bail out rather than confusing the readahead stat. + */ + if (unlikely(folio_test_large(folio))) + return; + + readahead = folio_test_clear_readahead(folio); + if (vma && vma_ra) { + unsigned long ra_val; + int win, hits; + + ra_val = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); } - return folio; + if (readahead) { + count_vm_event(SWAP_RA_HIT); + if (!vma || !vma_ra) + atomic_inc(&swapin_readahead_hits); + } } struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -343,14 +352,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, for (;;) { int err; - /* - * First check the swap cache. Since this is normally - * called after swap_cache_get_folio() failed, re-calling - * that would confuse statistics. 
- */ - folio = filemap_get_folio(swap_address_space(entry), - swap_cache_index(entry)); - if (!IS_ERR(folio)) + + /* Check the swap cache in case the folio is already there */ + folio = swap_cache_get_folio(entry); + if (folio) goto got_folio; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 5c2a6e75848d..89caf3468d86 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -214,15 +214,14 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); - struct address_space *address_space = swap_address_space(entry); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; bool need_reclaim; again: - folio = filemap_get_folio(address_space, swap_cache_index(entry)); - if (IS_ERR(folio)) + folio = swap_cache_get_folio(entry); + if (!folio) return 0; nr_pages = folio_nr_pages(folio); @@ -2158,7 +2157,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); pte = NULL; - folio = swap_cache_get_folio(entry, vma, addr); + folio = swap_cache_get_folio(entry); if (!folio) { struct vm_fault vmf = { .vma = vma, @@ -2384,8 +2383,8 @@ static int try_to_unuse(unsigned int type) (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); - if (IS_ERR(folio)) + folio = swap_cache_get_folio(entry); + if (!folio) continue; /* -- Gitee From 3f411588ac783aa3bf49cb15b0bda86980229dd4 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:48 +0800 Subject: [PATCH 471/484] mm, swap: fix swap cache index error when retrying reclaim commit a733d8de7f1ccbf093ce2fde424616e529073876 upstream Conflicts: none Backport-reason: Swap Table: phase 1 The allocator will reclaim cached slots while scanning. Currently, it will try again if reclaim found a folio that is already removed from the swap cache due to a race. But the following lookup will be using the wrong index. It won't cause any OOB issue since the swap cache index is truncated upon lookup, but it may lead to reclaiming of an irrelevant folio. This should not cause a measurable issue, but we should fix it. Link: https://lkml.kernel.org/r/20250916160100.31545-4-ryncsn@gmail.com Fixes: fae859550531 ("mm, swap: avoid reclaiming irrelevant swap cache") Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Acked-by: Nhat Pham Acked-by: Chris Li Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swapfile.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 89caf3468d86..33e8ecdaa607 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -213,7 +213,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { - swp_entry_t entry = swp_entry(si->type, offset); + const swp_entry_t entry = swp_entry(si->type, offset); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; @@ -241,13 +241,13 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * Offset could point to the middle of a large folio, or folio * may no longer point to the expected offset before it's locked. 
*/ - entry = folio->swap; - if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) { + if (offset < swp_offset(folio->swap) || + offset >= swp_offset(folio->swap) + nr_pages) { folio_unlock(folio); folio_put(folio); goto again; } - offset = swp_offset(entry); + offset = swp_offset(folio->swap); need_reclaim = ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || -- Gitee From c8b8db6fdc501fe1de69b8d2c805a03975e58f19 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:49 +0800 Subject: [PATCH 472/484] mm, swap: check page poison flag after locking it commit 3518b931df0cc57b1b97e5c3a307ad7d5fe17650 upstream Conflicts: none Backport-reason: Swap Table: phase 1 Instead of checking the poison flag only in the fast swap cache lookup path, always check the poison flags after locking a swap cache folio. There are two reasons to do so. The folio is unstable and could be removed from the swap cache anytime, so it's totally possible that the folio is no longer the backing folio of a swap entry, and could be an irrelevant poisoned folio. We might mistakenly kill a faulting process. And it's totally possible or even common for the slow swap in path (swapin_readahead) to bring in a cached folio. The cache folio could be poisoned, too. Only checking the poison flag in the fast path will miss such folios. The race window is tiny, so it's very unlikely to happen, though. While at it, also add a unlikely prefix. Link: https://lkml.kernel.org/r/20250916160100.31545-5-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: David Hildenbrand Acked-by: Nhat Pham Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 09f26c92dc07..f43e5ecbc4a4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4149,10 +4149,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; folio = swap_cache_get_folio(entry); - if (folio) { + if (folio) swap_update_readahead(folio, vma, vmf->address); - page = folio_file_page(folio, swp_offset(entry)); - } swapcache = folio; if (!folio) { @@ -4221,20 +4219,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); - page = folio_file_page(folio, swp_offset(entry)); - } else if (PageHWPoison(page)) { - /* - * hwpoisoned dirty swapcache pages are kept for killing - * owner processes (which may be unknown at hwpoison time) - */ - ret = VM_FAULT_HWPOISON; - goto out_release; } ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; + page = folio_file_page(folio, swp_offset(entry)); if (swapcache) { /* * Make sure folio_free_swap() or swapoff did not release the @@ -4247,6 +4238,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page_swap_entry(page).val != entry.val)) goto out_page; + if (unlikely(PageHWPoison(page))) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ + ret = VM_FAULT_HWPOISON; + goto out_page; + } + /* * KSM sometimes has to copy on read faults, for example, if * page->index of !PageKSM() pages would be nonlinear 
inside the -- Gitee From a67208622c49d09856a6c2fb1da4e03fb3c22754 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:50 +0800 Subject: [PATCH 473/484] mm, swap: always lock and check the swap cache folio before use commit ae38eb210590ed69fa4f70f29959df76a4bdc4da upstream Conflicts: none Backport-reason: Swap Table: phase 1 Swap cache lookup only increases the reference count of the returned folio. That's not enough to ensure a folio is stable in the swap cache, so the folio could be removed from the swap cache at any time. The caller should always lock and check the folio before using it. We have just documented this in kerneldoc, now introduce a helper for swap cache folio verification with proper sanity checks. Also, sanitize a few current users to use this convention and the new helper for easier debugging. They were not having observable problems yet, only trivial issues like wasted CPU cycles on swapoff or reclaiming. They would fail in some other way, but it is still better to always follow this convention to make things robust and make later commits easier to do. Link: https://lkml.kernel.org/r/20250916160100.31545-6-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: David Hildenbrand Acked-by: Chris Li Acked-by: Nhat Pham Suggested-by: Chris Li Reviewed-by: Barry Song Cc: Baolin Wang Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/memory.c | 3 +-- mm/swap.h | 27 +++++++++++++++++++++++++++ mm/swap_state.c | 7 +++++-- mm/swapfile.c | 10 ++++++++-- 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index f43e5ecbc4a4..08982ccb2e4b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4234,8 +4234,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache, we need to check that the page's swap has not * changed. */ - if (unlikely(!folio_test_swapcache(folio) || - page_swap_entry(page).val != entry.val)) + if (unlikely(!folio_matches_swap_entry(folio, entry))) goto out_page; if (unlikely(PageHWPoison(page))) { diff --git a/mm/swap.h b/mm/swap.h index 47350cfed300..78ea6705d3d2 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -49,6 +49,28 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; } +/** + * folio_matches_swap_entry - Check if a folio matches a given swap entry. + * @folio: The folio. + * @entry: The swap entry to check against. + * + * Context: The caller should have the folio locked to ensure it's stable + * and nothing will move it in or out of the swap cache. + * Return: true or false. 
+ */ +static inline bool folio_matches_swap_entry(const struct folio *folio, + swp_entry_t entry) +{ + swp_entry_t folio_entry = folio->swap; + long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + if (!folio_test_swapcache(folio)) + return false; + VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio_entry.val, nr_pages), folio); + return folio_entry.val == round_down(entry.val, nr_pages); +} + void show_swap_cache_info(void); void *get_shadow_from_swap_cache(swp_entry_t entry); int add_to_swap_cache(struct folio *folio, swp_entry_t entry, @@ -141,6 +163,11 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) return 0; } +static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) +{ + return false; +} + static inline void show_swap_cache_info(void) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 658629799efa..e0e5dc0b9ba1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -80,7 +80,7 @@ void show_swap_cache_info(void) * with reference count or locks. * Return: Returns the found folio on success, NULL otherwise. The caller * must lock and check if the folio still matches the swap entry before - * use. + * use (e.g. with folio_matches_swap_entry). */ struct folio *swap_cache_get_folio(swp_entry_t entry) { @@ -353,7 +353,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, for (;;) { int err; - /* Check the swap cache in case the folio is already there */ + /* + * Check the swap cache first, if a cached folio is found, + * return it unlocked. The caller will lock and check it. + */ folio = swap_cache_get_folio(entry); if (folio) goto got_folio; diff --git a/mm/swapfile.c b/mm/swapfile.c index 33e8ecdaa607..32668e8a4384 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -241,8 +241,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * Offset could point to the middle of a large folio, or folio * may no longer point to the expected offset before it's locked. */ - if (offset < swp_offset(folio->swap) || - offset >= swp_offset(folio->swap) + nr_pages) { + if (!folio_matches_swap_entry(folio, entry)) { folio_unlock(folio); folio_put(folio); goto again; @@ -2031,6 +2030,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, bool hwpoisoned = false; int ret = 1; + /* + * If the folio is removed from swap cache by others, continue to + * unuse other PTEs. try_to_unuse may try again if we missed this one. + */ + if (!folio_matches_swap_entry(folio, entry)) + return 0; + swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) -- Gitee From 2be7ee1e5087e3e7f36c7a1b9b5629ec76fc0ae2 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:51 +0800 Subject: [PATCH 474/484] mm, swap: rename and move some swap cluster definition and helpers commit 4522aed4fffbbd18ab3581d733d0572d45780d07 upstream Conflicts: trivial, blank space. Backport-reason: Swap Table: phase 1 No feature change, move cluster related definitions and helpers to mm/swap.h, also tidy up and add a "swap_" prefix for cluster lock/unlock helpers, so they can be used outside of swap files. And while at it, add kerneldoc. 
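For readers outside swapfile.c, the behaviour of the renamed helpers boils down to the following standalone sketch (compile with -pthread; a pthread mutex stands in for the cluster spinlock, and the cluster size and structures are simplified): map the swap offset to its cluster, lock that cluster, and hand the locked cluster back to the caller.

  #include <pthread.h>
  #include <stdio.h>

  #define SWAPFILE_CLUSTER 512	/* slots per cluster, illustrative */

  struct swap_cluster_info {
  	pthread_mutex_t lock;	/* stands in for the kernel spinlock */
  	unsigned int count;
  };

  struct swap_info {
  	struct swap_cluster_info *cluster_info;
  };

  static struct swap_cluster_info *swap_cluster_lock(struct swap_info *si,
  						    unsigned long offset)
  {
  	struct swap_cluster_info *ci =
  		&si->cluster_info[offset / SWAPFILE_CLUSTER];

  	pthread_mutex_lock(&ci->lock);
  	return ci;
  }

  static void swap_cluster_unlock(struct swap_cluster_info *ci)
  {
  	pthread_mutex_unlock(&ci->lock);
  }

  int main(void)
  {
  	static struct swap_cluster_info clusters[2] = {
  		{ PTHREAD_MUTEX_INITIALIZER, 0 },
  		{ PTHREAD_MUTEX_INITIALIZER, 0 },
  	};
  	struct swap_info si = { .cluster_info = clusters };
  	struct swap_cluster_info *ci;

  	ci = swap_cluster_lock(&si, 600);	/* offset 600 -> cluster 1 */
  	ci->count++;
  	swap_cluster_unlock(ci);
  	printf("cluster 1 count: %u\n", clusters[1].count);
  	return 0;
  }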
Link: https://lkml.kernel.org/r/20250916160100.31545-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Reviewed-by: Barry Song Acked-by: Chris Li Acked-by: David Hildenbrand Suggested-by: Chris Li Acked-by: Nhat Pham Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 34 --------------- mm/swap.h | 70 +++++++++++++++++++++++++++++++ mm/swapfile.c | 98 ++++++++++++++------------------------------ 3 files changed, 100 insertions(+), 102 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index c916ee478270..f1c6f34aa5ee 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,40 +235,6 @@ enum { /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ -/* - * We use this to track usage of a cluster. A cluster is a block of swap disk - * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All - * free clusters are organized into a list. We fetch an entry from the list to - * get a free cluster. - * - * The flags field determines if a cluster is free. This is - * protected by cluster lock. - */ -struct swap_cluster_info { - spinlock_t lock; /* - * Protect swap_cluster_info fields - * other than list, and swap_info_struct->swap_map - * elements corresponding to the swap cluster. - */ - u16 count; - u8 flags; - u8 order; - struct list_head list; -}; - -/* All on-list cluster must have a non-zero flag. */ -enum swap_cluster_flags { - CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ - CLUSTER_FLAG_FREE, - CLUSTER_FLAG_NONFULL, - CLUSTER_FLAG_FRAG, - /* Clusters with flags above are allocatable */ - CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, - CLUSTER_FLAG_FULL, - CLUSTER_FLAG_DISCARD, - CLUSTER_FLAG_MAX, -}; - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the diff --git a/mm/swap.h b/mm/swap.h index 78ea6705d3d2..b8074d6888a2 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -4,10 +4,80 @@ struct mempolicy; +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#define swap_entry_order(order) (order) +#else +#define SWAPFILE_CLUSTER 256 +#define swap_entry_order(order) 0 +#endif + +/* + * We use this to track usage of a cluster. A cluster is a block of swap disk + * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * + * The flags field determines if a cluster is free. This is + * protected by cluster lock. + */ +struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields + * other than list, and swap_info_struct->swap_map + * elements corresponding to the swap cluster. + */ + u16 count; + u8 flags; + u8 order; + struct list_head list; +}; + +/* All on-list cluster must have a non-zero flag. 
*/ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; + #ifdef CONFIG_SWAP #include /* for swp_offset */ #include /* for bio_end_io_t */ +static inline struct swap_cluster_info *swp_offset_cluster( + struct swap_info_struct *si, pgoff_t offset) +{ + return &si->cluster_info[offset / SWAPFILE_CLUSTER]; +} + +/** + * swap_cluster_lock - Lock and return the swap cluster of given offset. + * @si: swap device the cluster belongs to. + * @offset: the swap entry offset, pointing to a valid slot. + * + * Context: The caller must ensure the offset is in the valid range and + * protect the swap device with reference count or locks. + */ +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset) +{ + struct swap_cluster_info *ci = swp_offset_cluster(si, offset); + + spin_lock(&ci->lock); + return ci; +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ + spin_unlock(&ci->lock); +} + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; diff --git a/mm/swapfile.c b/mm/swapfile.c index 32668e8a4384..22266111b03a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -57,9 +57,6 @@ static void swap_entries_free(struct swap_info_struct *si, static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset); -static inline void unlock_cluster(struct swap_cluster_info *ci); DEFINE_SPINLOCK(swap_lock); EXPORT_SYMBOL(swap_lock); @@ -259,9 +256,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. 
*/ - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); need_reclaim = swap_only_has_cache(si, offset, nr_pages); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -386,19 +383,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -#ifdef CONFIG_THP_SWAP -#define SWAPFILE_CLUSTER HPAGE_PMD_NR - -#define swap_entry_order(order) (order) -#else -#define SWAPFILE_CLUSTER 256 - -/* - * Define swap_entry_order() as constant to let compiler to optimize - * out some code if !CONFIG_THP_SWAP - */ -#define swap_entry_order(order) 0 -#endif #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) @@ -426,34 +410,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } -static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - return &si->cluster_info[offset / SWAPFILE_CLUSTER]; -} - static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - struct swap_cluster_info *ci; - - ci = offset_to_cluster(si, offset); - spin_lock(&ci->lock); - - return ci; -} - -static inline void unlock_cluster(struct swap_cluster_info *ci) -{ - spin_unlock(&ci->lock); -} - static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) @@ -809,7 +771,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, } out: relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (si->flags & SWP_SOLIDSTATE) { this_cpu_write(percpu_swap_cluster.offset[order], next); this_cpu_write(percpu_swap_cluster.si[order], si); @@ -876,7 +838,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (ci->flags == CLUSTER_FLAG_NONE) relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (to_scan <= 0) break; } @@ -915,7 +877,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (offset == SWAP_ENTRY_INVALID) goto new_cluster; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) @@ -923,7 +885,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o found = alloc_swap_scan_cluster(si, ci, offset, order, usage); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } if (found) goto done; @@ -1197,7 +1159,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (!si || !offset || !get_swap_device_info(si)) return false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); @@ -1205,7 +1167,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (found) *entry = swp_entry(si->type, found); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } put_swap_device(si); @@ -1507,14 +1469,14 @@ static void swap_entries_put_cache(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, nr)) + ci = swap_cluster_lock(si, offset); + if 
(swap_only_has_cache(si, offset, nr)) { swap_entries_free(si, ci, entry, nr); - else { + } else { for (int i = 0; i < nr; i++, entry.val++) swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } - unlock_cluster(ci); + swap_cluster_unlock(ci); } static bool swap_entries_put_map(struct swap_info_struct *si, @@ -1532,7 +1494,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si, if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { goto locked_fallback; } @@ -1541,19 +1503,19 @@ static bool swap_entries_put_map(struct swap_info_struct *si, else for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster(ci); + swap_cluster_unlock(ci); + return has_cache; fallback: - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); locked_fallback: for (i = 0; i < nr; i++, entry.val++) { count = swap_entry_put_locked(si, ci, entry, 1); if (count == SWAP_HAS_CACHE) has_cache = true; } - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; - } /* @@ -1603,7 +1565,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != swp_offset_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1678,9 +1640,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster(ci); + swap_cluster_unlock(ci); return !!count; } @@ -1703,7 +1665,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1726,7 +1688,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return count; } EXPORT_SYMBOL(swp_swapcount); @@ -1742,7 +1704,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; @@ -1755,7 +1717,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return ret; } @@ -2689,8 +2651,8 @@ static void wait_for_allocation(struct swap_info_struct *si) BUG_ON(si->flags & SWP_WRITEOK); for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { - ci = lock_cluster(si, offset); - unlock_cluster(ci); + ci = swap_cluster_lock(si, offset); + swap_cluster_unlock(ci); } } @@ -3603,7 +3565,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3658,7 +3620,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return err; } @@ -3759,7 +3721,7 @@ int 
add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); @@ -3819,7 +3781,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) out_unlock_cont: spin_unlock(&si->cont_lock); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); put_swap_device(si); outer: if (page) -- Gitee From dd52ed27ce8455819c21be77939cadd9fafacd90 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:52 +0800 Subject: [PATCH 475/484] mm, swap: tidy up swap device and cluster info helpers commit 0fcf8ef4fdab8e5c91d1bce39c7fe6565974ffad upstream Conflicts: trivial Backport-reason: Swap Table: phase 1 swp_swap_info is the most commonly used helper for retrieving swap info. It has an internal check that may lead to a NULL return value, but almost none of its callers check the return value, making the internal check pointless. In fact, most of these callers already ensured the entry is valid and never expect a NULL value. Tidy this up and improve the function names. If the caller can make sure the swap entry/type is valid and the device is pinned, use the newly introduced __swap_entry_to_info/__swap_type_to_info instead. They have more debug sanity checks and lower overhead as they are inlined. Callers that may expect a NULL value should use swap_entry_to_info/swap_type_to_info instead. No feature change. The rearranged code should have had no effect, or it should have been hitting NULL de-ref bugs already. Only some new sanity checks are added so potential issues may show up in debug builds. The new helpers will be frequently used with the swap table later when working with swap cache folios. A locked swap cache folio ensures the entries are valid and stable so these helpers are very helpful.
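As a minimal illustration of the intended calling convention (the helper names below are the ones introduced here; the surrounding caller code is hypothetical):

    /* Entry known valid and device pinned (e.g. via a locked swap cache
     * folio or get_swap_device()): use the inlined variant, which only
     * adds debug sanity checks. */
    struct swap_info_struct *si = __swap_entry_to_info(folio->swap);

    /* Type may be invalid (e.g. when iterating over all possible types):
     * use the checking variant and handle a NULL return. */
    struct swap_info_struct *maybe_si = swap_type_to_info(type);
    if (!maybe_si)
            return -EINVAL;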
Link: https://lkml.kernel.org/r/20250916160100.31545-8-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Barry Song Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- include/linux/swap.h | 6 ------ mm/page_io.c | 12 ++++++------ mm/swap.h | 38 +++++++++++++++++++++++++++++++++----- mm/swap_state.c | 4 ++-- mm/swapfile.c | 38 +++++++++++++++++++------------------- 5 files changed, 60 insertions(+), 38 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f1c6f34aa5ee..ae651ce19638 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -491,7 +491,6 @@ extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); @@ -504,11 +503,6 @@ static inline void put_swap_device(struct swap_info_struct *si) } #else /* CONFIG_SWAP */ -static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return NULL; -} - static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; diff --git a/mm/page_io.c b/mm/page_io.c index 08c1dd91539e..e828d902c47f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -204,7 +204,7 @@ static bool is_folio_zero_filled(struct folio *folio) static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); swp_entry_t entry; unsigned int i; @@ -223,7 +223,7 @@ static void swap_zeromap_folio_set(struct folio *folio) static void swap_zeromap_folio_clear(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); swp_entry_t entry; unsigned int i; @@ -379,7 +379,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc) { struct swap_iocb *sio = NULL; - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); @@ -456,7 +456,7 @@ static void swap_writepage_bdev_async(struct folio *folio, int __swap_writepage(struct folio *folio, struct writeback_control *wbc) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); @@ -554,7 +554,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_iocb *sio = NULL; loff_t pos = 
swap_dev_pos(folio->swap); @@ -625,7 +625,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO; bool workingset = folio_test_workingset(folio); unsigned long pflags; diff --git a/mm/swap.h b/mm/swap.h index b8074d6888a2..32a4e67686cc 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -12,6 +12,8 @@ struct mempolicy; #define swap_entry_order(order) 0 #endif +extern struct swap_info_struct *swap_info[]; + /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All @@ -50,9 +52,29 @@ enum swap_cluster_flags { #include /* for swp_offset */ #include /* for bio_end_io_t */ -static inline struct swap_cluster_info *swp_offset_cluster( +/* + * Callers of all helpers below must ensure the entry, type, or offset is + * valid, and protect the swap device with reference count or locks. + */ +static inline struct swap_info_struct *__swap_type_to_info(int type) +{ + struct swap_info_struct *si; + + si = READ_ONCE(swap_info[type]); /* rcu_dereference() */ + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + return si; +} + +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return __swap_type_to_info(swp_type(entry)); +} + +static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + VM_WARN_ON_ONCE(offset >= si->max); return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } @@ -67,8 +89,9 @@ static inline struct swap_cluster_info *swp_offset_cluster( static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { - struct swap_cluster_info *ci = swp_offset_cluster(si, offset); + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ spin_lock(&ci->lock); return ci; } @@ -167,7 +190,7 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, static inline unsigned int folio_swap_flags(struct folio *folio) { - return swp_swap_info(folio->swap)->flags; + return __swap_entry_to_info(folio->swap)->flags; } /* @@ -178,7 +201,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *is_zeromap) { - struct swap_info_struct *sis = swp_swap_info(entry); + struct swap_info_struct *sis = __swap_entry_to_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + max_nr; bool first_bit; @@ -197,7 +220,7 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); int i; @@ -216,6 +239,11 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return NULL; +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } diff --git 
a/mm/swap_state.c b/mm/swap_state.c index e0e5dc0b9ba1..79b76a27d87f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -342,7 +342,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; @@ -568,7 +568,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; diff --git a/mm/swapfile.c b/mm/swapfile.c index 22266111b03a..d10ab8829cdd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL(swap_active_head); static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -125,14 +125,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .lock = INIT_LOCAL_LOCK(), }; -static struct swap_info_struct *swap_type_to_swap_info(int type) +/* May return NULL on invalid type, caller must check for NULL return */ +static struct swap_info_struct *swap_type_to_info(int type) { if (type >= MAX_SWAPFILES) return NULL; - return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } +/* May return NULL on invalid entry, caller must check for NULL return */ +static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) +{ + return swap_type_to_info(swp_type(entry)); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -343,7 +349,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) sector_t swap_folio_sector(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; @@ -1328,7 +1334,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) @@ -1443,7 +1449,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) @@ -1565,7 +1571,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != swp_offset_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1623,7 +1629,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); @@ -1855,7 
+1861,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; swp_entry_t entry = {0}; @@ -1936,7 +1942,7 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) @@ -2865,7 +2871,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; (si = swap_type_to_swap_info(type)); type++) { + for (type = 0; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2886,7 +2892,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) type = si->type + 1; ++(*pos); - for (; (si = swap_type_to_swap_info(type)); type++) { + for (; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; @@ -3556,7 +3562,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) unsigned char has_cache; int err, i; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { pr_err("%s%08lx\n", Bad_file, entry.val); return -EINVAL; @@ -3672,12 +3678,6 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) swap_entries_put_cache(si, entry, nr); } -struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return swap_type_to_swap_info(swp_type(entry)); -} -EXPORT_SYMBOL(swp_swap_info); - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's -- Gitee From 2e1d1543d883634fa8d95fa956c9ea8932ee8a02 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:53 +0800 Subject: [PATCH 476/484] mm, swap: cleanup swap cache API and add kerneldoc commit fd8d4f862f8c278fd1f5b61cef20056e88d8dfa5 upstream Conflicts: minor, and we are not using const folio here. Backport-reason: Swap Table: phase 1 In preparation for replacing the swap cache backend with the swap table, clean up and add proper kernel doc for all swap cache APIs. Now all swap cache APIs are well-defined with consistent names. No feature change, only renaming and documenting. 
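For reference, the renames performed below map the old swap cache helpers onto the new, consistently prefixed names:

    get_shadow_from_swap_cache()    -> swap_cache_get_shadow()
    add_to_swap_cache()             -> swap_cache_add_folio()
    __delete_from_swap_cache()      -> __swap_cache_del_folio()
    delete_from_swap_cache()        -> swap_cache_del_folio()
    clear_shadow_from_swap_cache()  -> swap_cache_clear_shadow()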
Link: https://lkml.kernel.org/r/20250916160100.31545-9-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/filemap.c | 2 +- mm/memory-failure.c | 2 +- mm/memory.c | 2 +- mm/shmem.c | 6 ++-- mm/swap.h | 50 ++++++++++++++------------ mm/swap_state.c | 86 ++++++++++++++++++++++++++++++++------------- mm/swapfile.c | 8 ++--- mm/vmscan.c | 2 +- mm/zswap.c | 2 +- 9 files changed, 102 insertions(+), 58 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 1aaac1c2ad2c..0fa274308012 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4371,7 +4371,7 @@ static void filemap_cachestat(struct address_space *mapping, * invalidation, so there might not be * a shadow in the swapcache (yet). */ - shadow = get_shadow_from_swap_cache(swp); + shadow = swap_cache_get_shadow(swp); if (!shadow) goto resched; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ebd7c2d1140f..1f5e208f5d96 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1179,7 +1179,7 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p) struct folio *folio = page_folio(p); int ret; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; folio_unlock(folio); diff --git a/mm/memory.c b/mm/memory.c index 08982ccb2e4b..50ee54c946dc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4185,7 +4185,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) mem_cgroup_swapin_uncharge_swap(entry, nr_pages); - shadow = get_shadow_from_swap_cache(entry); + shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(folio, shadow); diff --git a/mm/shmem.c b/mm/shmem.c index fc3d7bed86bb..14d46e8e07e4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1983,7 +1983,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, new->swap = entry; mem_cgroup_swapin_uncharge_swap(entry, nr_pages); - shadow = get_shadow_from_swap_cache(entry); + shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(new, shadow); folio_add_lru(new); @@ -2120,7 +2120,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); if (!skip_swapcache) - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) @@ -2372,7 +2372,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio->swap.val = 0; swapcache_clear(si, swap, nr_pages); } else { - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); diff --git a/mm/swap.h b/mm/swap.h index 32a4e67686cc..ce3a0413f0a0 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -151,7 +151,7 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) * and nothing will move it in or out of the swap cache. * Return: true or false. 
*/ -static inline bool folio_matches_swap_entry(const struct folio *folio, +static inline bool folio_matches_swap_entry(struct folio *folio, swp_entry_t entry) { swp_entry_t folio_entry = folio->swap; @@ -164,17 +164,29 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, return folio_entry.val == round_down(entry.val, nr_pages); } +/* + * All swap cache helpers below require the caller to ensure the swap entries + * used are valid and stablize the device by any of the following ways: + * - Hold a reference by get_swap_device(): this ensures a single entry is + * valid and increases the swap device's refcount. + * - Locking a folio in the swap cache: this ensures the folio's swap entries + * are valid and pinned, also implies reference to the device. + * - Locking anything referencing the swap entry: e.g. PTL that protects + * swap entries in the page table, similar to locking swap cache folio. + * - See the comment of get_swap_device() for more complex usage. + */ +struct folio *swap_cache_get_folio(swp_entry_t entry); +void *swap_cache_get_shadow(swp_entry_t entry); +int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + gfp_t gfp, void **shadow); +void swap_cache_del_folio(struct folio *folio); +void __swap_cache_del_folio(struct folio *folio, + swp_entry_t entry, void *shadow); +void swap_cache_clear_shadow(int type, unsigned long begin, + unsigned long end); + void show_swap_cache_info(void); -void *get_shadow_from_swap_cache(swp_entry_t entry); -int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp); -void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow); -void delete_from_swap_cache(struct folio *folio); -void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); -struct folio *swap_cache_get_folio(swp_entry_t entry); struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); @@ -301,28 +313,22 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry) return NULL; } -static inline void *get_shadow_from_swap_cache(swp_entry_t entry) +static inline void *swap_cache_get_shadow(swp_entry_t entry) { return NULL; } -static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp_mask, void **shadowp) -{ - return -1; -} - -static inline void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow) +static inline int swap_cache_add_folio(swp_entry_t entry, struct folio *folio, + gfp_t gfp, void **shadow) { + return -EINVAL; } -static inline void delete_from_swap_cache(struct folio *folio) +static inline void swap_cache_del_folio(struct folio *folio) { } -static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end) +static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry, void *shadow) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 79b76a27d87f..7400ee1d7f83 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -79,8 +79,8 @@ void show_swap_cache_info(void) * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. * Return: Returns the found folio on success, NULL otherwise. The caller - * must lock and check if the folio still matches the swap entry before - * use (e.g. with folio_matches_swap_entry). 
+ * must lock and check if the folio still matches the swap entry before + * use (e.g., folio_matches_swap_entry). */ struct folio *swap_cache_get_folio(swp_entry_t entry) { @@ -91,7 +91,15 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) return folio; } -void *get_shadow_from_swap_cache(swp_entry_t entry) +/** + * swap_cache_get_shadow - Looks up a shadow in the swap cache. + * @entry: swap entry used for the lookup. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * Return: Returns either NULL or an XA_VALUE (shadow). + */ +void *swap_cache_get_shadow(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swap_cache_index(entry); @@ -103,12 +111,21 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) return NULL; } -/* - * add_to_swap_cache resembles filemap_add_folio on swapper_space, - * but sets SwapCache flag and 'swap' instead of mapping and index. +/** + * swap_cache_add_folio - Add a folio into the swap cache. + * @folio: The folio to be added. + * @entry: The swap entry corresponding to the folio. + * @gfp: gfp_mask for XArray node allocation. + * @shadowp: If a shadow is found, return the shadow. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * The caller also needs to mark the corresponding swap_map slots with + * SWAP_HAS_CACHE to avoid race or conflict. + * Return: Returns 0 on success, error code otherwise. */ -int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp) +int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swap_cache_index(entry); @@ -157,12 +174,20 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, } EXPORT_SYMBOL(add_to_swap_cache); -/* - * This must be called only on folios that have - * been verified to be in the swap cache. +/** + * __swap_cache_del_folio - Removes a folio from the swap cache. + * @folio: The folio. + * @entry: The first swap entry that the folio corresponds to. + * @shadow: shadow value to be filled in the swap cache. + * + * Removes a folio from the swap cache and fills a shadow in place. + * This won't put the folio's refcount. The caller has to do that. + * + * Context: Caller must hold the xa_lock, ensure the folio is + * locked and in the swap cache, using the index of @entry. */ -void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow) +void __swap_cache_del_folio(struct folio *folio, + swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); int i; @@ -188,19 +213,23 @@ void __delete_from_swap_cache(struct folio *folio, __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } -/* - * This must be called only on folios that have - * been verified to be in the swap cache and locked. - * It will never put the folio into the free list, - * the caller has a reference on the folio. +/** + * swap_cache_del_folio - Removes a folio from the swap cache. + * @folio: The folio. + * + * Same as __swap_cache_del_folio, but handles lock and refcount. The + * caller must ensure the folio is either clean or has a swap count + * equal to zero, or it may cause data loss. + * + * Context: Caller must ensure the folio is locked and in the swap cache.
*/ -void delete_from_swap_cache(struct folio *folio) +void swap_cache_del_folio(struct folio *folio) { swp_entry_t entry = folio->swap; struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(folio, entry, NULL); + __swap_cache_del_folio(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); put_swap_folio(folio, entry); @@ -208,8 +237,17 @@ void delete_from_swap_cache(struct folio *folio) } EXPORT_SYMBOL(delete_from_swap_cache); -void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end) +/** + * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. + * @type: Indicates the swap device. + * @begin: Beginning offset of the range. + * @end: Ending offset of the range. + * + * Context: Caller must ensure the range is valid and hold a reference to + * the swap device. + */ +void swap_cache_clear_shadow(int type, unsigned long begin, + unsigned long end) { unsigned long curr = begin; void *old; @@ -400,7 +438,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto put_and_return; /* - * We might race against __delete_from_swap_cache(), and + * We might race against __swap_cache_del_folio(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE @@ -419,7 +457,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (swap_cache_add_folio(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry, 1); diff --git a/mm/swapfile.c b/mm/swapfile.c index d10ab8829cdd..de3ca2a2e16c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -268,7 +268,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, if (!need_reclaim) goto out_unlock; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_set_dirty(folio); ret = nr_pages; out_unlock: @@ -1118,7 +1118,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - clear_shadow_from_swap_cache(si->type, begin, end); + swap_cache_clear_shadow(si->type, begin, end); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1317,7 +1317,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) * TODO: this could cause a theoretical memory reclaim * deadlock in the swap out path. 
*/ - if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) + if (swap_cache_add_folio(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) goto out_free; return 0; @@ -1787,7 +1787,7 @@ bool folio_free_swap(struct folio *folio) if (folio_swapped(folio)) return false; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_set_dirty(folio); return true; } diff --git a/mm/vmscan.c b/mm/vmscan.c index f0ff3362b345..285542e62d61 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -860,7 +860,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __delete_from_swap_cache(folio, swap, shadow); + __swap_cache_del_folio(folio, swap, shadow); mem_cgroup_swapout(folio, swap); xa_unlock_irq(&mapping->i_pages); put_swap_folio(folio, swap); diff --git a/mm/zswap.c b/mm/zswap.c index b1161b6e929d..56b393ae846c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1150,7 +1150,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, out: if (ret && ret != -EEXIST) { - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_unlock(folio); } folio_put(folio); -- Gitee From 02b95f715a785ae677a1edd424c4cdc68d3df3bd Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:54 +0800 Subject: [PATCH 477/484] mm/shmem, swap: remove redundant error handling for replacing folio commit 84a7a9823e73fe3c0adcc4780fa7a091981048ef upstream Conflicts: trivial, out of sync comment. Backport-reason: Swap Table: phase 1 Shmem may replace a folio in the swap cache if the cached one doesn't fit the swapin's GFP zone. When doing so, shmem has already double checked that the swap cache folio is locked, still has the swap cache flag set, and contains the wanted swap entry. So it is impossible to fail due to an XArray mismatch. There is even a comment for that. Delete the defensive error handling path, and add a WARN_ON instead: if that happened, something has broken the basic principle of how the swap cache works, we should catch and fix that. Link: https://lkml.kernel.org/r/20250916160100.31545-10-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/shmem.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 14d46e8e07e4..c9bda879e93b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2059,34 +2059,17 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); for (i = 0; i < nr_pages; i++) { - void *item = xas_load(&xas); - - if (item != old) { - error = -ENOENT; - break; - } - - xas_store(&xas, new); + WARN_ON_ONCE(xas_store(&xas, new) != old); xas_next(&xas); } - if (!error) { - mem_cgroup_replace_folio(old, new); - shmem_update_stats(new, nr_pages); - shmem_update_stats(old, -nr_pages); - } + + mem_cgroup_replace_folio(old, new); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); xa_unlock_irq(&swap_mapping->i_pages); - if (unlikely(error)) { - /* - * Is this possible? 
I think not, now that our callers check - * both PageSwapCache and page_private after getting page lock; - * but be defensive. Reverse old to newpage for clear and free. - */ - old = new; - } else { - folio_add_lru(new); - *foliop = new; - } + folio_add_lru(new); + *foliop = new; folio_clear_swapcache(old); old->private = NULL; -- Gitee From 0d13542041451c8abe5f05de54f3b6525c476c5b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:55 +0800 Subject: [PATCH 478/484] mm, swap: wrap swap cache replacement with a helper commit 094dc8b059b11eef0888f43eeb0f3ac53ade5c87 upstream Conflicts: trivial, huge page split is a bit different. Backport-reason: Swap Table: phase 1 There are currently three swap cache users that are trying to replace an existing folio with a new one: huge memory splitting, migration, and shmem replacement. What they are doing is quite similar. Introduce a common helper for this. In later commits, this can be easily switched to use the swap table by updating this helper. The newly added helper also makes the swap cache API better defined, and make debugging easier by adding a few more debug checks. Migration and shmem replace are meant to clone the folio, including content, swap entry value, and flags. And splitting will adjust each sub folio's swap entry according to order, which could be non-uniform in the future. So document it clearly that it's the caller's responsibility to set up the new folio's swap entries and flags before calling the helper. The helper will just follow the new folio's entry value. This also prepares for replacing high-order folios in the swap cache. Currently, only splitting to order 0 is allowed for swap cache folios. Using the new helper, we can handle high-order folio splitting better. 
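As a condensed sketch of the calling convention (mirroring the shmem replacement path in the diff below, with error handling and statistics updates omitted):

    /* The caller clones the swap state into the new folio first... */
    new->swap = entry;
    folio_set_swapcache(new);

    /* ...then the helper rewrites all covered swap cache slots under
     * the swap address_space lock. */
    xa_lock_irq(&swap_mapping->i_pages);
    __swap_cache_replace_folio(old, new);
    xa_unlock_irq(&swap_mapping->i_pages);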
Link: https://lkml.kernel.org/r/20250916160100.31545-11-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Acked-by: Chris Li Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/huge_memory.c | 3 +-- mm/migrate.c | 11 +++-------- mm/shmem.c | 11 ++--------- mm/swap.h | 5 +++++ mm/swap_state.c | 33 +++++++++++++++++++++++++++++++++ 5 files changed, 44 insertions(+), 19 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9f029e1d32fd..18519059c4de 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3072,8 +3072,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, __xa_store(&folio->mapping->i_pages, tail->index, tail, 0); } else if (swap_cache) { - __xa_store(&swap_cache->i_pages, offset + i, - tail, 0); + __swap_cache_replace_folio(folio, tail); } } diff --git a/mm/migrate.c b/mm/migrate.c index 7c49ea21b245..43aab132cc3f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -406,7 +406,6 @@ int folio_migrate_mapping(struct address_space *mapping, int dirty; int expected_count = folio_expected_refs(mapping, folio) + extra_count; long nr = folio_nr_pages(folio); - long entries, i; if (!mapping) { /* Anonymous page without mapping */ @@ -459,9 +458,6 @@ int folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapcache(folio)) { folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); - entries = nr; - } else { - entries = 1; } /* Move dirty while page refs frozen and newpage not yet exposed */ @@ -471,11 +467,10 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_dirty(newfolio); } - /* Swap cache still stores N entries instead of a high-order entry */ - for (i = 0; i < entries; i++) { + if (folio_test_swapcache(folio)) + __swap_cache_replace_folio(folio, newfolio); + else xas_store(&xas, newfolio); - xas_next(&xas); - } /* * Drop cache reference from old page by unfreezing diff --git a/mm/shmem.c b/mm/shmem.c index c9bda879e93b..4bb49f18e7ce 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2024,10 +2024,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; struct address_space *swap_mapping = swap_address_space(entry); - pgoff_t swap_index = swap_cache_index(entry); - XA_STATE(xas, &swap_mapping->i_pages, swap_index); int nr_pages = folio_nr_pages(old); - int error = 0, i; + int error = 0; /* * We have arrived here because our zones are constrained, so don't @@ -2056,13 +2054,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); - for (i = 0; i < nr_pages; i++) { - WARN_ON_ONCE(xas_store(&xas, new) != old); - xas_next(&xas); - } - + __swap_cache_replace_folio(old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); diff --git a/mm/swap.h b/mm/swap.h index ce3a0413f0a0..a2de4c5df244 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -182,6 +182,7 @@ int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void swap_cache_del_folio(struct folio *folio); void __swap_cache_del_folio(struct folio 
*folio, swp_entry_t entry, void *shadow); +void __swap_cache_replace_folio(struct folio *old, struct folio *new); void swap_cache_clear_shadow(int type, unsigned long begin, unsigned long end); @@ -332,6 +333,10 @@ static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry { } +static inline void __swap_cache_replace_folio(struct folio *old, struct folio *new) +{ +} + static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; diff --git a/mm/swap_state.c b/mm/swap_state.c index 7400ee1d7f83..9d2cfbae96bb 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -237,6 +237,39 @@ void swap_cache_del_folio(struct folio *folio) } EXPORT_SYMBOL(delete_from_swap_cache); +/** + * __swap_cache_replace_folio - Replace a folio in the swap cache. + * @old: The old folio to be replaced. + * @new: The new folio. + * + * Replace an existing folio in the swap cache with a new folio. The + * caller is responsible for setting up the new folio's flag and swap + * entries. Replacement will take the new folio's swap entry value as + * the starting offset to override all slots covered by the new folio. + * + * Context: Caller must ensure both folios are locked, also lock the + * swap address_space that holds the old folio to avoid races. + */ +void __swap_cache_replace_folio(struct folio *old, struct folio *new) +{ + swp_entry_t entry = new->swap; + unsigned long nr_pages = folio_nr_pages(new); + unsigned long offset = swap_cache_index(entry); + unsigned long end = offset + nr_pages; + + XA_STATE(xas, &swap_address_space(entry)->i_pages, offset); + + VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); + VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); + VM_WARN_ON_ONCE(!entry.val); + + /* Swap cache still stores N entries instead of a high-order entry */ + do { + WARN_ON_ONCE(xas_store(&xas, new) != old); + xas_next(&xas); + } while (++offset < end); +} + /** * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. * @type: Indicates the swap device. -- Gitee From 195a82e4beb81cfdff508645252b95f00181767d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:56 +0800 Subject: [PATCH 479/484] mm, swap: use the swap table for the swap cache and switch API commit 8578e0c00dcf0c58fbc32d4904ecaf8e802a6590 upstream Conflicts: major, CONFIG_EMM_RAMDISK_SWAP is now deprecated, also we are not using const folio, also remove a few symbol export. Backport-reason: Swap Table: phase 1 Introduce basic swap table infrastructures, which are now just a fixed-sized flat array inside each swap cluster, with access wrappers. Each cluster contains a swap table of 512 entries. Each table entry is an opaque atomic long. It could be in 3 types: a shadow type (XA_VALUE), a folio type (pointer), or NULL. In this first step, it only supports storing a folio or shadow, and it is a drop-in replacement for the current swap cache. Convert all swap cache users to use the new sets of APIs. Chris Li has been suggesting using a new infrastructure for swap cache for better performance, and that idea combined well with the swap table as the new backing structure. Now the lock contention range is reduced to 2M clusters, which is much smaller than the 64M address_space. And we can also drop the multiple address_space design. All the internal works are done with swap_cache_get_* helpers. Swap cache lookup is still lock-less like before, and the helper's contexts are same with original swap cache helpers. 
They still require a pin on the swap device to prevent the backing data from being freed. Swap cache updates are now protected by the swap cluster lock instead of the XArray lock. This is mostly handled internally, but new __swap_cache_* helpers require the caller to lock the cluster. So, a few new cluster access and locking helpers are also introduced. A fully cluster-based unified swap table can be implemented on top of this to take care of all count tracking and synchronization work, with dynamic allocation. It should reduce the memory usage while making the performance even better. Link: https://lkml.kernel.org/r/20250916160100.31545-12-ryncsn@gmail.com Co-developed-by: Chris Li Signed-off-by: Chris Li Signed-off-by: Kairui Song Acked-by: Chris Li Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- MAINTAINERS | 1 + include/linux/swap.h | 2 - mm/huge_memory.c | 15 +-- mm/migrate.c | 19 ++- mm/shmem.c | 8 +- mm/swap.h | 156 ++++++++++++++++------ mm/swap_state.c | 300 ++++++++++++++++++------------------------- mm/swap_table.h | 97 ++++++++++++++ mm/swapfile.c | 100 +++++++++++---- mm/vmscan.c | 20 ++- 10 files changed, 460 insertions(+), 258 deletions(-) create mode 100644 mm/swap_table.h diff --git a/MAINTAINERS b/MAINTAINERS index 4bee80d14c1f..531f906b4d09 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13845,6 +13845,7 @@ F: include/linux/swapops.h F: mm/page_io.c F: mm/swap.c F: mm/swap.h +F: mm/swap_table.h F: mm/swap_state.c F: mm/swapfile.c diff --git a/include/linux/swap.h b/include/linux/swap.h index ae651ce19638..a295ed9f7960 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -492,8 +492,6 @@ extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct backing_dev_info; -extern int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages); -extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 18519059c4de..955c1a08548d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3034,7 +3034,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct folio *folio = page_folio(page); struct page *head = &folio->page; struct lruvec *lruvec; - struct address_space *swap_cache = NULL; + struct swap_cluster_info *ci = NULL; unsigned long offset = 0; int i, nr_dropped = 0; unsigned int new_nr = 1 << new_order; @@ -3044,11 +3044,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* complete memcg works before add pages to LRU */ split_page_memcg(head, order, new_order); - if (folio_test_anon(folio) && folio_test_swapcache(folio)) { - offset = swap_cache_index(folio->swap); - swap_cache = swap_address_space(folio->swap); - xa_lock(&swap_cache->i_pages); - } + if (folio_test_anon(folio) && folio_test_swapcache(folio)) + ci = swap_cluster_get_and_lock(folio); /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = folio_lruvec_lock(folio); @@ -3071,8 +3068,8 @@ static void __split_huge_page(struct page *page, 
struct list_head *list, } else if (!folio_test_anon(folio)) { __xa_store(&folio->mapping->i_pages, tail->index, tail, 0); - } else if (swap_cache) { - __swap_cache_replace_folio(folio, tail); + } else if (ci) { + __swap_cache_replace_folio(ci, folio, tail); } } @@ -3093,7 +3090,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Additional pin to swap cache */ if (folio_test_swapcache(folio)) { folio_ref_add(folio, 1 + new_nr); - xa_unlock(&swap_cache->i_pages); + swap_cluster_unlock(ci); } else { folio_ref_inc(folio); } diff --git a/mm/migrate.c b/mm/migrate.c index 43aab132cc3f..b5b9174af5ce 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -402,6 +402,7 @@ int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; int expected_count = folio_expected_refs(mapping, folio) + extra_count; @@ -435,9 +436,16 @@ int folio_migrate_mapping(struct address_space *mapping, oldzone = folio_zone(folio); newzone = folio_zone(newfolio); - xas_lock_irq(&xas); + if (folio_test_swapcache(folio)) + ci = swap_cluster_get_and_lock_irq(folio); + else + xas_lock_irq(&xas); + if (!folio_ref_freeze(folio, expected_count)) { - xas_unlock_irq(&xas); + if (ci) + swap_cluster_unlock_irq(ci); + else + xas_unlock_irq(&xas); return -EAGAIN; } @@ -468,7 +476,7 @@ int folio_migrate_mapping(struct address_space *mapping, } if (folio_test_swapcache(folio)) - __swap_cache_replace_folio(folio, newfolio); + __swap_cache_replace_folio(ci, folio, newfolio); else xas_store(&xas, newfolio); @@ -479,8 +487,11 @@ int folio_migrate_mapping(struct address_space *mapping, */ folio_ref_unfreeze(folio, expected_count - nr); - xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ + if (ci) + swap_cluster_unlock(ci); + else + xas_unlock(&xas); /* * If moved to a different zone then also account diff --git a/mm/shmem.c b/mm/shmem.c index 4bb49f18e7ce..4b648c00d3df 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2021,9 +2021,9 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { + struct swap_cluster_info *ci; struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; - struct address_space *swap_mapping = swap_address_space(entry); int nr_pages = folio_nr_pages(old); int error = 0; @@ -2054,12 +2054,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - xa_lock_irq(&swap_mapping->i_pages); - __swap_cache_replace_folio(old, new); + ci = swap_cluster_get_and_lock_irq(old); + __swap_cache_replace_folio(ci, old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); - xa_unlock_irq(&swap_mapping->i_pages); + swap_cluster_unlock_irq(ci); folio_add_lru(new); *foliop = new; diff --git a/mm/swap.h b/mm/swap.h index a2de4c5df244..12b4390006eb 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,7 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +#include /* for atomic_long_t */ struct mempolicy; #ifdef CONFIG_THP_SWAP @@ -32,6 +33,7 @@ struct swap_cluster_info { u16 count; u8 flags; u8 order; + atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */ struct list_head list; }; @@ -52,6 +54,11 @@ enum swap_cluster_flags { #include /* for swp_offset */ #include /* for bio_end_io_t */ 
+static inline unsigned int swp_cluster_offset(swp_entry_t entry) +{ + return swp_offset(entry) % SWAPFILE_CLUSTER; +} + /* * Callers of all helpers below must ensure the entry, type, or offset is * valid, and protect the swap device with reference count or locks. @@ -78,6 +85,25 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } +static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry) +{ + return __swap_offset_to_cluster(__swap_entry_to_info(entry), + swp_offset(entry)); +} + +static __always_inline struct swap_cluster_info *__swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset, bool irq) +{ + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + if (irq) + spin_lock_irq(&ci->lock); + else + spin_lock(&ci->lock); + return ci; +} + /** * swap_cluster_lock - Lock and return the swap cluster of given offset. * @si: swap device the cluster belongs to. @@ -89,11 +115,49 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { - struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + return __swap_cluster_lock(si, offset, false); +} - VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ - spin_lock(&ci->lock); - return ci; +static inline struct swap_cluster_info *__swap_cluster_get_and_lock( + struct folio *folio, bool irq) +{ + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + return __swap_cluster_lock(__swap_entry_to_info(folio->swap), + swp_offset(folio->swap), irq); +} + +/* + * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * This locks and returns the swap cluster that contains a folio's swap + * entries. The swap entries of a folio are always in one single cluster. + * The folio has to be locked so its swap entries won't change and the + * cluster won't be freed. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. + */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, false); +} + +/* + * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * Same as swap_cluster_get_and_lock but also disable IRQ. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. 
+ */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, true); } static inline void swap_cluster_unlock(struct swap_cluster_info *ci) @@ -101,6 +165,11 @@ static inline void swap_cluster_unlock(struct swap_cluster_info *ci) spin_unlock(&ci->lock); } +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ + spin_unlock_irq(&ci->lock); +} + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -120,10 +189,11 @@ int __swap_writepage(struct folio *folio, struct writeback_control *wbc); #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) #define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) -extern struct address_space *swapper_spaces[]; -#define swap_address_space(entry) \ - (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ - >> SWAP_ADDRESS_SPACE_SHIFT]) +extern struct address_space swap_space; +static inline struct address_space *swap_address_space(swp_entry_t entry) +{ + return &swap_space; +} /* * Return the swap device position of the swap entry. @@ -133,15 +203,6 @@ static inline loff_t swap_dev_pos(swp_entry_t entry) return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } -/* - * Return the swap cache index of the swap entry. - */ -static inline pgoff_t swap_cache_index(swp_entry_t entry) -{ - BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK); - return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; -} - /** * folio_matches_swap_entry - Check if a folio matches a given swap entry. * @folio: The folio. @@ -177,14 +238,14 @@ static inline bool folio_matches_swap_entry(struct folio *folio, */ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadow); +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); void swap_cache_del_folio(struct folio *folio); -void __swap_cache_del_folio(struct folio *folio, - swp_entry_t entry, void *shadow); -void __swap_cache_replace_folio(struct folio *old, struct folio *new); -void swap_cache_clear_shadow(int type, unsigned long begin, - unsigned long end); +/* Below helpers require the caller to lock and pass in the swap cluster. 
*/ +void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow); +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new); +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents); void show_swap_cache_info(void); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); @@ -252,6 +313,32 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, pgoff_t offset, bool irq) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + struct folio *folio) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + struct folio *folio) +{ + return NULL; +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ +} + +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ +} + static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) { return NULL; @@ -269,12 +356,7 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } -static inline pgoff_t swap_cache_index(swp_entry_t entry) -{ - return 0; -} - -static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) +static inline bool folio_matches_swap_entry(struct folio *folio, swp_entry_t entry) { return false; } @@ -319,21 +401,21 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline int swap_cache_add_folio(swp_entry_t entry, struct folio *folio, - gfp_t gfp, void **shadow) +static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) { - return -EINVAL; } static inline void swap_cache_del_folio(struct folio *folio) { } -static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry, void *shadow) +static inline void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow) { } -static inline void __swap_cache_replace_folio(struct folio *old, struct folio *new) +static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { } @@ -367,8 +449,10 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) */ static inline pgoff_t folio_index(struct folio *folio) { +#ifdef CONFIG_SWAP if (unlikely(folio_test_swapcache(folio))) - return swap_cache_index(folio->swap); + return swp_offset(folio->swap); +#endif return folio->index; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 9d2cfbae96bb..7be1db3d2def 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -23,6 +23,7 @@ #include #include #include "internal.h" +#include "swap_table.h" #include "swap.h" /* @@ -37,8 +38,10 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +struct address_space swap_space __read_mostly = { + .a_ops = &swap_aops, +}; + static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_ORDER_CEILING 5 @@ -84,11 +87,20 @@ void show_swap_cache_info(void) */ struct folio *swap_cache_get_folio(swp_entry_t entry) { - struct folio *folio = filemap_get_folio(swap_address_space(entry), - swap_cache_index(entry)); - if (IS_ERR(folio)) - return NULL; 
- return folio; + unsigned long swp_tb; + struct folio *folio; + + for (;;) { + swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (!swp_tb_is_folio(swp_tb)) + return NULL; + folio = swp_tb_to_folio(swp_tb); + if (likely(folio_try_get(folio))) + return folio; + } + + return NULL; } /** @@ -101,13 +113,13 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) */ void *swap_cache_get_shadow(swp_entry_t entry) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - void *shadow; + unsigned long swp_tb; + + swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_is_shadow(swp_tb)) + return swp_tb_to_shadow(swp_tb); - shadow = xa_load(&address_space->i_pages, idx); - if (xa_is_value(shadow)) - return shadow; return NULL; } @@ -120,62 +132,48 @@ void *swap_cache_get_shadow(swp_entry_t entry) * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. - * The caller also needs to mark the corresponding swap_map slots with - * SWAP_HAS_CACHE to avoid race or conflict. - * Return: Returns 0 on success, error code otherwise. + * The caller also needs to update the corresponding swap_map slots with + * SWAP_HAS_CACHE bit to avoid race or conflict. */ -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp) +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); - unsigned long i, nr = folio_nr_pages(folio); - void *old; - - xas_set_update(&xas, workingset_update_node); - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + void *shadow = NULL; + unsigned long old_tb, new_tb; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + + new_tb = folio_to_swp_tb(folio); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + do { + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + if (swp_tb_is_shadow(old_tb)) + shadow = swp_tb_to_shadow(old_tb); + } while (++ci_off < ci_end); - folio_ref_add(folio, nr); + folio_ref_add(folio, nr_pages); folio_set_swapcache(folio); folio->swap = entry; + swap_cluster_unlock(ci); - do { - xas_lock_irq(&xas); - xas_create_range(&xas); - if (xas_error(&xas)) - goto unlock; - for (i = 0; i < nr; i++) { - VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); - old = xas_load(&xas); - if (xa_is_value(old)) { - if (shadowp) - *shadowp = old; - } - xas_store(&xas, folio); - xas_next(&xas); - } - address_space->nrpages += nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); -unlock: - xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); - - if (!xas_error(&xas)) - return 0; + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); - 
folio_clear_swapcache(folio); - folio_ref_sub(folio, nr); - return xas_error(&xas); + if (shadowp) + *shadowp = shadow; } -EXPORT_SYMBOL(add_to_swap_cache); /** * __swap_cache_del_folio - Removes a folio from the swap cache. + * @ci: The locked swap cluster. * @folio: The folio. * @entry: The first swap entry that the folio corresponds to. * @shadow: shadow value to be filled in the swap cache. @@ -183,34 +181,36 @@ EXPORT_SYMBOL(add_to_swap_cache); * Removes a folio from the swap cache and fills a shadow in place. * This won't put the folio's refcount. The caller has to do that. * - * Context: Caller must hold the xa_lock, ensure the folio is - * locked and in the swap cache, using the index of @entry. + * Context: Caller must ensure the folio is locked and in the swap cache + * using the index of @entry, and lock the cluster that holds the entries. */ -void __swap_cache_del_folio(struct folio *folio, +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { - struct address_space *address_space = swap_address_space(entry); - int i; - long nr = folio_nr_pages(folio); - pgoff_t idx = swap_cache_index(entry); - XA_STATE(xas, &address_space->i_pages, idx); - - xas_set_update(&xas, workingset_update_node); - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - - for (i = 0; i < nr; i++) { - void *entry = xas_store(&xas, shadow); - VM_BUG_ON_PAGE(entry != folio, entry); - xas_next(&xas); - } + unsigned long old_tb, new_tb; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + + new_tb = shadow_swp_to_tb(shadow); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + do { + /* If shadow is NULL, we sets an empty shadow */ + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || + swp_tb_to_folio(old_tb) != folio); + } while (++ci_off < ci_end); + folio->swap.val = 0; folio_clear_swapcache(folio); - address_space->nrpages -= nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); + node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); } /** @@ -225,20 +225,20 @@ void __swap_cache_del_folio(struct folio *folio, */ void swap_cache_del_folio(struct folio *folio) { + struct swap_cluster_info *ci; swp_entry_t entry = folio->swap; - struct address_space *address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - __swap_cache_del_folio(folio, entry, NULL); - xa_unlock_irq(&address_space->i_pages); + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + __swap_cache_del_folio(ci, folio, entry, NULL); + swap_cluster_unlock(ci); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } -EXPORT_SYMBOL(delete_from_swap_cache); /** * __swap_cache_replace_folio - Replace a folio in the swap cache. + * @ci: The locked swap cluster. * @old: The old folio to be replaced. * @new: The new folio. * @@ -247,65 +247,62 @@ EXPORT_SYMBOL(delete_from_swap_cache); * entries. 
Replacement will take the new folio's swap entry value as * the starting offset to override all slots covered by the new folio. * - * Context: Caller must ensure both folios are locked, also lock the - * swap address_space that holds the old folio to avoid races. + * Context: Caller must ensure both folios are locked, and lock the + * cluster that holds the old folio to be replaced. */ -void __swap_cache_replace_folio(struct folio *old, struct folio *new) +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { swp_entry_t entry = new->swap; unsigned long nr_pages = folio_nr_pages(new); - unsigned long offset = swap_cache_index(entry); - unsigned long end = offset + nr_pages; - - XA_STATE(xas, &swap_address_space(entry)->i_pages, offset); + unsigned int ci_off = swp_cluster_offset(entry); + unsigned int ci_end = ci_off + nr_pages; + unsigned long old_tb, new_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ + new_tb = folio_to_swp_tb(new); do { - WARN_ON_ONCE(xas_store(&xas, new) != old); - xas_next(&xas); - } while (++offset < end); + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + } while (++ci_off < ci_end); + + /* + * If the old folio is partially replaced (e.g., splitting a large + * folio, the old folio is shrunk, and new split sub folios replace + * the shrunk part), ensure the new folio doesn't overlap it. + */ + if (IS_ENABLED(CONFIG_DEBUG_VM) && + folio_order(old) != folio_order(new)) { + ci_off = swp_cluster_offset(old->swap); + ci_end = ci_off + folio_nr_pages(old); + while (ci_off++ < ci_end) + WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); + } } /** * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. - * @type: Indicates the swap device. - * @begin: Beginning offset of the range. - * @end: Ending offset of the range. + * @entry: The starting index entry. + * @nr_ents: How many slots need to be cleared. * - * Context: Caller must ensure the range is valid and hold a reference to - * the swap device. + * Context: Caller must ensure the range is valid, all in one single cluster, + * not occupied by any folio, and lock the cluster. 
*/ -void swap_cache_clear_shadow(int type, unsigned long begin, - unsigned long end) +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) { - unsigned long curr = begin; - void *old; - - for (;;) { - swp_entry_t entry = swp_entry(type, curr); - unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; - struct address_space *address_space = swap_address_space(entry); - XA_STATE(xas, &address_space->i_pages, index); - - xas_set_update(&xas, workingset_update_node); - - xa_lock_irq(&address_space->i_pages); - xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { - if (!xa_is_value(old)) - continue; - xas_store(&xas, NULL); - } - xa_unlock_irq(&address_space->i_pages); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry), ci_end; + unsigned long old; - /* search the next swapcache until we meet end */ - curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); - if (curr > end) - break; - } + ci_end = ci_off + nr_ents; + do { + old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); + WARN_ON_ONCE(swp_tb_is_folio(old)); + } while (++ci_off < ci_end); } /* @@ -489,10 +486,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (swap_cache_add_folio(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) - goto fail_unlock; - + swap_cache_add_folio(new_folio, entry, &shadow); mem_cgroup_swapin_uncharge_swap(entry, 1); if (shadow) @@ -685,46 +679,6 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, return folio; } -int init_swap_address_space(struct swap_info_struct *si, unsigned long nr_pages) -{ - struct address_space *spaces, *space; - unsigned int i, nr, type; - - type = si->type; - nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); - spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); - if (!spaces) - return -ENOMEM; - for (i = 0; i < nr; i++) { - space = spaces + i; - xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); - atomic_set(&space->i_mmap_writable, 0); - space->a_ops = &swap_aops; - /* swap cache doesn't use writeback related tags */ - mapping_set_no_writeback_tags(space); -#ifdef CONFIG_EMM_RAMDISK_SWAP - if (si->bdev && bdev_ramdisk(si->bdev)) - set_bit(AS_RAM_SWAP, &space->flags); -#endif - } - nr_swapper_spaces[type] = nr; - swapper_spaces[type] = spaces; - - return 0; -} - -void exit_swap_address_space(unsigned int type) -{ - int i; - struct address_space *spaces = swapper_spaces[type]; - - for (i = 0; i < nr_swapper_spaces[type]; i++) - VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); - kvfree(spaces); - nr_swapper_spaces[type] = 0; - swapper_spaces[type] = NULL; -} - static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, unsigned long *end) { @@ -898,7 +852,7 @@ static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; -static int __init swap_init_sysfs(void) +static int __init swap_init(void) { int err; struct kobject *swap_kobj; @@ -913,11 +867,13 @@ static int __init swap_init_sysfs(void) pr_err("failed to register swap group\n"); goto delete_obj; } + /* Swap cache writeback is LRU based, no tags for it */ + mapping_set_no_writeback_tags(&swap_space); return 0; delete_obj: kobject_put(swap_kobj); return err; } -subsys_initcall(swap_init_sysfs); +subsys_initcall(swap_init); #endif diff --git a/mm/swap_table.h b/mm/swap_table.h new file mode 
100644 index 000000000000..e1f7cc009701 --- /dev/null +++ b/mm/swap_table.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_SWAP_TABLE_H +#define _MM_SWAP_TABLE_H + +#include "swap.h" + +/* + * A swap table entry represents the status of a swap slot on a swap + * (physical or virtual) device. The swap table in each cluster is a + * 1:1 map of the swap slots in this cluster. + * + * Each swap table entry could be a pointer (folio), a XA_VALUE + * (shadow), or NULL. + */ + +/* + * Helpers for casting one type of info into a swap table entry. + */ +static inline unsigned long null_to_swp_tb(void) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t)); + return 0; +} + +static inline unsigned long folio_to_swp_tb(struct folio *folio) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); + return (unsigned long)folio; +} + +static inline unsigned long shadow_swp_to_tb(void *shadow) +{ + BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != + BITS_PER_BYTE * sizeof(unsigned long)); + VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); + return (unsigned long)shadow; +} + +/* + * Helpers for swap table entry type checking. + */ +static inline bool swp_tb_is_null(unsigned long swp_tb) +{ + return !swp_tb; +} + +static inline bool swp_tb_is_folio(unsigned long swp_tb) +{ + return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); +} + +static inline bool swp_tb_is_shadow(unsigned long swp_tb) +{ + return xa_is_value((void *)swp_tb); +} + +/* + * Helpers for retrieving info from swap table. + */ +static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_folio(swp_tb)); + return (void *)swp_tb; +} + +static inline void *swp_tb_to_shadow(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); + return (void *)swp_tb; +} + +/* + * Helpers for accessing or modifying the swap table of a cluster, + * the swap cluster must be locked. 
+ */ +static inline void __swap_table_set(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + atomic_long_set(&ci->table[off], swp_tb); +} + +static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + /* Ordering is guaranteed by cluster lock, relax */ + return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); +} + +static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + return atomic_long_read(&ci->table[off]); +} +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index de3ca2a2e16c..1c0a6f314ee6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,6 +45,7 @@ #include #include #include +#include "swap_table.h" #include "internal.h" #include "swap.h" @@ -422,6 +423,34 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } +static int swap_cluster_alloc_table(struct swap_cluster_info *ci) +{ + WARN_ON(ci->table); + ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL); + if (!ci->table) + return -ENOMEM; + return 0; +} + +static void swap_cluster_free_table(struct swap_cluster_info *ci) +{ + unsigned int ci_off; + unsigned long swp_tb; + + if (!ci->table) + return; + + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) { + swp_tb = __swap_table_get(ci, ci_off); + if (!swp_tb_is_null(swp_tb)) + pr_err_once("swap: unclean swap space on swapoff: 0x%lx", + swp_tb); + } + + kfree(ci->table); + ci->table = NULL; +} + static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) @@ -704,6 +733,26 @@ static bool cluster_scan_range(struct swap_info_struct *si, return true; } +/* + * Currently, the swap table is not used for count tracking, just + * do a sanity check here to ensure nothing leaked, so the swap + * table should be empty upon freeing. + */ +static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, + unsigned int start, unsigned int nr) +{ + unsigned int ci_off = start % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr; + unsigned long swp_tb; + + if (IS_ENABLED(CONFIG_DEBUG_VM)) { + do { + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + } while (++ci_off < ci_end); + } +} + static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) @@ -723,6 +772,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster ci->order = order; memset(si->swap_map + start, usage, nr_pages); + swap_cluster_assert_table_empty(ci, start, nr_pages); swap_range_alloc(si, nr_pages); ci->count += nr_pages; @@ -1118,7 +1168,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - swap_cache_clear_shadow(si->type, begin, end); + __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1309,16 +1359,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (!entry.val) return -ENOMEM; - /* - * XArray node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. 
__GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - if (swap_cache_add_folio(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) - goto out_free; + swap_cache_add_folio(folio, entry, NULL); return 0; @@ -1583,6 +1624,7 @@ static void swap_entries_free(struct swap_info_struct *si, mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + swap_cluster_assert_table_empty(ci, offset, nr_pages); if (!ci->count) free_cluster(si, ci); @@ -2662,6 +2704,18 @@ static void wait_for_allocation(struct swap_info_struct *si) } } +static void free_cluster_info(struct swap_cluster_info *cluster_info, + unsigned long maxpages) +{ + int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + + if (!cluster_info) + return; + for (i = 0; i < nr_clusters; i++) + swap_cluster_free_table(&cluster_info[i]); + kvfree(cluster_info); +} + /* * Called after swap device's reference count is dead, so * neither scan nor allocation will use it. @@ -2796,12 +2850,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; cluster_info = p->cluster_info; + free_cluster_info(cluster_info, p->max); + p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -2812,10 +2867,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); - exit_swap_address_space(p->type); inode = mapping->host; @@ -3203,8 +3256,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (!cluster_info) goto err; - for (i = 0; i < nr_clusters; i++) + for (i = 0; i < nr_clusters; i++) { spin_lock_init(&cluster_info[i].lock); + if (swap_cluster_alloc_table(&cluster_info[i])) + goto err_free; + } if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc(sizeof(*si->global_cluster), @@ -3265,9 +3321,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } return cluster_info; - err_free: - kvfree(cluster_info); + free_cluster_info(cluster_info, maxpages); err: return ERR_PTR(err); } @@ -3452,13 +3507,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = init_swap_address_space(si, maxpages); - if (error) - goto bad_swap_unlock_inode; - error = zswap_swapon(si->type, maxpages); if (error) - goto free_swap_address_space; + goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this @@ -3493,8 +3544,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto out; free_swap_zswap: zswap_swapoff(si->type); -free_swap_address_space: - exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: @@ -3509,7 +3558,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); + if (cluster_info) + free_cluster_info(cluster_info, maxpages); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) diff --git a/mm/vmscan.c b/mm/vmscan.c index 285542e62d61..8c3dc3fa55be 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -814,13 +814,18 @@ static int __remove_mapping(struct address_space *mapping, 
struct folio *folio, { int refcount; void *shadow = NULL; + struct swap_cluster_info *ci; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + ci = swap_cluster_get_and_lock_irq(folio); + } else { spin_lock(&mapping->host->i_lock); - xa_lock_irq(&mapping->i_pages); + xa_lock_irq(&mapping->i_pages); + } + /* * The non racy check for a busy folio. * @@ -860,9 +865,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __swap_cache_del_folio(folio, swap, shadow); + __swap_cache_del_folio(ci, folio, swap, shadow); mem_cgroup_swapout(folio, swap); - xa_unlock_irq(&mapping->i_pages); + swap_cluster_unlock_irq(ci); put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -900,9 +905,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, return 1; cannot_free: - xa_unlock_irq(&mapping->i_pages); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + swap_cluster_unlock_irq(ci); + } else { + xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); + } return 0; } -- Gitee From 9a05863960b62c248e941eb825c58ce989938aef Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:57 +0800 Subject: [PATCH 480/484] mm, swap: mark swap address space ro and add context debug check commit 8b47299a411a178d572aaac31ff7ab33a8bd27e2 upstream Conflicts: none Backport-reason: Swap Table: phase 1 Swap cache is now backed by swap table, and the address space is not holding any mutable data anymore. And swap cache is now protected by the swap cluster lock, instead of the XArray lock. All access to swap cache are wrapped by swap cache helpers. Locking is mostly handled internally by swap cache helpers, only a few __swap_cache_* helpers require the caller to lock the cluster by themselves. Worth noting that, unlike XArray, the cluster lock is not IRQ safe. The swap cache was very different compared to filemap, and now it's completely separated from filemap. Nothing wants to mark or change anything or do a writeback callback in IRQ. So explicitly document this and add a debug check to avoid further potential misuse. And mark the swap cache space as read-only to avoid any user wrongly mixing unexpected filemap helpers with swap cache. Link: https://lkml.kernel.org/r/20250916160100.31545-13-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.h | 12 +++++++++++- mm/swap_state.c | 3 ++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 12b4390006eb..291a785265f1 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -96,6 +96,16 @@ static __always_inline struct swap_cluster_info *__swap_cluster_lock( { struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + /* + * Nothing modifies swap cache in an IRQ context. All access to + * swap cache is wrapped by swap_cache_* helpers, and swap cache + * writeback is handled outside of IRQs. 
Swapin or swapout never + * occurs in IRQ, and neither does in-place split or replace. + * + * Besides, modifying swap cache requires synchronization with + * swap_map, which was never IRQ safe. + */ + VM_WARN_ON_ONCE(!in_task()); VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ if (irq) spin_lock_irq(&ci->lock); @@ -189,7 +199,7 @@ int __swap_writepage(struct folio *folio, struct writeback_control *wbc); #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) #define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) -extern struct address_space swap_space; +extern struct address_space swap_space __ro_after_init; static inline struct address_space *swap_address_space(swp_entry_t entry) { return &swap_space; diff --git a/mm/swap_state.c b/mm/swap_state.c index 7be1db3d2def..a5cf39f8849d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -38,7 +38,8 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space swap_space __read_mostly = { +/* Set swap_space as read only as swap cache is handled by swap table */ +struct address_space swap_space __ro_after_init = { .a_ops = &swap_aops, }; -- Gitee From e474faab52b5475bddd03b9b217559383d60d582 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:58 +0800 Subject: [PATCH 481/484] mm, swap: remove contention workaround for swap cache commit 685a17fbd35e66ae9b6440979b438caa2ae540cd upstream Conflicts: none Backport-reason: Swap Table: phase 1 Swap cluster setup will try to shuffle the clusters on initialization. It was helpful to avoid contention for the swap cache space. The cluster size (2M) was much smaller than each swap cache space (64M), so shuffling the cluster means the allocator will try to allocate swap slots that are in different swap cache spaces for each CPU, reducing the chance of two CPUs using the same swap cache space, and hence reducing the contention. Now, swap cache is managed by swap clusters, this shuffle is pointless. Just remove it, and clean up related macros. This also improves the HDD swap performance as shuffling IO is a bad idea for HDD, and now the shuffling is gone. 
Test have shown a ~40% performance gain for HDD [1]: Doing sequential swap in of 8G data using 8 processes with usemem, average of 3 test runs: Before: 1270.91 KB/s per process After: 1849.54 KB/s per process Link: https://lore.kernel.org/linux-mm/CAMgjq7AdauQ8=X0zeih2r21QoV=-WWj1hyBxLWRzq74n-C=-Ng@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20250916160100.31545-14-ryncsn@gmail.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202504241621.f27743ec-lkp@intel.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Barry Song Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.h | 4 ---- mm/swapfile.c | 32 ++++++++------------------------ mm/zswap.c | 7 +++++-- 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 291a785265f1..8dcb32e3e874 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -195,10 +195,6 @@ int swap_writepage(struct page *page, struct writeback_control *wbc); int __swap_writepage(struct folio *folio, struct writeback_control *wbc); /* linux/mm/swap_state.c */ -/* One swap address space for each 64M swap space */ -#define SWAP_ADDRESS_SPACE_SHIFT 14 -#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) -#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) extern struct address_space swap_space __ro_after_init; static inline struct address_space *swap_address_space(swp_entry_t entry) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 1c0a6f314ee6..d4aad7ab872e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3236,21 +3236,14 @@ static int setup_swap_map(struct swap_info_struct *si, return 0; } -#define SWAP_CLUSTER_INFO_COLS \ - DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) -#define SWAP_CLUSTER_SPACE_COLS \ - DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) -#define SWAP_CLUSTER_COLS \ - max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) - static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; - unsigned long i, j, idx; int err = -ENOMEM; + unsigned long i; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) @@ -3299,22 +3292,13 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, INIT_LIST_HEAD(&si->frag_clusters[i]); } - /* - * Reduce false cache line sharing between cluster_info and - * sharing same address space. 
- */ - for (j = 0; j < SWAP_CLUSTER_COLS; j++) { - for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { - struct swap_cluster_info *ci; - idx = i * SWAP_CLUSTER_COLS + j; - ci = cluster_info + idx; - if (idx >= nr_clusters) - continue; - if (ci->count) { - ci->flags = CLUSTER_FLAG_NONFULL; - list_add_tail(&ci->list, &si->nonfull_clusters[0]); - continue; - } + for (i = 0; i < nr_clusters; i++) { + struct swap_cluster_info *ci = &cluster_info[i]; + + if (ci->count) { + ci->flags = CLUSTER_FLAG_NONFULL; + list_add_tail(&ci->list, &si->nonfull_clusters[0]); + } else { ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } diff --git a/mm/zswap.c b/mm/zswap.c index 56b393ae846c..11cc0c684f04 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -233,10 +233,13 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +/* One swap address space for each 64M swap space */ +#define ZSWAP_ADDRESS_SPACE_SHIFT 14 +#define ZSWAP_ADDRESS_SPACE_PAGES (1 << ZSWAP_ADDRESS_SPACE_SHIFT) static inline struct xarray *swap_zswap_tree(swp_entry_t swp) { return &zswap_trees[swp_type(swp)][swp_offset(swp) - >> SWAP_ADDRESS_SPACE_SHIFT]; + >> ZSWAP_ADDRESS_SPACE_SHIFT]; } #define zswap_pool_debug(msg, p) \ @@ -1754,7 +1757,7 @@ int zswap_swapon(int type, unsigned long nr_pages) struct xarray *trees, *tree; unsigned int nr, i; - nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + nr = DIV_ROUND_UP(nr_pages, ZSWAP_ADDRESS_SPACE_PAGES); trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); -- Gitee From f0e155e1ff541f6e54a0756cabd94ed60ee895d3 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:59 +0800 Subject: [PATCH 482/484] mm, swap: implement dynamic allocation of swap table commit 07adc4cf1ecd316e7b6f4a142e5f5e96ce697e65 upstream Conflicts: none Backport-reason: Swap Table: phase 1 Now swap table is cluster based, which means free clusters can free its table since no one should modify it. There could be speculative readers, like swap cache look up, protect them by making them RCU protected. All swap table should be filled with null entries before free, so such readers will either see a NULL pointer or a null filled table being lazy freed. On allocation, allocate the table when a cluster is used by any order. This way, we can reduce the memory usage of large swap device significantly. This idea to dynamically release unused swap cluster data was initially suggested by Chris Li while proposing the cluster swap allocator and it suits the swap table idea very well. 
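For reference, the reader side is just the usual RCU publish/lookup pattern.
The sketch below is, in essence, the swap_table_get() helper this patch adds
(shown only to make the ordering explicit; all names come from the diff below,
nothing new is introduced):

    rcu_read_lock();
    /* Paired with rcu_assign_pointer() on the allocation / free side */
    table = rcu_dereference(ci->table);
    /* A freed cluster publishes NULL; a dying table is already null filled */
    swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
    rcu_read_unlock();

A speculative reader such as swap cache lookup still re-validates the result
afterwards (e.g. via folio_try_get()) before relying on it.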
Link: https://lkml.kernel.org/r/20250916160100.31545-15-ryncsn@gmail.com Co-developed-by: Chris Li Signed-off-by: Chris Li Signed-off-by: Kairui Song Suggested-by: Chris Li Reviewed-by: Barry Song Cc: Baolin Wang Cc: Baoquan He Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap.h | 2 +- mm/swap_state.c | 9 +-- mm/swap_table.h | 37 ++++++++- mm/swapfile.c | 197 +++++++++++++++++++++++++++++++++++++----------- 4 files changed, 194 insertions(+), 51 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 8dcb32e3e874..d6127148ff33 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -33,7 +33,7 @@ struct swap_cluster_info { u16 count; u8 flags; u8 order; - atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */ + atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ struct list_head list; }; diff --git a/mm/swap_state.c b/mm/swap_state.c index a5cf39f8849d..bbd4f373e47b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -92,8 +92,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) struct folio *folio; for (;;) { - swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), - swp_cluster_offset(entry)); + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); if (!swp_tb_is_folio(swp_tb)) return NULL; folio = swp_tb_to_folio(swp_tb); @@ -116,11 +116,10 @@ void *swap_cache_get_shadow(swp_entry_t entry) { unsigned long swp_tb; - swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), - swp_cluster_offset(entry)); + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); if (swp_tb_is_shadow(swp_tb)) return swp_tb_to_shadow(swp_tb); - return NULL; } diff --git a/mm/swap_table.h b/mm/swap_table.h index e1f7cc009701..52254e455304 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -2,8 +2,15 @@ #ifndef _MM_SWAP_TABLE_H #define _MM_SWAP_TABLE_H +#include +#include #include "swap.h" +/* A typical flat array in each cluster as swap table */ +struct swap_table { + atomic_long_t entries[SWAPFILE_CLUSTER]; +}; + /* * A swap table entry represents the status of a swap slot on a swap * (physical or virtual) device. 
The swap table in each cluster is a @@ -76,22 +83,46 @@ static inline void *swp_tb_to_shadow(unsigned long swp_tb) static inline void __swap_table_set(struct swap_cluster_info *ci, unsigned int off, unsigned long swp_tb) { + atomic_long_t *table = rcu_dereference_protected(ci->table, true); + + lockdep_assert_held(&ci->lock); VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); - atomic_long_set(&ci->table[off], swp_tb); + atomic_long_set(&table[off], swp_tb); } static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, unsigned int off, unsigned long swp_tb) { + atomic_long_t *table = rcu_dereference_protected(ci->table, true); + + lockdep_assert_held(&ci->lock); VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); /* Ordering is guaranteed by cluster lock, relax */ - return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); + return atomic_long_xchg_relaxed(&table[off], swp_tb); } static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, unsigned int off) { + atomic_long_t *table; + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); - return atomic_long_read(&ci->table[off]); + table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock)); + + return atomic_long_read(&table[off]); +} + +static inline unsigned long swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + atomic_long_t *table; + unsigned long swp_tb; + + rcu_read_lock(); + table = rcu_dereference(ci->table); + swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb(); + rcu_read_unlock(); + + return swp_tb; } #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index d4aad7ab872e..9faa6876dc7e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -58,6 +58,9 @@ static void swap_entries_free(struct swap_info_struct *si, static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); +static void move_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, struct list_head *list, + enum swap_cluster_flags new_flags); DEFINE_SPINLOCK(swap_lock); EXPORT_SYMBOL(swap_lock); @@ -106,6 +109,8 @@ static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; +static struct kmem_cache *swap_table_cachep; + static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); @@ -402,10 +407,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info) return info->flags == CLUSTER_FLAG_DISCARD; } +static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci) +{ + return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock)); +} + static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) { if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) return false; + if (!cluster_table_is_alloced(ci)) + return false; if (!order) return true; return cluster_is_empty(ci) || order == ci->order; @@ -423,32 +435,90 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static int swap_cluster_alloc_table(struct swap_cluster_info *ci) +static void swap_cluster_free_table(struct swap_cluster_info *ci) { - WARN_ON(ci->table); - ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL); - if (!ci->table) - return -ENOMEM; - return 0; + unsigned int ci_off; + struct swap_table *table; + + /* Only empty cluster's table is allow to be freed */ + lockdep_assert_held(&ci->lock); + VM_WARN_ON_ONCE(!cluster_is_empty(ci)); + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; 
ci_off++) + VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off))); + table = (void *)rcu_dereference_protected(ci->table, true); + rcu_assign_pointer(ci->table, NULL); + + kmem_cache_free(swap_table_cachep, table); } -static void swap_cluster_free_table(struct swap_cluster_info *ci) +/* + * Allocate swap table for one cluster. Attempt an atomic allocation first, + * then fallback to sleeping allocation. + */ +static struct swap_cluster_info * +swap_cluster_alloc_table(struct swap_info_struct *si, + struct swap_cluster_info *ci) { - unsigned int ci_off; - unsigned long swp_tb; + struct swap_table *table; - if (!ci->table) - return; + /* + * Only cluster isolation from the allocator does table allocation. + * Swap allocator uses percpu clusters and holds the local lock. + */ + lockdep_assert_held(&ci->lock); + lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); + + /* The cluster must be free and was just isolated from the free list. */ + VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); + + table = kmem_cache_zalloc(swap_table_cachep, + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (table) { + rcu_assign_pointer(ci->table, table); + return ci; + } + + /* + * Try a sleep allocation. Each isolated free cluster may cause + * a sleep allocation, but there is a limited number of them, so + * the potential recursive allocation is limited. + */ + spin_unlock(&ci->lock); + if (!(si->flags & SWP_SOLIDSTATE)) + spin_unlock(&si->global_cluster_lock); + local_unlock(&percpu_swap_cluster.lock); + + table = kmem_cache_zalloc(swap_table_cachep, + __GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); + + /* + * Back to atomic context. We might have migrated to a new CPU with a + * usable percpu cluster. But just keep using the isolated cluster to + * make things easier. Migration indicates a slight change of workload + * so using a new free cluster might not be a bad idea, and the worst + * could happen with ignoring the percpu cluster is fragmentation, + * which is acceptable since this fallback and race is rare. + */ + local_lock(&percpu_swap_cluster.lock); + if (!(si->flags & SWP_SOLIDSTATE)) + spin_lock(&si->global_cluster_lock); + spin_lock(&ci->lock); - for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) { - swp_tb = __swap_table_get(ci, ci_off); - if (!swp_tb_is_null(swp_tb)) - pr_err_once("swap: unclean swap space on swapoff: 0x%lx", - swp_tb); + /* Nothing except this helper should touch a dangling empty cluster. */ + if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { + if (table) + kmem_cache_free(swap_table_cachep, table); + return ci; } - kfree(ci->table); - ci->table = NULL; + if (!table) { + move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); + spin_unlock(&ci->lock); + return NULL; + } + + rcu_assign_pointer(ci->table, table); + return ci; } static void move_cluster(struct swap_info_struct *si, @@ -480,7 +550,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - lockdep_assert_held(&ci->lock); + swap_cluster_free_table(ci); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } @@ -495,15 +565,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info * this returns NULL for an non-empty list. 
*/ static struct swap_cluster_info *isolate_lock_cluster( - struct swap_info_struct *si, struct list_head *list) + struct swap_info_struct *si, struct list_head *list, int order) { - struct swap_cluster_info *ci, *ret = NULL; + struct swap_cluster_info *ci, *found = NULL; spin_lock(&si->lock); - - if (unlikely(!(si->flags & SWP_WRITEOK))) - goto out; - list_for_each_entry(ci, list, list) { if (!spin_trylock(&ci->lock)) continue; @@ -515,13 +581,19 @@ static struct swap_cluster_info *isolate_lock_cluster( list_del(&ci->list); ci->flags = CLUSTER_FLAG_NONE; - ret = ci; + found = ci; break; } -out: spin_unlock(&si->lock); - return ret; + if (found && !cluster_table_is_alloced(found)) { + /* Only an empty free cluster's swap table can be freed. */ + VM_WARN_ON_ONCE(list != &si->free_clusters); + VM_WARN_ON_ONCE(!cluster_is_empty(found)); + return swap_cluster_alloc_table(si, found); + } + + return found; } /* @@ -654,17 +726,27 @@ static void relocate_cluster(struct swap_info_struct *si, * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization. */ -static void inc_cluster_info_page(struct swap_info_struct *si, +static int inc_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; + struct swap_table *table; struct swap_cluster_info *ci; ci = cluster_info + idx; + if (!ci->table) { + table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL); + if (!table) + return -ENOMEM; + rcu_assign_pointer(ci->table, table); + } + ci->count++; VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); VM_BUG_ON(ci->flags); + + return 0; } static bool cluster_reclaim_range(struct swap_info_struct *si, @@ -846,7 +928,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, unsigned int found = SWAP_ENTRY_INVALID; do { - struct swap_cluster_info *ci = isolate_lock_cluster(si, list); + struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); unsigned long offset; if (!ci) @@ -871,7 +953,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { + while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; @@ -1012,6 +1094,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o done: if (!(si->flags & SWP_SOLIDSTATE)) spin_unlock(&si->global_cluster_lock); + return found; } @@ -1913,7 +1996,13 @@ swp_entry_t get_swap_page_of_type(int type) /* This is called for allocating swap entry, not cache */ if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { + /* + * Grab the local lock to be complaint + * with swap table allocation. 
+ */ + local_lock(&percpu_swap_cluster.lock); offset = cluster_alloc_swap_entry(si, 0, 1); + local_unlock(&percpu_swap_cluster.lock); if (offset) { entry = swp_entry(si->type, offset); atomic_long_dec(&nr_swap_pages); @@ -2707,12 +2796,21 @@ static void wait_for_allocation(struct swap_info_struct *si) static void free_cluster_info(struct swap_cluster_info *cluster_info, unsigned long maxpages) { + struct swap_cluster_info *ci; int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); if (!cluster_info) return; - for (i = 0; i < nr_clusters; i++) - swap_cluster_free_table(&cluster_info[i]); + for (i = 0; i < nr_clusters; i++) { + ci = cluster_info + i; + /* Cluster with bad marks count will have a remaining table */ + spin_lock(&ci->lock); + if (rcu_dereference_protected(ci->table, true)) { + ci->count = 0; + swap_cluster_free_table(ci); + } + spin_unlock(&ci->lock); + } kvfree(cluster_info); } @@ -2748,6 +2846,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; + unsigned int maxpages; int err, found = 0; if (!capable(CAP_SYS_ADMIN)) @@ -2854,8 +2953,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; + maxpages = p->max; cluster_info = p->cluster_info; - free_cluster_info(cluster_info, p->max); p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); @@ -2867,6 +2966,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); + free_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); @@ -3249,11 +3349,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (!cluster_info) goto err; - for (i = 0; i < nr_clusters; i++) { + for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); - if (swap_cluster_alloc_table(&cluster_info[i])) - goto err_free; - } if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc(sizeof(*si->global_cluster), @@ -3272,16 +3369,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. */ - inc_cluster_info_page(si, cluster_info, 0); + err = inc_cluster_info_page(si, cluster_info, 0); + if (err) + goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr >= maxpages) continue; - inc_cluster_info_page(si, cluster_info, page_nr); + err = inc_cluster_info_page(si, cluster_info, page_nr); + if (err) + goto err; + } + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { + err = inc_cluster_info_page(si, cluster_info, i); + if (err) + goto err; } - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) - inc_cluster_info_page(si, cluster_info, i); INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); @@ -3989,6 +4093,15 @@ static int __init swapfile_init(void) swapfile_maximum_size = arch_max_swapfile_size(); + /* + * Once a cluster is freed, it's swap table content is read + * only, and all swap cache readers (swap_cache_*) verifies + * the content before use. So it's safe to use RCU slab here. 
+ */ + swap_table_cachep = kmem_cache_create("swap_table", + sizeof(struct swap_table), + 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); + #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; -- Gitee From 34e66944b9d573f4b3e5f63f5c2a7d6e6b4b26df Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:01:00 +0800 Subject: [PATCH 483/484] mm, swap: use a single page for swap table when the size fits commit f83938e4188c44b535c18903a9761759366aa626 upstream Conflicts: none Backport-reason: Swap Table: phase 1 We have a cluster size of 512 slots. Each slot consumes 8 bytes in swap table so the swap table size of each cluster is exactly one page (4K). If that condition is true, allocate one page direct and disable the slab cache to reduce the memory usage of swap table and avoid fragmentation. Link: https://lkml.kernel.org/r/20250916160100.31545-16-ryncsn@gmail.com Co-developed-by: Chris Li Signed-off-by: Chris Li Signed-off-by: Kairui Song Acked-by: Chris Li Suggested-by: Chris Li Reviewed-by: Barry Song Cc: Baolin Wang Cc: Baoquan He Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Kairui Song --- mm/swap_table.h | 2 ++ mm/swapfile.c | 51 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/mm/swap_table.h b/mm/swap_table.h index 52254e455304..ea244a57a5b7 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -11,6 +11,8 @@ struct swap_table { atomic_long_t entries[SWAPFILE_CLUSTER]; }; +#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) + /* * A swap table entry represents the status of a swap slot on a swap * (physical or virtual) device. The swap table in each cluster is a diff --git a/mm/swapfile.c b/mm/swapfile.c index 9faa6876dc7e..5b2a3004bcbe 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -435,6 +435,38 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } +static struct swap_table *swap_table_alloc(gfp_t gfp) +{ + struct folio *folio; + + if (!SWP_TABLE_USE_PAGE) + return kmem_cache_zalloc(swap_table_cachep, gfp); + + folio = folio_alloc(gfp | __GFP_ZERO, 0); + if (folio) + return folio_address(folio); + return NULL; +} + +static void swap_table_free_folio_rcu_cb(struct rcu_head *head) +{ + struct folio *folio; + + folio = page_folio(container_of(head, struct page, rcu_head)); + folio_put(folio); +} + +static void swap_table_free(struct swap_table *table) +{ + if (!SWP_TABLE_USE_PAGE) { + kmem_cache_free(swap_table_cachep, table); + return; + } + + call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head), + swap_table_free_folio_rcu_cb); +} + static void swap_cluster_free_table(struct swap_cluster_info *ci) { unsigned int ci_off; @@ -448,7 +480,7 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci) table = (void *)rcu_dereference_protected(ci->table, true); rcu_assign_pointer(ci->table, NULL); - kmem_cache_free(swap_table_cachep, table); + swap_table_free(table); } /* @@ -471,8 +503,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si, /* The cluster must be free and was just isolated from the free list. 
*/ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); - table = kmem_cache_zalloc(swap_table_cachep, - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); if (table) { rcu_assign_pointer(ci->table, table); return ci; @@ -488,8 +519,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si, spin_unlock(&si->global_cluster_lock); local_unlock(&percpu_swap_cluster.lock); - table = kmem_cache_zalloc(swap_table_cachep, - __GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); + table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); /* * Back to atomic context. We might have migrated to a new CPU with a @@ -507,7 +537,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si, /* Nothing except this helper should touch a dangling empty cluster. */ if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { if (table) - kmem_cache_free(swap_table_cachep, table); + swap_table_free(table); return ci; } @@ -735,7 +765,7 @@ static int inc_cluster_info_page(struct swap_info_struct *si, ci = cluster_info + idx; if (!ci->table) { - table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL); + table = swap_table_alloc(GFP_KERNEL); if (!table) return -ENOMEM; rcu_assign_pointer(ci->table, table); @@ -4098,9 +4128,10 @@ static int __init swapfile_init(void) * only, and all swap cache readers (swap_cache_*) verifies * the content before use. So it's safe to use RCU slab here. */ - swap_table_cachep = kmem_cache_create("swap_table", - sizeof(struct swap_table), - 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); + if (!SWP_TABLE_USE_PAGE) + swap_table_cachep = kmem_cache_create("swap_table", + sizeof(struct swap_table), + 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) -- Gitee From d0eeab3f0878ac96d4bc847626e66508467bd4a1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 16 Sep 2025 19:30:41 +0800 Subject: [PATCH 484/484] mm, swap: fix warn for thp split Upstream: no Extra workaround for downstream. Downstream thp split doesn't update the folio size before replacing the folio in mapping, so add special check for replacing. Signed-off-by: Kairui Song --- mm/swap_state.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index bbd4f373e47b..836a2df2c1d6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -257,6 +257,7 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, unsigned long nr_pages = folio_nr_pages(new); unsigned int ci_off = swp_cluster_offset(entry); unsigned int ci_end = ci_off + nr_pages; + unsigned int ci_start = ci_off; unsigned long old_tb, new_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); @@ -279,8 +280,13 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, folio_order(old) != folio_order(new)) { ci_off = swp_cluster_offset(old->swap); ci_end = ci_off + folio_nr_pages(old); - while (ci_off++ < ci_end) - WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); + while (ci_off++ < ci_end) { + /* TODO: Remove this extra check after THP split update */ + if (ci_off == ci_start) + break; + + VM_WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); + } } } -- Gitee
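The check added above relies on the caller honouring the locking rule
documented for __swap_cache_replace_folio() earlier in this series: both
folios locked, plus the cluster that holds the folio being replaced. A
minimal caller-side sketch, modelled on swap_cache_del_folio() (the actual
THP split path in this tree has more steps around it):

    ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
    __swap_cache_replace_folio(ci, old, new);
    swap_cluster_unlock(ci);

Here entry stands for the relevant swap entry (old->swap and new->swap sit in
the same cluster), and the cluster lock serializes all writers of the table,
which is what the per-slot __swap_table_xchg() calls and the debug walk over
the old folio's slots depend on.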