diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 208d7b94aec6e0ceb101807cfa26556cc576b5bd..72cc4a130821f3c439ed6f14189d9fc6b0f8fdc3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -539,6 +539,10 @@
 	cio_ignore=	[S390]
 			See Documentation/s390/common_io.rst for details.
+
+	clear_freelist
+			Enable the clear_freelist feature.
+
 	clk_ignore_unused
 			[CLK]
 			Prevents the clock framework from automatically gating
@@ -4777,6 +4781,15 @@
 			[KNL, SMP] Set scheduler's default relax_domain_level.
 			See Documentation/admin-guide/cgroup-v1/cpusets.rst.
 
+	reliable_debug=	[ARM64]
+			Format: [F][,S][,P]
+			Only takes effect when CONFIG_MEMORY_RELIABLE is set
+			and "kernelcore=reliable" is configured.
+			F: User memory allocations (special user tasks, tmpfs)
+			do not fall back to non-mirrored regions on failure.
+			S: shmem does not use reliable memory.
+			P: Page cache does not use reliable memory.
+
 	reserve=	[KNL,BUGS] Force kernel to ignore I/O ports or memory
 			Format: ,[,,,...]
 			Reserve I/O ports or memory so the kernel won't use
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index eb227015a89531d04fedfd4179d6d04a9bbed18b..a84bef7aa8640f88ce86f3b02a4aa2a03ff198ed 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -25,6 +25,7 @@ files can be found in mm/swap.c.
 Currently, these files are in /proc/sys/vm:
 
 - admin_reserve_kbytes
+- clear_freelist_pages
 - compact_memory
 - compaction_proactiveness
 - compact_unevictable_allowed
@@ -109,6 +110,18 @@ On x86_64 this is about 128MB.
 Changing this takes effect whenever an application requests memory.
 
 
+clear_freelist_pages
+====================
+
+Available only when CONFIG_CLEAR_FREELIST_PAGE is set. When 1 is written to
+this file, every page on the buddy free lists is overwritten with zeroes.
+
+The zone lock is held while clear_freelist_pages runs, so if it takes too
+long, RCU CPU stall warnings may be printed. For each NUMA node,
+clear_freelist_pages is performed on a "random" CPU of that node.
+The time required depends on the hardware.
+
+
 compact_memory
 ==============
 
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index f6783bb99e3f4020d387b53166151ef1c02543e2..2fa2f7cd12876895454d3684b3fa0ca0c8cd617c 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -195,6 +195,7 @@ read the file /proc/PID/status::
     VmPTE:                      20 kb
     VmSwap:                      0 kB
     HugetlbPages:                0 kB
+    Reliable:                 1608 kB
     CoreDumping:                 0
     THP_enabled:                 1
     Threads:                     1
@@ -275,6 +276,7 @@ It's slow but very precise.
 VmSwap                      amount of swap used by anonymous private data
                             (shmem swap usage is not included)
 HugetlbPages                size of hugetlb memory portions
+Reliable                    size of reliable memory used
 CoreDumping                 process's memory is currently being dumped
                             (killing the process may lead to a corrupted core)
 THP_enabled                 process is allowed to use THP (returns 0 when
@@ -971,6 +973,8 @@ varies by architecture and compile options. The following is from a
     ShmemPmdMapped:        0 kB
     ReliableTotal:   7340032 kB
     ReliableUsed:     418824 kB
+    ReliableBuddyMem: 418824 kB
+    ReliableShmem:        96 kB
 
 MemTotal
               Total usable RAM (i.e.
physical RAM minus a few reserved @@ -1104,6 +1108,10 @@ ReliableTotal Total reliable memory size ReliableUsed The used amount of reliable memory +ReliableBuddyMem + Size of unused mirrored memory in buddy system +ReliableShmem + Total reliable memory used by share memory vmallocinfo ~~~~~~~~~~~ diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 391b967fcfbfdb69d2f4d8c3a48d6dff1742c353..15f989844389f3f6e8d5c25938c4440bb5c5299f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -77,6 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); + reliable_report_usage(m, mm); } #undef SEQ_PUT_DEC diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 38891cb2fa839add1c7a371f130eef74540939b8..ddadf28037429a37473bced527391530e7886921 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -5,31 +5,52 @@ #include #include #include +#include #include #include +#include #ifdef CONFIG_MEMORY_RELIABLE -extern struct static_key_false mem_reliable; +DECLARE_STATIC_KEY_FALSE(mem_reliable); extern bool reliable_enabled; extern bool shmem_reliable; +extern struct percpu_counter reliable_shmem_used_nr_page; +extern long shmem_reliable_nr_page __read_mostly; +extern bool reliable_allow_fallback; +extern bool pagecache_use_reliable_mem; +extern struct percpu_counter pagecache_reliable_pages; +extern struct percpu_counter anon_reliable_pages; +extern unsigned long task_reliable_limit __read_mostly; +extern atomic_long_t reliable_user_used_nr_page; -extern void add_reliable_mem_size(long sz); extern void mem_reliable_init(bool has_unmirrored_mem, - unsigned long *zone_movable_pfn); + unsigned long *zone_movable_pfn, + unsigned long mirrored_sz); extern void shmem_reliable_init(void); extern void reliable_report_meminfo(struct seq_file *m); extern void page_cache_prepare_alloc(gfp_t *gfp); +extern bool mem_reliable_status(void); +extern void reliable_lru_add(enum lru_list lru, struct page *page, + int val); +extern void reliable_lru_add_batch(int zid, enum lru_list lru, + int val); +extern bool mem_reliable_counter_initialized(void); +extern void mem_reliable_out_of_memory(gfp_t gfp_mask, unsigned int order, + int preferred_nid, nodemask_t *nodemask); +extern void reliable_show_mem_info(void); +extern void reliable_report_usage(struct seq_file *m, + struct mm_struct *mm); static inline bool mem_reliable_is_enabled(void) { return static_branch_likely(&mem_reliable); } -static inline bool zone_reliable(struct zone *zone) +static inline bool pagecache_reliable_is_enabled(void) { - return mem_reliable_is_enabled() && zone_idx(zone) < ZONE_MOVABLE; + return pagecache_use_reliable_mem; } static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) @@ -54,22 +75,109 @@ static inline bool shmem_reliable_is_enabled(void) { return shmem_reliable; } + +static inline bool page_reliable(struct page *page) +{ + if (!mem_reliable_is_enabled()) + return false; + + if (!page) + return false; + + return page_zonenum(page) < ZONE_MOVABLE; +} + +static inline void shmem_reliable_page_counter(struct page *page, int nr_page) +{ + if (shmem_reliable_is_enabled() && page_reliable(page)) + percpu_counter_add(&reliable_shmem_used_nr_page, nr_page); +} + +static inline bool mem_reliable_shmem_limit_check(void) +{ + return percpu_counter_read_positive(&reliable_shmem_used_nr_page) < + shmem_reliable_nr_page; +} + +static inline u64 task_reliable_used_pages(void) +{ + s64 
nr_pages; + + nr_pages = percpu_counter_read_positive(&pagecache_reliable_pages); + nr_pages += percpu_counter_read_positive(&anon_reliable_pages); + + return nr_pages; +} + +static inline bool reliable_mem_limit_check(unsigned long nr_page) +{ + return (task_reliable_used_pages() + nr_page) <= + (task_reliable_limit >> PAGE_SHIFT); +} + +static inline bool mem_reliable_should_reclaim(void) +{ + if (percpu_counter_sum_positive(&pagecache_reliable_pages) >= + MAX_ORDER_NR_PAGES) + return true; + + return false; +} + +static inline bool reliable_allow_fb_enabled(void) +{ + return reliable_allow_fallback; +} + +static inline void reliable_page_counter(struct page *page, + struct mm_struct *mm, int val) +{ + if (page_reliable(page)) + atomic_long_add(val, &mm->reliable_nr_page); +} #else #define reliable_enabled 0 +#define pagecache_use_reliable_mem 0 static inline bool mem_reliable_is_enabled(void) { return false; } -static inline void add_reliable_mem_size(long sz) {} +static inline bool pagecache_reliable_is_enabled(void) { return false; } static inline void mem_reliable_init(bool has_unmirrored_mem, - unsigned long *zone_movable_pfn) {} + unsigned long *zone_movable_pfn, + unsigned long mirrored_sz) {} static inline void shmem_reliable_init(void) {} -static inline bool zone_reliable(struct zone *zone) { return false; } static inline bool skip_none_movable_zone(gfp_t gfp, struct zoneref *z) { return false; } static inline void reliable_report_meminfo(struct seq_file *m) {} static inline bool shmem_reliable_is_enabled(void) { return false; } +static inline void shmem_reliable_page_counter(struct page *page, + int nr_page) {} +static inline bool mem_reliable_shmem_limit_check(void) { return true; } static inline void page_cache_prepare_alloc(gfp_t *gfp) {} +static inline bool mem_reliable_status(void) { return false; } +static inline bool page_reliable(struct page *page) { return false; } +static inline void reliable_lru_add(enum lru_list lru, struct page *page, + int val) {} +static inline void reliable_lru_add_batch(int zid, enum lru_list lru, + int val) {} +static inline bool mem_reliable_counter_initialized(void) { return false; } +static inline u64 task_reliable_used_pages(void) { return 0; } +static inline bool reliable_mem_limit_check(unsigned long nr_page) +{ + return false; +} +static inline bool mem_reliable_should_reclaim(void) { return false; } +static inline void mem_reliable_out_of_memory(gfp_t gfp_mask, + unsigned int order, + int preferred_nid, + nodemask_t *nodemask) {} +static inline bool reliable_allow_fb_enabled(void) { return false; } +static inline void reliable_show_mem_info(void) {} +static inline void reliable_page_counter(struct page *page, + struct mm_struct *mm, int val) {} +static inline void reliable_report_usage(struct seq_file *m, + struct mm_struct *mm) {} #endif #endif diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5b9b0239e34a05a80d66b6e0c19d21f6579f01ff..d6b6a93aa73e99c02d951de4716032804d9c6b5f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -38,6 +38,7 @@ enum memblock_flags { MEMBLOCK_MIRROR = 0x2, /* mirrored region */ MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ MEMBLOCK_MEMMAP = 0x8, /* memmap reserved region */ + MEMBLOCK_NOMIRROR = 0x10, /* alloc from non-mirrored region */ }; /** @@ -410,6 +411,10 @@ void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int 
nid); +void *memblock_alloc_try_nid_raw_flags(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid, + enum memblock_flags flags); static inline void * __init memblock_alloc(phys_addr_t size, phys_addr_t align) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae73cc4b80605bd1055dd7cf1f42fa436c728ff..89e7ea80efcaa9edc4cbd17e470b57afb585947e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,9 +34,6 @@ #include #include -/* added to mm.h to avoid every caller adding new header file */ -#include - struct mempolicy; struct anon_vma; struct anon_vma_chain; @@ -3308,5 +3305,8 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +/* added to mm.h to avoid every caller adding new header file */ +#include + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8fc71e9d7bb079dab4b6057a062d5d34822a6cac..36f2e8f7db9d953db6e157c5061614c8ee607a2a 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include /** * page_is_file_lru - should the page be on a file LRU or anon LRU? @@ -50,6 +51,7 @@ static __always_inline void add_page_to_lru_list(struct page *page, { update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); + reliable_lru_add(lru, page, thp_nr_pages(page)); } static __always_inline void add_page_to_lru_list_tail(struct page *page, @@ -57,6 +59,7 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, { update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add_tail(&page->lru, &lruvec->lists[lru]); + reliable_lru_add(lru, page, thp_nr_pages(page)); } static __always_inline void del_page_from_lru_list(struct page *page, @@ -64,6 +67,7 @@ static __always_inline void del_page_from_lru_list(struct page *page, { list_del(&page->lru); update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); + reliable_lru_add(lru, page, -thp_nr_pages(page)); } /** diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9de02b116185054b47a7709afeb669b8259a3334..cf037e744c5152aafd46cd8bc1ad504ebb1a3d86 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -591,7 +591,8 @@ struct mm_struct { #endif #ifdef CONFIG_MEMORY_RELIABLE - atomic_long_t reserve_0; + /* total used reliable pages */ + KABI_RENAME(atomic_long_t reserve_0, atomic_long_t reliable_nr_page); #endif } __randomize_layout; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8719d891848f0e5c8ffc7286369b6c85a39ce65b..7f25539d2fe41466282afd24dec36bfb3398923e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -261,6 +261,11 @@ static inline bool is_file_lru(enum lru_list lru) return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } +static inline int is_anon_lru(enum lru_list lru) +{ + return (lru == LRU_INACTIVE_ANON || lru == LRU_ACTIVE_ANON); +} + static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e1bbb3b92921d8eb084d2844cd2ae85ebaf5e14f..ad6664fcc3b2415963c63d66bed42ad8914c0532 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -183,6 +183,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); + reliable_page_counter(new_page, mm, 1); 
 		page_add_new_anon_rmap(new_page, vma, addr, false);
 		lru_cache_add_inactive_or_unevictable(new_page, vma);
 	} else
@@ -194,6 +195,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 		inc_mm_counter(mm, MM_ANONPAGES);
 	}
 
+	reliable_page_counter(old_page, mm, -1);
 	flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
 	ptep_clear_flush_notify(vma, addr, pvmw.pte);
 	if (new_page)
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 1c26c14ffbb9bdfe8d442cb381e7c7d1fd242305..11751aebc98f4d061920984048f99377af9631f6 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -41,4 +41,5 @@ void show_mem(unsigned int filter, nodemask_t *nodemask)
 #ifdef CONFIG_MEMORY_FAILURE
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
+	reliable_show_mem_info();
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index 27c0b9de6357eb2ded44219de674db641a721627..81974d00de4dce57fbe0fecd632bc94e4e19ed5d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -970,6 +970,19 @@ config MEMORY_RELIABLE
 	  To enable this function, mirrored memory is needed and
 	  "kernelcore=reliable" need to be added in kernel parameters.
 
+config CLEAR_FREELIST_PAGE
+	bool "Support for clear free list pages"
+	depends on MMU && SYSCTL
+	default n
+	help
+	  Say y here to enable the clear free list pages feature. Writing
+	  1 to /proc/sys/vm/clear_freelist_pages triggers clearing of the
+	  free pages in the buddy system.
+
+	  To enable this feature, the kernel parameter "clear_freelist"
+	  also needs to be added.
+
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 9798d8735cc72f3260dab026adc1df37a120998b..aad7866abe8cc38d0bf076ea42ab4c04c593ad0e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -7,6 +7,7 @@ KASAN_SANITIZE_slab_common.o := n
 KASAN_SANITIZE_slab.o := n
 KASAN_SANITIZE_slub.o := n
 KCSAN_SANITIZE_kmemleak.o := n
+KASAN_SANITIZE_clear_freelist_page.o := n
 
 # These produce frequent data race reports: most of them are due to races on
 # the same word but accesses to different bits of that word. Re-enable KCSAN
@@ -129,4 +130,5 @@ obj-$(CONFIG_PIN_MEMORY) += pin_mem.o
 obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o
 obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
+obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
 obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
diff --git a/mm/clear_freelist_page.c b/mm/clear_freelist_page.c
new file mode 100644
index 0000000000000000000000000000000000000000..50b7ec918bfb676c511c49c689fe265e5ec15740
--- /dev/null
+++ b/mm/clear_freelist_page.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for clearing free list pages.
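+ *
+ * Writing 1 to /proc/sys/vm/clear_freelist_pages zeroes every page that is
+ * currently sitting on the buddy free lists. One worker runs per NUMA node,
+ * and each worker's walk is bounded by the cfp_timeout_ms module parameter.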
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CFP_DEFAULT_TIMEOUT 2000 +#define for_each_populated_zone_pgdat(pgdat, zone) \ + for (zone = pgdat->node_zones; \ + zone; \ + zone = next_pgdat_zone(zone)) \ + if (!populated_zone(zone)) \ + ; /* do nothing */ \ + else + +struct pgdat_entry { + struct pglist_data *pgdat; + struct work_struct work; +}; + +static DECLARE_WAIT_QUEUE_HEAD(clear_freelist_wait); +static DEFINE_MUTEX(clear_freelist_lock); +static atomic_t clear_freelist_workers; +static atomic_t clear_pages_num; +static ulong cfp_timeout_ms = CFP_DEFAULT_TIMEOUT; + +/* + * next_pgdat_zone - helper magic for for_each_populated_zone_pgdat() + */ +static struct zone *next_pgdat_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else + zone = NULL; + return zone; +} + +static void clear_pgdat_freelist_pages(struct work_struct *work) +{ + struct pgdat_entry *entry = container_of(work, struct pgdat_entry, work); + u64 cfp_timeout_ns = cfp_timeout_ms * NSEC_PER_MSEC; + struct pglist_data *pgdat = entry->pgdat; + unsigned long flags, order, t; + struct page *page; + struct zone *zone; + u64 start, now; + + start = sched_clock(); + + for_each_populated_zone_pgdat(pgdat, zone) { + spin_lock_irqsave(&zone->lock, flags); + for_each_migratetype_order(order, t) { + list_for_each_entry(page, &zone->free_area[order].free_list[t], lru) { + now = sched_clock(); + if (unlikely(now - start > cfp_timeout_ns)) { + spin_unlock_irqrestore(&zone->lock, flags); + goto out; + } + +#ifdef CONFIG_KMAP_LOCAL + int i; + + /* Clear highmem by clear_highpage() */ + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +#else + memset(page_address(page), 0, (1 << order) * PAGE_SIZE); +#endif + touch_nmi_watchdog(); + atomic_add(1 << order, &clear_pages_num); + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + cond_resched(); + } + +out: + kfree(entry); + + if (atomic_dec_and_test(&clear_freelist_workers)) + wake_up(&clear_freelist_wait); +} + +static void init_clear_freelist_work(struct pglist_data *pgdat) +{ + struct pgdat_entry *entry; + + entry = kzalloc(sizeof(struct pgdat_entry), GFP_KERNEL); + if (!entry) + return; + + entry->pgdat = pgdat; + INIT_WORK(&entry->work, clear_pgdat_freelist_pages); + queue_work_node(pgdat->node_id, system_unbound_wq, &entry->work); +} + +static void clear_freelist_pages(void) +{ + struct pglist_data *pgdat; + + mutex_lock(&clear_freelist_lock); + drain_all_pages(NULL); + + for_each_online_pgdat(pgdat) { + atomic_inc(&clear_freelist_workers); + init_clear_freelist_work(pgdat); + } + + wait_event(clear_freelist_wait, atomic_read(&clear_freelist_workers) == 0); + + pr_debug("Cleared pages %d\nFree pages %lu\n", atomic_read(&clear_pages_num), + global_zone_page_state(NR_FREE_PAGES)); + atomic_set(&clear_pages_num, 0); + + mutex_unlock(&clear_freelist_lock); +} + +static int sysctl_clear_freelist_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + int val; + + table->data = &val; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write) + clear_freelist_pages(); + + return ret; +} + +static struct ctl_table clear_freelist_table[] = { + { + .procname = "clear_freelist_pages", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &sysctl_clear_freelist_handler, + .extra1 = SYSCTL_ONE, + .extra2 = 
SYSCTL_ONE, + }, + { } +}; + +static struct ctl_table sys_ctl_table[] = { + { + .procname = "vm", + .mode = 0555, + .child = clear_freelist_table, + }, + { } +}; + +static bool clear_freelist_enabled; +static int __init setup_clear_freelist(char *str) +{ + clear_freelist_enabled = true; + return 1; +} +__setup("clear_freelist", setup_clear_freelist); + +static int __init clear_freelist_init(void) +{ + if (clear_freelist_enabled) + register_sysctl_table(sys_ctl_table); + + return 0; +} +module_init(clear_freelist_init); +module_param(cfp_timeout_ms, ulong, 0644); diff --git a/mm/filemap.c b/mm/filemap.c index 4f9cd18f9197e07397d51612b4a17f2c84c4d431..6480600cf0eac3a4c98650ba89521f12451be88a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -192,6 +192,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_lruvec_page_state(page, NR_SHMEM, -nr); + shmem_reliable_page_counter(page, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { @@ -800,10 +801,14 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __dec_lruvec_page_state(old, NR_FILE_PAGES); if (!PageHuge(new)) __inc_lruvec_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(old)) + if (PageSwapBacked(old)) { __dec_lruvec_page_state(old, NR_SHMEM); - if (PageSwapBacked(new)) + shmem_reliable_page_counter(old, -1); + } + if (PageSwapBacked(new)) { __inc_lruvec_page_state(new, NR_SHMEM); + shmem_reliable_page_counter(new, 1); + } xas_unlock_irqrestore(&xas, flags); if (freepage) freepage(old); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 79c855b5adada38b20008bd739f272259b8a25f3..fdd617e8197dd1ed820d47b3dae356dc3b916e5c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -652,6 +652,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); @@ -1115,6 +1116,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(src_page, dst_mm, HPAGE_PMD_NR); out_zero_page: mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); @@ -1696,6 +1698,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); + reliable_page_counter(page, tlb->mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); @@ -2077,6 +2080,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); + reliable_page_counter(page, mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); put_page(page); } @@ -2212,6 +2216,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { + reliable_page_counter(page + i, mm, -1); page_remove_rmap(page + i, false); put_page(page + i); } @@ -3006,6 +3011,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if 
(pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); + reliable_page_counter(page, mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); put_page(page); } @@ -3033,6 +3039,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); + reliable_page_counter(new, mm, HPAGE_PMD_NR); if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else @@ -3089,6 +3096,7 @@ vm_fault_t do_anon_huge_page_remap(struct vm_area_struct *vma, unsigned long add pgtable_trans_huge_deposit(vma->vm_mm, pmd, pgtable); set_pmd_at(vma->vm_mm, address, pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(ptl); count_vm_event(THP_FAULT_ALLOC); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 817ae73d40bd669c4d891536c5619191c5e6b61e..43eb7d38126617330c850bd20863221082d6d767 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2697,6 +2697,19 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } +void *__init __alloc_bootmem_huge_page_inner(phys_addr_t size, + phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid) +{ + if (!mem_reliable_is_enabled()) + return memblock_alloc_try_nid_raw(size, align, max_addr, + max_addr, nid); + + return memblock_alloc_try_nid_raw_flags(size, align, max_addr, max_addr, + nid, MEMBLOCK_NOMIRROR); +} + int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) @@ -2712,7 +2725,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + m = __alloc_bootmem_huge_page_inner(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; @@ -2720,7 +2733,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { - m = memblock_alloc_try_nid_raw( + m = __alloc_bootmem_huge_page_inner( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 44c048d7b783ed77c3ad5b4d9ae8463eedc03b55..aaef16aa8945cbb2243c11c8d42c1f437660b5ff 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -748,6 +748,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + reliable_page_counter(page, vma->vm_mm, 1); if (is_zero_pfn(pte_pfn(pteval))) { /* * ptl mostly unnecessary. @@ -776,6 +777,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * superfluous. 
*/ pte_clear(vma->vm_mm, address, _pte); + reliable_page_counter(src_page, vma->vm_mm, -1); page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); @@ -1057,7 +1059,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - int node, int referenced, int unmapped) + int node, int referenced, int unmapped, + bool reliable) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; @@ -1075,6 +1078,9 @@ static void collapse_huge_page(struct mm_struct *mm, /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; + if (reliable) + gfp |= GFP_RELIABLE; + /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves @@ -1198,6 +1204,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); + reliable_page_counter(new_page, vma->vm_mm, HPAGE_PMD_NR); page_add_new_anon_rmap(new_page, vma, address, true); lru_cache_add_inactive_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); @@ -1234,6 +1241,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; bool writable = false; + bool reliable = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1358,6 +1366,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced++; + + if (page_reliable(page)) + reliable = true; } if (!writable) { result = SCAN_PAGE_RO; @@ -1373,7 +1384,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_lock released */ collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + referenced, unmapped, reliable); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1501,6 +1512,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); } @@ -1633,7 +1645,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) */ static void collapse_file(struct mm_struct *mm, struct file *file, pgoff_t start, - struct page **hpage, int node) + struct page **hpage, int node, + bool reliable) { struct address_space *mapping = file->f_mapping; gfp_t gfp; @@ -1650,6 +1663,9 @@ static void collapse_file(struct mm_struct *mm, /* Only allocate from the target node */ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; + if (reliable) + gfp |= GFP_RELIABLE; + new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; @@ -1898,6 +1914,8 @@ static void collapse_file(struct mm_struct *mm, ClearPageActive(page); ClearPageUnevictable(page); unlock_page(page); + if (is_shmem) + shmem_reliable_page_counter(page, -1); put_page(page); index++; } @@ -1908,8 +1926,10 @@ static void collapse_file(struct mm_struct *mm, SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - if (is_shmem) + if (is_shmem) { set_page_dirty(new_page); + shmem_reliable_page_counter(new_page, 1 << HPAGE_PMD_ORDER); + } lru_cache_add(new_page); /* @@ -1977,6 +1997,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, int present, swap; int node = 
NUMA_NO_NODE; int result = SCAN_SUCCEED; + bool reliable = false; present = 0; swap = 0; @@ -2029,6 +2050,9 @@ static void khugepaged_scan_file(struct mm_struct *mm, xas_pause(&xas); cond_resched_rcu(); } + + if (page_reliable(page)) + reliable = true; } rcu_read_unlock(); @@ -2037,7 +2061,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, result = SCAN_EXCEED_NONE_PTE; } else { node = khugepaged_find_target_node(); - collapse_file(mm, file, start, hpage, node); + collapse_file(mm, file, start, hpage, node, reliable); } } diff --git a/mm/ksm.c b/mm/ksm.c index 582c02058baf94e30c511b1faf9d4130aa4de6fe..169c0da1a9db84e072b1db9191762e0b5e172563 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1155,6 +1155,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); + reliable_page_counter(kpage, mm, 1); page_add_anon_rmap(kpage, vma, addr, false); newpte = mk_pte(kpage, vma->vm_page_prot); } else { @@ -1179,6 +1180,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); diff --git a/mm/mem_reliable.c b/mm/mem_reliable.c index b1bc749532a4ddc6d390cf6fe0ecfdfc045e34b2..4540cfc76489e9778e6b355b03a63c7497675f96 100644 --- a/mm/mem_reliable.c +++ b/mm/mem_reliable.c @@ -7,76 +7,139 @@ #include #include #include +#include + +#define PAGES_TO_B(n_pages) ((n_pages) << PAGE_SHIFT) + +enum mem_reliable_types { + MEM_RELIABLE_ALL, + MEM_RELIABLE_FALLBACK, + MEM_RELIABLE_SHMEM, + MEM_RELIABLE_PAGECACHE, + MEM_RELIABLE_MAX +}; DEFINE_STATIC_KEY_FALSE(mem_reliable); +EXPORT_SYMBOL_GPL(mem_reliable); bool reliable_enabled; - -static atomic_long_t total_reliable_mem; bool shmem_reliable __read_mostly = true; +struct percpu_counter reliable_shmem_used_nr_page; +bool reliable_allow_fallback __read_mostly = true; +bool pagecache_use_reliable_mem __read_mostly = true; +struct percpu_counter pagecache_reliable_pages; +struct percpu_counter anon_reliable_pages; +static unsigned long reliable_pagecache_max_bytes = ULONG_MAX; +/* reliable user limit for user tasks with reliable flag */ +unsigned long task_reliable_limit = ULONG_MAX; +long shmem_reliable_nr_page = ULONG_MAX >> PAGE_SHIFT; +atomic_long_t reliable_user_used_nr_page; -void page_cache_prepare_alloc(gfp_t *gfp) +bool mem_reliable_counter_initialized(void) { - if (mem_reliable_is_enabled()) - *gfp |= GFP_RELIABLE; + return likely(percpu_counter_initialized(&pagecache_reliable_pages)) && + likely((percpu_counter_initialized(&anon_reliable_pages))); } -void add_reliable_mem_size(long sz) +bool mem_reliable_status(void) { - atomic_long_add(sz, &total_reliable_mem); + return mem_reliable_is_enabled(); } +EXPORT_SYMBOL_GPL(mem_reliable_status); -static unsigned long total_reliable_mem_sz(void) +void reliable_lru_add_batch(int zid, enum lru_list lru, + int val) +{ + if (!mem_reliable_is_enabled()) + return; + + if (zid < ZONE_MOVABLE) { + if (is_file_lru(lru)) + percpu_counter_add(&pagecache_reliable_pages, val); + else if (is_anon_lru(lru)) + percpu_counter_add(&anon_reliable_pages, val); + } +} + +void reliable_lru_add(enum lru_list lru, struct page *page, int val) +{ + if (!page_reliable(page)) + return; + + if (is_file_lru(lru)) + percpu_counter_add(&pagecache_reliable_pages, val); + else if (is_anon_lru(lru)) + percpu_counter_add(&anon_reliable_pages, val); + else if 
(lru == LRU_UNEVICTABLE) { + if (PageAnon(page)) + percpu_counter_add(&anon_reliable_pages, val); + else + percpu_counter_add(&pagecache_reliable_pages, val); + } +} + +void page_cache_prepare_alloc(gfp_t *gfp) { - return atomic_long_read(&total_reliable_mem); + s64 nr_reliable = 0; + + if (!mem_reliable_is_enabled()) + return; + + if (!pagecache_reliable_is_enabled()) + goto no_reliable; + + nr_reliable = percpu_counter_read_positive(&pagecache_reliable_pages); + if (nr_reliable > reliable_pagecache_max_bytes >> PAGE_SHIFT) + goto no_reliable; + + *gfp |= GFP_RELIABLE; + return; + +no_reliable: + *gfp &= ~GFP_RELIABLE; } -static unsigned long used_reliable_mem_sz(void) +static unsigned long total_reliable_pages(void) { - unsigned long nr_page = 0; + unsigned long total_reliable_pages = 0; struct zone *z; for_each_populated_zone(z) if (zone_idx(z) < ZONE_MOVABLE) - nr_page += zone_page_state(z, NR_FREE_PAGES); + total_reliable_pages += zone_managed_pages(z); - return total_reliable_mem_sz() - nr_page * PAGE_SIZE; + return total_reliable_pages; } -static int reliable_mem_notifier(struct notifier_block *nb, - unsigned long action, void *arg) +static unsigned long free_reliable_pages(void) { - struct memory_notify *m_arg = arg; struct zone *zone; + unsigned long cnt = 0; - switch (action) { - case MEM_ONLINE: - zone = page_zone(pfn_to_page(m_arg->start_pfn)); - if (zone_reliable(zone)) - add_reliable_mem_size(m_arg->nr_pages * PAGE_SIZE); - break; - case MEM_OFFLINE: - zone = page_zone(pfn_to_page(m_arg->start_pfn)); - if (zone_reliable(zone)) - add_reliable_mem_size(-m_arg->nr_pages * PAGE_SIZE); - break; - default: - break; - } + for_each_populated_zone(zone) + if (zone_idx(zone) < ZONE_MOVABLE) + cnt += zone_page_state(zone, NR_FREE_PAGES); - return NOTIFY_OK; + return cnt; } -static struct notifier_block reliable_notifier_block = { - .notifier_call = reliable_mem_notifier, -}; +static unsigned long used_reliable_pages(void) +{ + return total_reliable_pages() - free_reliable_pages(); +} -void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) +void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn, + unsigned long mirrored_sz) { if (!reliable_enabled) return; - if (atomic_long_read(&total_reliable_mem) == 0) { + if (is_kdump_kernel()) { + pr_info("ignoring memory reliable due to in crashkernel\n"); + return; + } + + if (!mirrored_sz) { memset(zone_movable_pfn, 0, sizeof(unsigned long) * MAX_NUMNODES); pr_err("init failed, mirrored memory size is zero.\n"); @@ -88,35 +151,379 @@ void mem_reliable_init(bool has_unmirrored_mem, unsigned long *zone_movable_pfn) return; } - if (register_hotmemory_notifier(&reliable_notifier_block)) { - pr_err("init failed, register memory notifier failed.\n"); + static_branch_enable(&mem_reliable); + + pr_info("init succeed, mirrored memory size(%lu)\n", mirrored_sz); +} + +void shmem_reliable_init(void) +{ + if (!mem_reliable_is_enabled() || !shmem_reliable_is_enabled()) { + shmem_reliable = false; return; } - static_branch_enable(&mem_reliable); + percpu_counter_init(&reliable_shmem_used_nr_page, 0, GFP_KERNEL); +} - pr_info("init succeed, mirrored memory size(%lu)\n", - total_reliable_mem_sz()); +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) +{ + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); + seq_write(m, " kB\n", 4); } -void shmem_reliable_init(void) +void reliable_report_meminfo(struct seq_file *m) { - if (!shmem_reliable_is_enabled()) + if 
(!mem_reliable_is_enabled()) return; - if (!mem_reliable_is_enabled()) { + show_val_kb(m, "ReliableTotal: ", total_reliable_pages()); + show_val_kb(m, "ReliableUsed: ", used_reliable_pages()); + show_val_kb(m, "ReliableTaskUsed: ", task_reliable_used_pages()); + show_val_kb(m, "ReliableBuddyMem: ", free_reliable_pages()); + + if (shmem_reliable_is_enabled()) { + unsigned long shmem_pages = (unsigned long)percpu_counter_sum( + &reliable_shmem_used_nr_page); + show_val_kb(m, "ReliableShmem: ", shmem_pages); + } + + if (pagecache_reliable_is_enabled()) { + s64 nr_pagecache_pages = 0; + unsigned long num = 0; + + num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + show_val_kb(m, "FileCache: ", num); + + nr_pagecache_pages = + percpu_counter_sum_positive(&pagecache_reliable_pages); + seq_printf(m, "ReliableFileCache: %8llu kB\n", + nr_pagecache_pages << (PAGE_SHIFT - 10)); + } +} + +#ifdef CONFIG_SYSCTL +int reliable_limit_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + unsigned long old = task_reliable_limit; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret == 0 && write) { + if (task_reliable_limit > PAGES_TO_B(total_reliable_pages()) || + task_reliable_limit < + (task_reliable_used_pages() << PAGE_SHIFT)) { + task_reliable_limit = old; + return -EINVAL; + } + } + + return ret; +} + +int reliable_pagecache_max_bytes_write(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + unsigned long old_value = reliable_pagecache_max_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (!ret && write) { + if (reliable_pagecache_max_bytes > + PAGES_TO_B(total_reliable_pages())) { + reliable_pagecache_max_bytes = old_value; + return -EINVAL; + } + } + + return ret; +} + +static void mem_reliable_feature_disable(int idx); + +#define CTRL_BITS_SHIFT MEM_RELIABLE_MAX +#define CTRL_BITS_MASK ((1 << CTRL_BITS_SHIFT) - 1) + +static unsigned long mem_reliable_ctrl_bits = CTRL_BITS_MASK; + +static void mem_reliable_ctrl_bit_disable(int idx) +{ + clear_bit(idx, &mem_reliable_ctrl_bits); +} + +static bool mem_reliable_ctrl_bit_is_enabled(int idx) +{ + return !!test_bit(idx, &mem_reliable_ctrl_bits); +} + +static void mem_reliable_parse_ctrl_bits(unsigned long ctrl_bits) +{ + bool status; + int i; + + for (i = MEM_RELIABLE_FALLBACK; i < MEM_RELIABLE_MAX; i++) { + status = !!test_bit(i, &ctrl_bits); + + if (mem_reliable_ctrl_bit_is_enabled(i) && !status) + mem_reliable_feature_disable(i); + } +} + +static void mem_reliable_disable_all(void) +{ + mem_reliable_ctrl_bits = 0; + + reliable_allow_fallback = false; + shmem_reliable = false; + pagecache_use_reliable_mem = false; + static_branch_disable(&mem_reliable); + + pr_info("memory reliable feature disabled.\n"); +} + +int reliable_debug_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + unsigned long old_ctrl_bits, new_ctrl_bits; + static DEFINE_MUTEX(reliable_debug_mutex); + int ret; + + mutex_lock(&reliable_debug_mutex); + old_ctrl_bits = mem_reliable_ctrl_bits; + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret == 0 && write) { + if (!mem_reliable_is_enabled() || + (mem_reliable_ctrl_bits > (1 << CTRL_BITS_SHIFT) - 1)) { + mem_reliable_ctrl_bits = old_ctrl_bits; + mutex_unlock(&reliable_debug_mutex); + + return -EINVAL; + } + + 
new_ctrl_bits = mem_reliable_ctrl_bits; + mem_reliable_ctrl_bits = old_ctrl_bits; + if (!!test_bit(MEM_RELIABLE_ALL, &new_ctrl_bits)) + mem_reliable_parse_ctrl_bits(new_ctrl_bits); + else + mem_reliable_disable_all(); + } + + mutex_unlock(&reliable_debug_mutex); + + return ret; +} + +#ifdef CONFIG_SHMEM +static unsigned long sysctl_shmem_reliable_bytes_limit = ULONG_MAX; + +int reliable_shmem_bytes_limit_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + unsigned long *data_ptr = (unsigned long *)(table->data); + unsigned long old = *data_ptr; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (!ret && write) { + if (*data_ptr > PAGES_TO_B(total_reliable_pages())) { + *data_ptr = old; + return -EINVAL; + } + + shmem_reliable_nr_page = *data_ptr >> PAGE_SHIFT; + } + + return ret; +} +#endif + +static struct ctl_table reliable_ctl_table[] = { + { + .procname = "reliable_pagecache_max_bytes", + .data = &reliable_pagecache_max_bytes, + .maxlen = sizeof(reliable_pagecache_max_bytes), + .mode = 0644, + .proc_handler = reliable_pagecache_max_bytes_write, + }, + { + .procname = "task_reliable_limit", + .data = &task_reliable_limit, + .maxlen = sizeof(task_reliable_limit), + .mode = 0644, + .proc_handler = reliable_limit_handler, + }, +#ifdef CONFIG_SHMEM + { + .procname = "shmem_reliable_bytes_limit", + .data = &sysctl_shmem_reliable_bytes_limit, + .maxlen = sizeof(sysctl_shmem_reliable_bytes_limit), + .mode = 0644, + .proc_handler = reliable_shmem_bytes_limit_handler, + }, +#endif + { + .procname = "reliable_debug", + .data = &mem_reliable_ctrl_bits, + .maxlen = sizeof(mem_reliable_ctrl_bits), + .mode = 0600, + .proc_handler = reliable_debug_handler, + }, + {} +}; + +static struct ctl_table reliable_dir_table[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = reliable_ctl_table, + }, + {} +}; + +static int __init reliable_sysctl_init(void) +{ + if (!mem_reliable_is_enabled()) + return 0; + + if (!register_sysctl_table(reliable_dir_table)) { + pr_err("register sysctl failed."); + return -ENOMEM; + } + + percpu_counter_init(&pagecache_reliable_pages, 0, GFP_KERNEL); + percpu_counter_init(&anon_reliable_pages, 0, GFP_KERNEL); + + return 0; +} +arch_initcall(reliable_sysctl_init); +#else +static void mem_reliable_ctrl_bit_disabled(int idx) {} +#endif + +static void mem_reliable_feature_disable(int idx) +{ + char *str = NULL; + + switch (idx) { + case MEM_RELIABLE_FALLBACK: + reliable_allow_fallback = false; + str = "fallback"; + break; + case MEM_RELIABLE_SHMEM: shmem_reliable = false; - pr_info("shmem reliable disabled.\n"); + str = "shmem"; + break; + case MEM_RELIABLE_PAGECACHE: + pagecache_use_reliable_mem = false; + str = "pagecache"; + break; + default: + pr_err("unknown index: %d", idx); + return; } + + mem_reliable_ctrl_bit_disable(idx); + pr_info("%s is disabled\n", str); } -void reliable_report_meminfo(struct seq_file *m) +void reliable_show_mem_info(void) +{ + if (!mem_reliable_is_enabled()) + return; + + pr_info("ReliableTotal: %lu kB\n", total_reliable_pages() + << (PAGE_SHIFT - 10)); + pr_info("ReliableUsed: %lu kB\n", used_reliable_pages() + << (PAGE_SHIFT - 10)); + pr_info("ReliableTaskLimit: %lu kB\n", task_reliable_limit >> 10); + pr_info("ReliableTaskUsed: %lld kB\n", task_reliable_used_pages() + << (PAGE_SHIFT - 10)); + + if (shmem_reliable_is_enabled()) { + pr_info("ReliableShmemPagesLimit: %ld\n", + shmem_reliable_nr_page); + pr_info("ReliableShmem: %llu kB\n", + 
percpu_counter_sum(&reliable_shmem_used_nr_page) + << (PAGE_SHIFT - 10)); + } + + if (pagecache_reliable_is_enabled()) { + s64 nr_pagecache_pages = 0; + unsigned long num = 0; + + num += global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + num += global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + pr_info("ReliableFileCacheLimit: %lu kB\n", + reliable_pagecache_max_bytes >> 10); + pr_info("FileCache: %lu kB\n", num << (PAGE_SHIFT - 10)); + + nr_pagecache_pages = + percpu_counter_sum_positive(&pagecache_reliable_pages); + pr_info("ReliableFileCache: %llu kB\n", + nr_pagecache_pages << (PAGE_SHIFT - 10)); + } +} + +void mem_reliable_out_of_memory(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct oom_control oc = { + .zonelist = node_zonelist(preferred_nid, gfp), + .nodemask = nodemask, + .memcg = NULL, + .gfp_mask = gfp, + .order = order, + }; + + if (!mutex_trylock(&oom_lock)) + return; + out_of_memory(&oc); + mutex_unlock(&oom_lock); +} + +static int __init setup_reliable_debug(char *str) +{ + if (*str++ != '=' || !*str) + /* + * No options specified. + */ + goto out; + + /* + * Determine which debug features should be switched on + */ + for (; *str && *str != ','; str++) { + switch (*str) { + case 'F': + mem_reliable_feature_disable(MEM_RELIABLE_FALLBACK); + break; + case 'S': + mem_reliable_feature_disable(MEM_RELIABLE_SHMEM); + break; + case 'P': + mem_reliable_feature_disable(MEM_RELIABLE_PAGECACHE); + break; + default: + pr_err("reliable_debug option '%c' unknown. skipped\n", + *str); + } + } + +out: + return 1; +} +__setup("reliable_debug", setup_reliable_debug); + +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) +void reliable_report_usage(struct seq_file *m, struct mm_struct *mm) { if (!mem_reliable_is_enabled()) return; - seq_printf(m, "ReliableTotal: %8lu kB\n", - total_reliable_mem_sz() >> 10); - seq_printf(m, "ReliableUsed: %8lu kB\n", - used_reliable_mem_sz() >> 10); + SEQ_PUT_DEC("Reliable:\t", atomic_long_read(&mm->reliable_nr_page)); + seq_puts(m, "kB\n"); } diff --git a/mm/memblock.c b/mm/memblock.c index 53e92fc7ef6f1869cc4933ead7f632428aa0c13b..047947c9ed8de3cde217553ed47aceda08d02863 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -968,6 +968,10 @@ static bool should_skip_region(struct memblock_type *type, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) return true; + /* skip mirror memory regions with MEMBLOCK_NOMIRROR */ + if ((flags & MEMBLOCK_NOMIRROR) && memblock_is_mirror(m)) + return true; + /* skip nomap memory unless we were asked for it explicitly */ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) return true; @@ -1386,6 +1390,76 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, return found; } +/** + * memblock_alloc_range_nid - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (phys address) + * @end: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes + * @flags: alloc memory with specify flag + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. 
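+ * Memory regions whose memblock flags conflict with @flags are skipped; for
+ * example, mirrored regions are skipped when %MEMBLOCK_NOMIRROR is passed.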
+ * + * If the specified node can not hold the requested memory and @exact_nid + * is false, the allocation falls back to any node in the system. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for + * allocated boot memory block, so that it is never reported as leaks. + * + * Return: + * Physical address of allocated memory block on success, %0 on failure. + */ +phys_addr_t __init memblock_alloc_range_nid_flags(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid, + bool exact_nid, + enum memblock_flags flags) +{ + phys_addr_t found; + + if (WARN_ONCE( + nid == MAX_NUMNODES, + "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + if (!align) { + /* Can't use WARNs this early in boot on powerpc */ + dump_stack(); + align = SMP_CACHE_BYTES; + } + + found = memblock_find_in_range_node(size, align, start, end, nid, + flags); + if (found && !memblock_reserve(found, size)) + goto done; + + if (nid != NUMA_NO_NODE && !exact_nid) { + found = memblock_find_in_range_node(size, align, start, + end, NUMA_NO_NODE, + flags); + if (found && !memblock_reserve(found, size)) + goto done; + } + + return 0; + +done: + /* Skip kmemleak for kasan_init() due to high volume. */ + if (end != MEMBLOCK_ALLOC_KASAN) + /* + * The min_count is set to 0 so that memblock allocated + * blocks are never reported as leaks. This is because many + * of these blocks are only referred via the physical + * address which is not looked up by kmemleak. + */ + kmemleak_alloc_phys(found, size, 0, 0); + + return found; +} + /** * memblock_phys_alloc_range - allocate a memory block inside specified range * @size: size of memory block to be allocated in bytes @@ -1541,6 +1615,39 @@ void * __init memblock_alloc_try_nid_raw( false); } +void * __init memblock_alloc_try_nid_raw_flags( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid, enum memblock_flags flags) +{ + phys_addr_t alloc; + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + + alloc = memblock_alloc_range_nid_flags(size, align, min_addr, max_addr, + nid, false, flags); + + /* retry allocation without lower limit */ + if (!alloc && min_addr) + alloc = memblock_alloc_range_nid_flags(size, align, 0, max_addr, + nid, false, flags); + + if (!alloc) + return NULL; + + ptr = phys_to_virt(alloc); + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + /** * memblock_alloc_try_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes diff --git a/mm/memory.c b/mm/memory.c index 3667ec456ace4842245fc32607e0e3c94fcd08a0..e5ad19b8eb60f4bc946376abd1d5ef2813261c0c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -834,6 +834,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma *prealloc = NULL; copy_user_highpage(new_page, page, addr, src_vma); __SetPageUptodate(new_page); + reliable_page_counter(new_page, dst_vma->vm_mm, 1); page_add_new_anon_rmap(new_page, dst_vma, addr, false); lru_cache_add_inactive_or_unevictable(new_page, dst_vma); rss[mm_counter(new_page)]++; @@ -1273,6 +1274,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); } rss[mm_counter(page)]--; + reliable_page_counter(page, mm, -1); page_remove_rmap(page, 
false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); @@ -1300,6 +1302,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + reliable_page_counter(page, mm, -1); rss[mm_counter(page)]--; page_remove_rmap(page, false); put_page(page); @@ -1664,6 +1667,7 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, /* Ok, finally just insert the thing.. */ get_page(page); inc_mm_counter_fast(mm, mm_counter_file(page)); + reliable_page_counter(page, mm, 1); page_add_file_rmap(page, false); set_pte_at(mm, addr, pte, mk_pte(page, prot)); return 0; @@ -2942,9 +2946,12 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } + reliable_page_counter(old_page, mm, -1); } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } + + reliable_page_counter(new_page, mm, 1); flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); @@ -3514,6 +3521,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { @@ -3682,6 +3690,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(page, vma); setpte: @@ -3876,6 +3885,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); + reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); page_add_file_rmap(page, true); /* * deposit and withdraw with pmd lock held @@ -3948,6 +3958,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ + reliable_page_counter(page, vma->vm_mm, 1); if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); @@ -5428,6 +5439,7 @@ vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, if (ret) goto release; inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(page, vma->vm_mm, 1); page_add_new_anon_rmap(page, vma, address, false); lru_cache_add_inactive_or_unevictable(page, vma); diff --git a/mm/migrate.c b/mm/migrate.c index 6cd51f3817b6b03d5d600fa9a0649b61bfcca274..1f78410a10635a5effa89cc09c86c85e079f1b2b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -269,6 +269,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, { set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + reliable_page_counter(new, vma->vm_mm, 1); if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else @@ -481,6 +482,11 @@ int migrate_page_move_mapping(struct address_space *mapping, xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ + if (PageSwapBacked(page) && !PageSwapCache(page)) { + shmem_reliable_page_counter(page, -nr); + shmem_reliable_page_counter(newpage, nr); + } + /* * If moved to a different zone then also account * the page for 
that zone. Other VM counters will be @@ -2200,6 +2206,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * new page and page_add_new_anon_rmap guarantee the copy is * visible before the pagetable update. */ + reliable_page_counter(new_page, vma->vm_mm, HPAGE_PMD_NR); page_add_anon_rmap(new_page, vma, start, true); /* * At this point the pmd is numa/protnone (i.e. non present) and the TLB @@ -2217,6 +2224,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, page_ref_unfreeze(page, 2); mlock_migrate_page(new_page, page); + reliable_page_counter(page, vma->vm_mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); @@ -2461,6 +2469,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, * drop page refcount. Page won't be freed, as we took * a reference just above. */ + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); put_page(page); @@ -2953,6 +2962,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, 1); page_add_new_anon_rmap(page, vma, addr, false); if (!is_zone_device_page(page)) lru_cache_add_inactive_or_unevictable(page, vma); diff --git a/mm/mmap.c b/mm/mmap.c index 515d668e130170ff5656c76ac7d162dc34a091e7..1859f39d2af8e9efda6bbb4da7f83a84773fc727 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1751,6 +1751,7 @@ do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len, set_pte(pte, swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY, page_to_pfn(page)))); dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); put_page(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff6fffec8770767e68c93c9716cb6394e28abd02..24116b7828f525b249ed241cbee15a7d0b8df68c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4664,6 +4664,28 @@ check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) return false; } +#ifdef CONFIG_MEMORY_RELIABLE +static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask, + struct alloc_context *ac) +{ + if (!reliable_allow_fb_enabled()) + return; + + if (gfp_mask & __GFP_NOFAIL) + return; + + if ((ac->highest_zoneidx == ZONE_NORMAL) && (gfp_mask & GFP_RELIABLE)) { + ac->highest_zoneidx = gfp_zone(gfp_mask & ~GFP_RELIABLE); + ac->preferred_zoneref = first_zones_zonelist( + ac->zonelist, ac->highest_zoneidx, ac->nodemask); + return; + } +} +#else +static inline void mem_reliable_fallback_slowpath(gfp_t gfp_mask, + struct alloc_context *ac) {} +#endif + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -4715,6 +4737,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); + mem_reliable_fallback_slowpath(gfp_mask, ac); + /* * The adjusted alloc_flags might result in immediate success, so try * that first @@ -5144,11 +5168,112 @@ EXPORT_SYMBOL_GPL(__alloc_pages_bulk); static inline void prepare_before_alloc(gfp_t *gfp_mask) { + bool zone_movable; + if (!mem_reliable_is_enabled()) + goto clear_flag; + + /* + * memory reliable only handle memory allocation from movable zone + * (force alloc from non-movable zone or force alloc from movable + * zone) to get total isolation. 
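+ * Allocations whose gfp zone is not ZONE_MOVABLE have GFP_RELIABLE cleared
+ * below and are handled as ordinary allocations.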
+ */ + zone_movable = gfp_zone(*gfp_mask & ~GFP_RELIABLE) == ZONE_MOVABLE; + if (!zone_movable) + goto clear_flag; + + if (!in_task()) return; if ((current->flags & PF_RELIABLE) || is_global_init(current)) *gfp_mask |= GFP_RELIABLE; + + return; +clear_flag: + *gfp_mask &= ~GFP_RELIABLE; +} + +static inline long mem_reliable_direct_reclaim(int nr_pages, struct alloc_context *ac) +{ + long nr_reclaimed = 0; + + while (nr_reclaimed < nr_pages) { + /* try to free cache from reliable region */ + long progress = __perform_reclaim(GFP_KERNEL, 0, ac); + + nr_reclaimed += progress; + if (progress < SWAP_CLUSTER_MAX) + break; + } + + return nr_reclaimed; +} + +/* + * return true means memory allocation need retry and flag ___GFP_RELIABILITY + * must be cleared. + */ +static inline bool check_after_alloc(gfp_t *gfp, unsigned int order, + int preferred_nid, + struct alloc_context *ac, + struct page **_page) +{ + int retry_times = MAX_RECLAIM_RETRIES; + int nr_pages; + + if (!mem_reliable_is_enabled()) + return false; + + if (!(*gfp & GFP_RELIABLE)) + return false; + + if (!*_page) + goto out_retry; + + if (*gfp & __GFP_NOFAIL || current->flags & PF_MEMALLOC) + goto out; + + /* percpu counter is not initialized, ignore limit check */ + if (!mem_reliable_counter_initialized()) + goto out; + +limit_check: + /* user task is limited by task_reliable_limit */ + if (!reliable_mem_limit_check(1 << order)) + goto out_free_page; + + goto out; + +out_free_page: + if (mem_reliable_should_reclaim() && retry_times--) { + nr_pages = mem_reliable_direct_reclaim(1 << order, ac); + if (nr_pages) + goto limit_check; + } + + __free_pages(*_page, order); + *_page = NULL; + +out_retry: + if (reliable_allow_fb_enabled() || is_global_init(current)) { + *gfp &= ~GFP_RELIABLE; + return true; + } + + if (*gfp & (__GFP_NORETRY | __GFP_RETRY_MAYFAIL | __GFP_THISNODE)) + goto out; + + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + + /* oom here */ + mem_reliable_out_of_memory(*gfp, order, preferred_nid, ac->nodemask); +out: + return false; } /* @@ -5175,6 +5300,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, prepare_before_alloc(&gfp); +retry: alloc_gfp = gfp; if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) @@ -5220,6 +5346,9 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, page = NULL; } + if (check_after_alloc(&gfp, order, preferred_nid, &ac, &page)) + goto retry; + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); return page; @@ -7525,10 +7654,11 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; bool has_unmirrored_mem = false; + unsigned long mirrored_sz = 0; for_each_mem_region(r) { if (memblock_is_mirror(r)) { - add_reliable_mem_size(r->size); + mirrored_sz += r->size; continue; } @@ -7550,7 +7680,8 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mem_below_4gb_not_mirrored) pr_warn("This configuration results in unmirrored kernel memory.\n"); - mem_reliable_init(has_unmirrored_mem, zone_movable_pfn); + mem_reliable_init(has_unmirrored_mem, zone_movable_pfn, + mirrored_sz); goto out2; } diff --git a/mm/rmap.c b/mm/rmap.c index 0dc39cf94345da8f16d51616b88b33ba60268d6b..9719d73bd5fc5f5c630604dfaee376bbb0e7d116 100644 --- a/mm/rmap.c +++ b/mm/rmap.c 
@@ -1591,6 +1591,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + reliable_page_counter(page, mm, -1); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1606,6 +1607,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * copied pages. */ dec_mm_counter(mm, mm_counter(page)); + reliable_page_counter(page, mm, -1); /* We have to invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1685,6 +1687,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); goto discard; } @@ -1718,6 +1721,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); + reliable_page_counter(page, mm, -1); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) @@ -1740,6 +1744,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(page)); + reliable_page_counter(page, mm, -1); } discard: /* diff --git a/mm/shmem.c b/mm/shmem.c index ad2d68150ed2f4d8f165c11e05ba0129df73ea01..e85ac8c2150f47a3ece917699424bde6b940a16a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -752,6 +752,7 @@ static int shmem_add_to_page_cache(struct page *page, mapping->nrpages += nr; __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SHMEM, nr); + shmem_reliable_page_counter(page, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -784,6 +785,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) mapping->nrpages--; __dec_lruvec_page_state(page, NR_FILE_PAGES); __dec_lruvec_page_state(page, NR_SHMEM); + shmem_reliable_page_counter(page, -1); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); @@ -1559,12 +1561,20 @@ static struct page *shmem_alloc_page(gfp_t gfp, return page; } -static inline void shmem_prepare_alloc(gfp_t *gfp_mask) +static inline bool shmem_prepare_alloc(gfp_t *gfp_mask) { if (!shmem_reliable_is_enabled()) - return; + return true; + + if (mem_reliable_shmem_limit_check()) { + *gfp_mask |= GFP_RELIABLE; + return true; + } + + if (reliable_allow_fb_enabled()) + return true; - *gfp_mask |= GFP_RELIABLE; + return false; } static struct page *shmem_alloc_and_acct_page(gfp_t gfp, @@ -1583,7 +1593,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, if (!shmem_inode_acct_block(inode, nr)) goto failed; - shmem_prepare_alloc(&gfp); + if (!shmem_prepare_alloc(&gfp)) + goto no_mem; if (huge) page = shmem_alloc_hugepage(gfp, info, index, node_id); @@ -1595,6 +1606,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, return page; } +no_mem: err = -ENOMEM; shmem_inode_unacct_blocks(inode, nr); failed: @@ -2455,6 +2467,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, spin_unlock_irq(&info->lock); inc_mm_counter(dst_mm, mm_counter_file(page)); + reliable_page_counter(page, dst_mm, 1); page_add_file_rmap(page, false); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); diff --git a/mm/swapfile.c b/mm/swapfile.c index eaf483c7c83e7691297d12b818bfe20ffa1104e8..7faa30f460e40c6c68e4a5c32b441a17f5702056 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1935,6 +1935,8 @@ 
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); + + reliable_page_counter(page, vma->vm_mm, 1); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3849b28c09527f1174495b2529aa877b204616fc..15c46208a2accb1b097f9ff6412b2b63f7400c4a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -150,6 +150,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, #endif inc_mm_counter(dst_mm, MM_ANONPAGES); + reliable_page_counter(page, dst_mm, 1); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); lru_cache_add_inactive_or_unevictable(page, dst_vma); diff --git a/mm/vmscan.c b/mm/vmscan.c index d96f52b2fbe00fb116de94d2fd9e0973c5210b39..e2a73071c720cdec36111bd962f74ab7d9b2e937 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1813,6 +1813,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, continue; update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + reliable_lru_add_batch(zid, lru, -nr_zone_taken[zid]); } } @@ -2082,6 +2083,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_add(&page->lru, &lruvec->lists[lru]); + reliable_lru_add(lru, page, nr_pages); nr_moved += nr_pages; if (PageActive(page)) workingset_age_nonresident(lruvec, nr_pages);
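
The hunks above follow one calling convention: wherever an mm rss counter is adjusted for a mapped page, a matching reliable_page_counter() call is added with the same signed page count, and the shmem page-cache insert/delete and migration paths get the analogous shmem_reliable_page_counter() call. The helper itself is declared in include/linux/mem_reliable.h and is not part of these hunks; the sketch below only illustrates the convention assumed by the callers. The page_reliable() check and the mm->reliable_nr_page field are assumptions made for this example and may not match the real definition.

/*
 * Illustrative sketch, not taken from this patch: account the page to the
 * owning mm only when it actually sits in the mirrored region (a zone below
 * ZONE_MOVABLE), so callers can pass the same +/-nr they already pass to
 * the rss counters.
 */
static inline void reliable_page_counter(struct page *page,
					 struct mm_struct *mm, int val)
{
	if (page_reliable(page))	/* assumed: true below ZONE_MOVABLE */
		atomic_long_add(val, &mm->reliable_nr_page); /* assumed field */
}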