diff --git a/arch/x86/kvm/svm/csv.c b/arch/x86/kvm/svm/csv.c
index 25e86a8c4c5e87964c78254e17e0853b383599d5..27031f30271b5143e93cafa16b9e270820e0754a 100644
--- a/arch/x86/kvm/svm/csv.c
+++ b/arch/x86/kvm/svm/csv.c
@@ -79,7 +79,7 @@ int csv_vm_attestation(struct kvm *kvm, unsigned long gpa, unsigned long len)
 	}
 
 	guest_uaddr = gfn_to_hva(kvm, gpa_to_gfn(gpa));
-	pages = hygon_kvm_hooks.sev_pin_memory(kvm, guest_uaddr, len, &n, 1);
+	pages = hygon_kvm_hooks.sev_pin_memory(kvm, guest_uaddr, len, &n, FOLL_WRITE);
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
@@ -404,7 +404,7 @@ csv_receive_update_data_to_ringbuf(struct kvm *kvm,
 
 	/* Pin guest memory */
 	guest_page = hygon_kvm_hooks.sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
-						    PAGE_SIZE, &n, 1);
+						    PAGE_SIZE, &n, FOLL_WRITE);
 	if (IS_ERR(guest_page)) {
 		ret = PTR_ERR(guest_page);
 		goto e_free;
@@ -2649,7 +2649,7 @@ static int csv_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	if (!csv3_guest(kvm) ||
 	    !(csv->inuse_ext & KVM_CAP_HYGON_COCO_EXT_CSV3_INJ_SECRET)) {
 		pages = hygon_kvm_hooks.sev_pin_memory(kvm, params.guest_uaddr,
-						       params.guest_len, &n, 1);
+						       params.guest_len, &n, FOLL_WRITE);
 		if (IS_ERR(pages))
 			return PTR_ERR(pages);
 
diff --git a/arch/x86/kvm/svm/csv.h b/arch/x86/kvm/svm/csv.h
index 9b0563062a941aad1e2e35b400706604f7a77e97..fca2c43374a8a5e22ce6e5309e68c2517d4df876 100644
--- a/arch/x86/kvm/svm/csv.h
+++ b/arch/x86/kvm/svm/csv.h
@@ -61,7 +61,7 @@ extern struct hygon_kvm_hooks_table {
 				   unsigned long npages);
 	struct page **(*sev_pin_memory)(struct kvm *kvm, unsigned long uaddr,
 					unsigned long ulen, unsigned long *n,
-					int write);
+					unsigned int flags);
 	void (*sev_unpin_memory)(struct kvm *kvm, struct page **pages,
 				 unsigned long npages);
 	void (*sev_clflush_pages)(struct page *pages[], unsigned long npages);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 12e88c2d948c9e2d055aadbeff6399b04365be60..f2e1326c09afdb9adaf2b92141dbdf58a42f4e75 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -508,7 +508,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
 				    unsigned long ulen, unsigned long *n,
-				    int write)
+				    unsigned int flags)
 {
 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
 	unsigned long npages, size;
@@ -516,7 +516,6 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
 	unsigned long locked, lock_limit;
 	struct page **pages;
 	unsigned long first, last;
-	unsigned int flags = 0;
 	int ret;
 
 	lockdep_assert_held(&kvm->lock);
@@ -549,10 +548,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
 	if (!pages)
 		return ERR_PTR(-ENOMEM);
 
-	flags = write ? FOLL_WRITE : 0;
-
 	/* Pin the user virtual address. */
-	npinned = pin_user_pages_fast(uaddr, npages, flags | FOLL_LONGTERM, pages);
+	npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
 	if (npinned != npages) {
 		pr_err("SEV: Failure locking %lu pages.\n", npages);
 		ret = -ENOMEM;
@@ -640,7 +637,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	vaddr_end = vaddr + size;
 
 	/* Lock the user memory. */
-	inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+	inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
 	if (IS_ERR(inpages))
 		return PTR_ERR(inpages);
 
@@ -1103,7 +1100,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
 	if (IS_ERR(src_p))
 		return PTR_ERR(src_p);
 
-	dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+	dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
 	if (IS_ERR(dst_p)) {
 		sev_unpin_memory(kvm, src_p, n);
 		return PTR_ERR(dst_p);
@@ -1169,7 +1166,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
 		return -EFAULT;
 
-	pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+	pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
@@ -1651,7 +1648,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
 	/* Pin guest memory */
 	guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
-				    PAGE_SIZE, &n, 1);
+				    PAGE_SIZE, &n, FOLL_WRITE);
 	if (IS_ERR(guest_page)) {
 		ret = PTR_ERR(guest_page);
 		goto e_free_trans;
@@ -2109,7 +2106,8 @@ int sev_mem_enc_register_region(struct kvm *kvm,
 		return -ENOMEM;
 
 	mutex_lock(&kvm->lock);
-	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
+				       FOLL_WRITE | FOLL_LONGTERM);
 	if (IS_ERR(region->pages)) {
 		ret = PTR_ERR(region->pages);
 		mutex_unlock(&kvm->lock);
diff --git a/arch/x86/mm/mem_encrypt_hygon.c b/arch/x86/mm/mem_encrypt_hygon.c
index 52ec3fa041feb72217567e02a8b2551b62833dca..da42e32f66e0a984e8650c9cc9302fc6d3136502 100644
--- a/arch/x86/mm/mem_encrypt_hygon.c
+++ b/arch/x86/mm/mem_encrypt_hygon.c
@@ -283,7 +283,6 @@ static void __init csv_cma_reserve_mem(void)
 				1 << CSV_CMA_SHIFT, node);
 			break;
 		}
-		cma_enable_concurrency(csv_cma->cma);
 
 		if (start > cma_get_base(csv_cma->cma) || !start)
 			start = cma_get_base(csv_cma->cma);
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 010c89f4b7727323ff690664a69ed885f771b838..18c8d6495f0899a5bdd815aa13fe6ffba1ef8b07 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -58,5 +58,4 @@ extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
 
 extern void cma_reserve_pages_on_error(struct cma *cma);
 extern int __init cma_alloc_areas(unsigned int max_cma_size);
-extern void cma_enable_concurrency(struct cma *cma);
 #endif
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fc2023d07f6931154baa44c9c2bb1a8736ef43fb..a3b0ad17c417c736a40522c3b559aa06fea570e5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -745,6 +745,8 @@ struct huge_bootmem_page {
 };
 
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
+void wait_for_freed_hugetlb_folios(void);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1049,6 +1051,16 @@ static inline int isolate_or_dissolve_huge_page(struct page *page,
 	return -ENOMEM;
 }
 
+static inline int replace_free_hugepage_folios(unsigned long start_pfn,
+					       unsigned long end_pfn)
+{
+	return 0;
+}
+
+static inline void wait_for_freed_hugetlb_folios(void)
+{
+}
+
 static inline
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve)
diff --git a/mm/cma.c b/mm/cma.c
index 304a4e69180c98cb52787d3ef648417aff28517f..e523ba62a6f967dea1ef3ab88064c8da5dbfc1fc 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -41,7 +41,6 @@ static unsigned int cma_areas_size = MAX_CMA_AREAS;
 
 struct cma *cma_areas = cma_areas_data;
 unsigned cma_area_count;
-static DEFINE_MUTEX(cma_mutex);
 
 phys_addr_t cma_get_base(const struct cma *cma)
 {
@@ -125,6 +124,8 @@ static void __init cma_activate_area(struct cma *cma)
 
 	spin_lock_init(&cma->lock);
 
+	mutex_init(&cma->alloc_mutex);
+
 #ifdef CONFIG_CMA_DEBUGFS
 	INIT_HLIST_HEAD(&cma->mem_head);
 	spin_lock_init(&cma->mem_head_lock);
@@ -492,12 +493,10 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
 		spin_unlock_irq(&cma->lock);
 
 		pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
-		if (!cma->no_mutex)
-			mutex_lock(&cma_mutex);
+		mutex_lock(&cma->alloc_mutex);
 		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
				GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
-		if (!cma->no_mutex)
-			mutex_unlock(&cma_mutex);
+		mutex_unlock(&cma->alloc_mutex);
 		if (ret == 0) {
 			page = pfn_to_page(pfn);
 			break;
@@ -611,11 +610,3 @@ int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
 
 	return 0;
 }
-
-void cma_enable_concurrency(struct cma *cma)
-{
-	if (!cma)
-		return;
-
-	cma->no_mutex = true;
-}
diff --git a/mm/cma.h b/mm/cma.h
index 50275c1d98cc656497018577da57bb9417e47846..f63cd31123a99f11b92ffff5d36226e817caf2a2 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -16,7 +16,7 @@ struct cma {
 	unsigned long *bitmap;
 	unsigned int order_per_bit; /* Order of pages represented by one bit */
 	spinlock_t	lock;
-	bool no_mutex;
+	struct mutex alloc_mutex;
 #ifdef CONFIG_CMA_DEBUGFS
 	struct hlist_head mem_head;
 	spinlock_t mem_head_lock;
diff --git a/mm/compaction.c b/mm/compaction.c
index 8b889bee2ace50ed50e45ddbb9985691a2cf6213..eb77b1456012d1ccd2ddbd711d1380b39f3fa707 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2377,6 +2377,48 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 	return false;
 }
 
+/*
+ * Should we do compaction for target allocation order.
+ * Return COMPACT_SUCCESS if allocation for target order can be already
+ * satisfied
+ * Return COMPACT_SKIPPED if compaction for target order is likely to fail
+ * Return COMPACT_CONTINUE if compaction for target order should be run
+ */
+static enum compact_result
+compaction_suit_allocation_order(struct zone *zone, unsigned int order,
+				 int highest_zoneidx, unsigned int alloc_flags,
+				 bool async)
+{
+	unsigned long watermark;
+
+	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+	if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+			      alloc_flags))
+		return COMPACT_SUCCESS;
+
+	/*
+	 * For unmovable allocations (without ALLOC_CMA), check if there is enough
+	 * free memory in the non-CMA pageblocks. Otherwise compaction could form
+	 * the high-order page in CMA pageblocks, which would not help the
+	 * allocation to succeed. However, limit the check to costly order async
+	 * compaction (such as opportunistic THP attempts) because there is the
+	 * possibility that compaction would migrate pages from non-CMA to CMA
+	 * pageblock.
+	 */
+	if (order > PAGE_ALLOC_COSTLY_ORDER && async &&
+	    !(alloc_flags & ALLOC_CMA)) {
+		watermark = low_wmark_pages(zone) + compact_gap(order);
+		if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+					 0, zone_page_state(zone, NR_FREE_PAGES)))
+			return COMPACT_SKIPPED;
+	}
+
+	if (!compaction_suitable(zone, order, highest_zoneidx))
+		return COMPACT_SKIPPED;
+
+	return COMPACT_CONTINUE;
+}
+
 static enum compact_result
 compact_zone(struct compact_control *cc, struct capture_control *capc)
 {
@@ -2402,19 +2444,12 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	cc->migratetype = gfp_migratetype(cc->gfp_mask);
 
 	if (!is_via_compact_memory(cc->order)) {
-		unsigned long watermark;
-
-		/* Allocation can already succeed, nothing to do */
-		watermark = wmark_pages(cc->zone,
-					cc->alloc_flags & ALLOC_WMARK_MASK);
-		if (zone_watermark_ok(cc->zone, cc->order, watermark,
-				      cc->highest_zoneidx, cc->alloc_flags))
-			return COMPACT_SUCCESS;
-
-		/* Compaction is likely to fail */
-		if (!compaction_suitable(cc->zone, cc->order,
-					 cc->highest_zoneidx))
-			return COMPACT_SKIPPED;
+		ret = compaction_suit_allocation_order(cc->zone, cc->order,
+						       cc->highest_zoneidx,
+						       cc->alloc_flags,
+						       cc->mode == MIGRATE_ASYNC);
+		if (ret != COMPACT_CONTINUE)
+			return ret;
 	}
 
 	/*
@@ -2908,6 +2943,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
 	int zoneid;
 	struct zone *zone;
 	enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
+	enum compact_result ret;
 
 	for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
 		zone = &pgdat->node_zones[zoneid];
@@ -2915,14 +2951,11 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
 		if (!populated_zone(zone))
 			continue;
 
-		/* Allocation can already succeed, check other zones */
-		if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
-				      min_wmark_pages(zone),
-				      highest_zoneidx, 0))
-			continue;
-
-		if (compaction_suitable(zone, pgdat->kcompactd_max_order,
-					highest_zoneidx))
+		ret = compaction_suit_allocation_order(zone,
+				pgdat->kcompactd_max_order,
+				highest_zoneidx, ALLOC_WMARK_MIN,
+				false);
+		if (ret == COMPACT_CONTINUE)
 			return true;
 	}
 
@@ -2945,6 +2978,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 		.ignore_skip_hint = false,
 		.gfp_mask = GFP_KERNEL,
 	};
+	enum compact_result ret;
+
 	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
							cc.highest_zoneidx);
 	count_compact_event(KCOMPACTD_WAKE);
@@ -2959,12 +2994,10 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 		if (compaction_deferred(zone, cc.order))
 			continue;
 
-		/* Allocation can already succeed, nothing to do */
-		if (zone_watermark_ok(zone, cc.order,
-				      min_wmark_pages(zone), zoneid, 0))
-			continue;
-
-		if (!compaction_suitable(zone, cc.order, zoneid))
+		ret = compaction_suit_allocation_order(zone,
+				cc.order, zoneid, ALLOC_WMARK_MIN,
+				false);
+		if (ret != COMPACT_CONTINUE)
 			continue;
 
 		if (kthread_should_stop())
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c196b754071bd979f2ef651860745da00f59095..60d3def1342d19fcf5ebfcb62fbe45fb605c9902 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -47,6 +47,7 @@
 #include
 #include "internal.h"
 #include "hugetlb_vmemmap.h"
+#include
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
@@ -1341,6 +1342,9 @@
 		if (folio_test_hwpoison(folio))
 			continue;
 
+		if (is_migrate_isolate_page(&folio->page))
+			continue;
+
 		list_move(&folio->lru, &h->hugepage_activelist);
 		folio_ref_unfreeze(folio, 1);
 		folio_clear_hugetlb_freed(folio);
@@ -3014,6 +3018,52 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
 	return ret;
 }
 
+/*
+ * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
+ * range with new folios.
+ * @start_pfn: start pfn of the given pfn range
+ * @end_pfn: end pfn of the given pfn range
+ * Returns 0 on success, otherwise negated error.
+ */
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct hstate *h;
+	struct folio *folio;
+	int ret = 0;
+
+	LIST_HEAD(isolate_list);
+
+	while (start_pfn < end_pfn) {
+		folio = pfn_folio(start_pfn);
+		if (folio_test_hugetlb(folio)) {
+			h = folio_hstate(folio);
+		} else {
+			start_pfn++;
+			continue;
+		}
+
+		if (!folio_ref_count(folio)) {
+			ret = alloc_and_dissolve_hugetlb_folio(h, folio,
+							       &isolate_list);
+			if (ret)
+				break;
+
+			putback_movable_pages(&isolate_list);
+		}
+		start_pfn++;
+	}
+
+	return ret;
+}
+
+void wait_for_freed_hugetlb_folios(void)
+{
+	if (llist_empty(&hpage_freelist))
+		return;
+
+	flush_work(&free_hpage_work);
+}
+
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 {
 	struct hstate *h;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4b9734777f698fc789acf1928ce4fdf29e0c49b6..446e9fc723ecc07ea73776e43a65ec41e94ee53c 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -380,7 +380,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
 				   struct list_head *list)
 {
-	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
+	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
 	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
 	int nid = page_to_nid((struct page *)start);
 	struct page *page, *next;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 786648d205f19a8758422e740b910af902efd467..aa292df1c28275f2fd9b6f1d16a77ff693464856 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6430,7 +6430,17 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	ret = __alloc_contig_migrate_range(&cc, start, end);
 	if (ret && ret != -EBUSY)
 		goto done;
-	ret = 0;
+
+	/*
+	 * When in-use hugetlb pages are migrated, they may simply be released
+	 * back into the free hugepage pool instead of being returned to the
+	 * buddy system. After the migration of in-use huge pages is completed,
+	 * we will invoke replace_free_hugepage_folios() to ensure that these
+	 * hugepages are properly released to the buddy system.
+	 */
+	ret = replace_free_hugepage_folios(start, end);
+	if (ret)
+		goto done;
 
 	/*
 	 * Pages from [start, end) are within a pageblock_nr_pages
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index bcf99ba747a05aa9e0015ab431fba20c768d7c29..b8ae5fa32b1dbfa4b15b3eaf0647ce666f1cbc15 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -659,6 +659,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 	struct zone *zone;
 	int ret;
 
+	/*
+	 * Due to the deferred freeing of hugetlb folios, the hugepage folios may
+	 * not be immediately released to the buddy system. This can cause
+	 * PageBuddy() to fail in __test_page_isolated_in_pageblock(). To ensure
+	 * that the hugetlb folios are properly released back to the buddy system,
+	 * we invoke the wait_for_freed_hugetlb_folios() function to wait for the
+	 * release to complete.
+	 */
+	wait_for_freed_hugetlb_folios();
+
 	/*
 	 * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
 	 * are not aligned to pageblock_nr_pages.