From d302db40a6cdd8ef5cd33611340aaceeda81618e Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Thu, 30 Nov 2023 14:17:25 +0800 Subject: [PATCH 01/33] net: sfp: Synchronize some CMIS transceiver modules from ethtool driver inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9A3QT CVE: NA ---------------------------------------------------------------------- Currently, the SFF-8024 Identifier Values that according to the standard support for the Common Management Interface Specification (CMIS) based on standard identifier values in the ethtool is more than in the kernel. When the driver needs to use a newer Identifier Value, the kernel interface does not support it. Therefore, we synchronize the CMIS mode Identifier Values which supported by the ethtool to the kernel. Signed-off-by: Hao Lan Signed-off-by: Jiantao Xiao Signed-off-by: hongrongxuan --- include/linux/sfp.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 73f573b33465..30438213ec5b 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -284,6 +284,18 @@ enum { SFF8024_ID_QSFP_8438 = 0x0c, SFF8024_ID_QSFP_8436_8636 = 0x0d, SFF8024_ID_QSFP28_8636 = 0x11, + SFF8024_ID_CXP2 = 0x12, + SFF8024_ID_CDFP = 0x13, + SFF8024_ID_HD4X_FANOUT = 0x14, + SFF8024_ID_HD8X_FANOUT = 0x15, + SFF8024_ID_CDFP_S3 = 0x16, + SFF8024_ID_MICRO_QSFP = 0x17, + SFF8024_ID_QSFP_DD = 0x18, + SFF8024_ID_OSFP = 0x19, + SFF8024_ID_DSFP = 0x1B, + SFF8024_ID_QSFP_PLUS_CMIS = 0x1E, + SFF8024_ID_SFP_DD_CMIS = 0x1F, + SFF8024_ID_SFP_PLUS_CMIS = 0x20, SFF8024_ENCODING_UNSPEC = 0x00, SFF8024_ENCODING_8B10B = 0x01, -- Gitee From f95d7d121ec318ef688387bf1975899c21758c29 Mon Sep 17 00:00:00 2001 From: hongrongxuan Date: Mon, 29 Apr 2024 23:39:27 -0400 Subject: [PATCH 02/33] arm64: Add HWCAP for Data Gathering Hint instruction --- Documentation/arm64/cpu-feature-registers.rst | 2 ++ Documentation/arm64/elf_hwcaps.rst | 3 +++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/cpuinfo.c | 1 + 7 files changed, 11 insertions(+) diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst index 40adbe815bb4..30e71739233f 100644 --- a/Documentation/arm64/cpu-feature-registers.rst +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -193,6 +193,8 @@ infrastructure: +------------------------------+---------+---------+ | Name | bits | visible | +------------------------------+---------+---------+ + | DGH | [51-48] | y | + +------------------------------+---------+---------+ | GPI | [31-28] | y | +------------------------------+---------+---------+ | GPA | [27-24] | y | diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index 6afc1d12c74d..ecebe0b3ae46 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -206,6 +206,9 @@ HWCAP2_ECV Functionality implied by ID_AA64MMFR0_EL1.ECV == 0b0001. +HWCAP2_DGH + Functionality implied by ID_AA64ISAR1_EL1.DGH == 0b0001. + 4. 
Unused AT_HWCAP bits ----------------------- diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a700f877f4c8..98b9359ceff1 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -87,6 +87,7 @@ #define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2) #define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT) #define KERNEL_HWCAP_ECV __khwcap2_feature(ECV) +#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 49e89d540974..60c6affb9135 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -567,6 +567,7 @@ #define ID_AA64ISAR0_TLB_RANGE 0x2 /* id_aa64isar1 */ +#define ID_AA64ISAR1_DGH_SHIFT 48 #define ID_AA64ISAR1_SB_SHIFT 36 #define ID_AA64ISAR1_FRINTTS_SHIFT 32 #define ID_AA64ISAR1_GPI_SHIFT 28 diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 365a7feb43be..f36f9eb8dac2 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -66,5 +66,6 @@ #define HWCAP2_FLAGM2 (1 << 7) #define HWCAP2_FRINT (1 << 8) #define HWCAP2_ECV (1 << 9) +#define HWCAP2_DGH (1 << 10) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index fffdc0e74ca5..f394047685df 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -140,6 +140,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FRINTTS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), @@ -1714,6 +1715,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FRINTTS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DGH_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DGH), HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, KERNEL_HWCAP_SVE), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index f1b246b48a47..7f3a91ba07b5 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -87,6 +87,7 @@ static const char *const hwcap_str[] = { "flagm2", "frint", "ecv", + "dgh", NULL }; -- Gitee From 8a79983767969ea8953927e4419f90d884db1bd4 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 21 Dec 2021 11:55:56 +0800 Subject: [PATCH 03/33] asm-generic: introduce io_stop_wc() and add implementation for ARM64 For memory accesses with write-combining attributes (e.g. those returned by ioremap_wc()), the CPU may wait for prior accesses to be merged with subsequent ones. But in some situation, such wait is bad for the performance. 
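To make the intended use concrete, below is a minimal, hypothetical sketch of a driver posting descriptors through an ioremap_wc() mapping; io_stop_wc(), added by this patch, marks the point where the CPU should stop waiting for further stores to merge. The my_dev structure, its register names and the helper are illustrative assumptions, not code from this series.

    #include <linux/io.h>           /* writeq(), writel(), __iomem */

    /* Hypothetical device context; wc_base is assumed to have been
     * mapped with ioremap_wc() at probe time, db_reg with ioremap().
     */
    struct my_dev {
            void __iomem *wc_base;  /* write-combining descriptor window */
            void __iomem *db_reg;   /* normal uncached doorbell register */
    };

    static void my_dev_post_descs(struct my_dev *mdev, const u64 *desc, int n)
    {
            int i;

            /* These stores may sit in the CPU's write-combine buffer. */
            for (i = 0; i < n; i++)
                    writeq(desc[i], mdev->wc_base + i * sizeof(u64));

            /* Do not wait for further stores to merge before pushing
             * the buffered ones out: DGH on arm64, no-op elsewhere.
             */
            io_stop_wc();

            /* Doorbell write through a normal device mapping. */
            writel(n, mdev->db_reg);
    }
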
We introduce io_stop_wc() to prevent the merging of write-combining memory accesses before this macro with those after it. We add implementation for ARM64 using DGH instruction and provide NOP implementation for other architectures. Signed-off-by: Xiongfeng Wang Suggested-by: Will Deacon Suggested-by: Catalin Marinas Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211221035556.60346-1-wangxiongfeng2@huawei.com Signed-off-by: Catalin Marinas Signed-off-by: hongrongxuan Conflicts: arch/arm64/include/asm/barrier.h include/asm-generic/barrier.h --- Documentation/memory-barriers.txt | 8 ++++++++ arch/arm64/include/asm/barrier.h | 9 +++++++++ include/asm-generic/barrier.h | 11 +++++++++++ 3 files changed, 28 insertions(+) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 1adbb8a371c7..04c1e99c1624 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1937,6 +1937,14 @@ There are some more advanced barrier functions: information on consistent memory. + (*) io_stop_wc(); + + For memory accesses with write-combining attributes (e.g. those returned + by ioremap_wc(), the CPU may wait for prior accesses to be merged with + subsequent ones. io_stop_wc() can be used to prevent the merging of + write-combining memory accesses before this macro with those after it when + such wait has performance implications. + =============================== IMPLICIT KERNEL MEMORY BARRIERS =============================== diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 0fcd854fc95f..37c17bcd00af 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -28,6 +28,13 @@ #define spec_bar() asm volatile(ALTERNATIVE("dsb nsh\nisb\n", \ SB_BARRIER_INSN"nop\n", \ ARM64_HAS_SB)) +/* + * Data Gathering Hint: + * This instruction prevents merging memory accesses with Normal-NC or + * Device-GRE attributes before the hint instruction with any memory accesses + * appearing after the hint instruction. + */ +#define dgh() asm volatile("hint #6" : : : "memory") #define mb() dsb(sy) #define rmb() dsb(ld) @@ -36,6 +43,8 @@ #define dma_rmb() dmb(oshld) #define dma_wmb() dmb(oshst) +#define io_stop_wc() dgh() + /* * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz * and 0 otherwise. diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 85b28eb80b11..888f35d02a13 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -257,5 +257,16 @@ do { \ }) #endif +/* + * ioremap_wc() maps I/O memory as memory with write-combining attributes. For + * this kind of memory accesses, the CPU may wait for prior accesses to be + * merged with subsequent ones. In some situation, such wait is bad for the + * performance. io_stop_wc() can be used to prevent the merging of + * write-combining memory accesses before this macro with those after it. + */ +#ifndef io_stop_wc +#define io_stop_wc do { } while (0) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ -- Gitee From f9dd6dcc0ab877d35935125adb6b0db22427a17e Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 20 Nov 2019 00:15:17 +0000 Subject: [PATCH 04/33] page_pool: Add API to update numa node Add page_pool_update_nid() to be called by page pool consumers when they detect numa node changes. It will update the page pool nid value to start allocating from the new effective numa node. 
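A minimal sketch of the intended call site, assuming a driver with a per-ring page_pool; the my_rxq structure and my_rxq_clean() helper are hypothetical, only page_pool_nid_changed()/page_pool_update_nid() come from this patch.

    #include <net/page_pool.h>
    #include <linux/netdevice.h>

    /* Hypothetical RX queue owning a page_pool. */
    struct my_rxq {
            struct napi_struct napi;
            struct page_pool *page_pool;
    };

    static int my_rxq_napi_poll(struct napi_struct *napi, int budget)
    {
            struct my_rxq *rxq = container_of(napi, struct my_rxq, napi);
            int done;

            /* NAPI provides the safe, non-concurrent context required by
             * page_pool_update_nid(); the inline check is cheap and only
             * takes the update path when the effective node changed,
             * e.g. after the RX IRQ was migrated to another NUMA node.
             */
            page_pool_nid_changed(rxq->page_pool, numa_mem_id());

            done = my_rxq_clean(rxq, budget);       /* hypothetical RX work */
            if (done < budget)
                    napi_complete_done(napi, done);
            return done;
    }
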
This is to mitigate page pool allocating pages from a wrong numa node, where the pool was originally allocated, and holding on to pages that belong to a different numa node, which causes performance degradation. For pages that are already being consumed and could be returned to the pool by the consumer, in next patch we will add a check per page to avoid recycling them back to the pool and return them to the page allocator. Signed-off-by: Saeed Mahameed Acked-by: Jonathan Lemon Reviewed-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: hongrongxuan --- include/net/page_pool.h | 7 +++++++ include/trace/events/page_pool.h | 22 ++++++++++++++++++++++ net/core/page_pool.c | 8 ++++++++ 3 files changed, 37 insertions(+) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index cf086e13bd25..17e59e9e8064 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -212,4 +212,11 @@ static inline bool page_pool_put(struct page_pool *pool) return refcount_dec_and_test(&pool->user_cnt); } +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid); +static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) +{ + if (unlikely(pool->p.nid != new_nid)) + page_pool_update_nid(pool, new_nid); +} #endif /* _NET_PAGE_POOL_H */ diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index 47b5ee880aa9..b58b6a3a3e57 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -81,6 +81,28 @@ TRACE_EVENT(page_pool_state_hold, __entry->pool, __entry->page, __entry->hold) ); +TRACE_EVENT(page_pool_update_nid, + + TP_PROTO(const struct page_pool *pool, int new_nid), + + TP_ARGS(pool, new_nid), + + TP_STRUCT__entry( + __field(const struct page_pool *, pool) + __field(int, pool_nid) + __field(int, new_nid) + ), + + TP_fast_assign( + __entry->pool = pool; + __entry->pool_nid = pool->p.nid; + __entry->new_nid = new_nid; + ), + + TP_printk("page_pool=%p pool_nid=%d new_nid=%d", + __entry->pool, __entry->pool_nid, __entry->new_nid) +); + #endif /* _TRACE_PAGE_POOL_H */ /* This part must be outside protection */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 335f68eaaa05..bc88c50831a0 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -427,3 +427,11 @@ void page_pool_destroy(struct page_pool *pool) schedule_delayed_work(&pool->release_dw, DEFER_TIME); } EXPORT_SYMBOL(page_pool_destroy); + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid) +{ + trace_page_pool_update_nid(pool, new_nid); + pool->p.nid = new_nid; +} +EXPORT_SYMBOL(page_pool_update_nid); -- Gitee From e45d7efaf6b1d9e8c1597c21c1d99584ca725148 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 20 Nov 2019 00:15:19 +0000 Subject: [PATCH 05/33] page_pool: Don't recycle non-reusable pages A page is NOT reusable when at least one of the following is true: 1) allocated when system was under some pressure. (page_is_pfmemalloc) 2) belongs to a different NUMA node than pool->p.nid. To update pool->p.nid users should call page_pool_update_nid(). Holding on to such pages in the pool will hurt the consumer performance when the pool migrates to a different numa node. 
Performance testing: XDP drop/tx rate and TCP single/multi stream, on mlx5 driver while migrating rx ring irq from close to far numa: mlx5 internal page cache was locally disabled to get pure page pool results. CPU: Intel(R) Xeon(R) CPU E5-2603 v4 @ 1.70GHz NIC: Mellanox Technologies MT27700 Family [ConnectX-4] (100G) XDP Drop/TX single core: NUMA | XDP | Before | After --------------------------------------- Close | Drop | 11 Mpps | 10.9 Mpps Far | Drop | 4.4 Mpps | 5.8 Mpps Close | TX | 6.5 Mpps | 6.5 Mpps Far | TX | 3.5 Mpps | 4 Mpps Improvement is about 30% drop packet rate, 15% tx packet rate for numa far test. No degradation for numa close tests. TCP single/multi cpu/stream: NUMA | #cpu | Before | After -------------------------------------- Close | 1 | 18 Gbps | 18 Gbps Far | 1 | 15 Gbps | 18 Gbps Close | 12 | 80 Gbps | 80 Gbps Far | 12 | 68 Gbps | 80 Gbps In all test cases we see improvement for the far numa case, and no impact on the close numa case. The impact of adding a check per page is very negligible, and shows no performance degradation whatsoever, also functionality wise it seems more correct and more robust for page pool to verify when pages should be recycled, since page pool can't guarantee where pages are coming from. Signed-off-by: Saeed Mahameed Acked-by: Jonathan Lemon Reviewed-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: hongrongxuan --- net/core/page_pool.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index bc88c50831a0..53681aedd134 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -281,6 +281,17 @@ static bool __page_pool_recycle_direct(struct page *page, return true; } +/* page is NOT reusable when: + * 1) allocated when system is under some pressure. (page_is_pfmemalloc) + * 2) belongs to a different NUMA node than pool->p.nid. + * + * To update pool->p.nid users must call page_pool_update_nid. + */ +static bool pool_page_reusable(struct page_pool *pool, struct page *page) +{ + return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; +} + void __page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct) { @@ -290,7 +301,8 @@ void __page_pool_put_page(struct page_pool *pool, * * refcnt == 1 means page_pool owns page, and can recycle it. */ - if (likely(page_ref_count(page) == 1)) { + if (likely(page_ref_count(page) == 1 && + pool_page_reusable(pool, page))) { /* Read barrier done in page_ref_count / READ_ONCE */ if (allow_direct && in_serving_softirq()) -- Gitee From f8d0cbdd99812887d8156d13a70c6dd001a06109 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 20 Nov 2019 16:54:18 +0200 Subject: [PATCH 06/33] net: page_pool: add the possibility to sync DMA memory for device Introduce the following parameters in order to add the possibility to sync DMA memory for device before putting allocated pages in the page_pool caches: - PP_FLAG_DMA_SYNC_DEV: if set in page_pool_params flags, all pages that the driver gets from page_pool will be DMA-synced-for-device according to the length provided by the device driver. Please note DMA-sync-for-CPU is still device driver responsibility - offset: DMA address offset where the DMA engine starts copying rx data - max_len: maximum DMA memory size page_pool is allowed to flush. This is currently used in __page_pool_alloc_pages_slow routine when pages are allocated from page allocator These parameters are supposed to be set by device drivers. 
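As an illustration, a hypothetical driver setup filling in the new fields; the headroom constant, ring size and device pointer are assumptions, only the flags and parameter fields come from this patch.

    #include <net/page_pool.h>

    /* Hypothetical: create a pool whose pages are DMA-mapped and
     * DMA-synced-for-device by page_pool itself. The DMA engine is
     * assumed to write packet data after MY_RX_HEADROOM bytes.
     */
    static struct page_pool *my_create_rx_pool(struct device *dev,
                                               unsigned int ring_size)
    {
            struct page_pool_params pp_params = {
                    .flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
                    .order     = 0,
                    .pool_size = ring_size,
                    .nid       = dev_to_node(dev),
                    .dev       = dev,
                    .dma_dir   = DMA_FROM_DEVICE,
                    /* DMA starts copying after the reserved headroom ... */
                    .offset    = MY_RX_HEADROOM,
                    /* ... and never past the largest expected frame. */
                    .max_len   = PAGE_SIZE - MY_RX_HEADROOM,
            };

            return page_pool_create(&pp_params);    /* ERR_PTR on failure */
    }
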
This optimization reduces the length of the DMA-sync-for-device. The optimization is valid because pages are initially DMA-synced-for-device as defined via max_len. At RX time, the driver will perform a DMA-sync-for-CPU on the memory for the packet length. What is important is the memory occupied by packet payload, because this is the area CPU is allowed to read and modify. As we don't track cache-lines written into by the CPU, simply use the packet payload length as dma_sync_size at page_pool recycle time. This also take into account any tail-extend. Tested-by: Matteo Croce Signed-off-by: Lorenzo Bianconi Signed-off-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller Signed-off-by: hongrongxuan --- include/net/page_pool.h | 24 ++++++++++++++++++------ net/core/page_pool.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 17e59e9e8064..02b018469bb9 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -34,8 +34,18 @@ #include #include -#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ -#define PP_FLAG_ALL PP_FLAG_DMA_MAP +#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA + * map/unmap + */ +#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets + * from page_pool will be + * DMA-synced-for-device according to + * the length provided by the device + * driver. + * Please note DMA-sync-for-CPU is still + * device driver responsibility + */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) /* * Fast allocation side cache array/stack @@ -65,6 +75,8 @@ struct page_pool_params { int nid; /* Numa node id to allocate from pages from */ struct device *dev; /* device, for DMA pre-mapping purposes */ enum dma_data_direction dma_dir; /* DMA mapping direction */ + unsigned int max_len; /* max DMA sync memory size */ + unsigned int offset; /* DMA addr offset */ }; struct page_pool { @@ -149,8 +161,8 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, #endif /* Never call this directly, use helpers below */ -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct); +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct); static inline void page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct) @@ -159,14 +171,14 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, allow_direct); + __page_pool_put_page(pool, page, -1, allow_direct); #endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { - __page_pool_put_page(pool, page, true); + __page_pool_put_page(pool, page, -1, true); } /* Disconnects a page (from a page_pool). 
API users can have a need diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 53681aedd134..96365f98fa95 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -47,6 +47,21 @@ static int page_pool_init(struct page_pool *pool, (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + /* In order to request DMA-sync-for-device the page + * needs to be mapped + */ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + + if (!pool->p.max_len) + return -EINVAL; + + /* pool->p.offset has to be set according to the address + * offset used by the DMA engine to start copying rx data + */ + } + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -115,6 +130,16 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) return page; } +static void page_pool_dma_sync_for_device(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size) +{ + dma_sync_size = min(dma_sync_size, pool->p.max_len); + dma_sync_single_range_for_device(pool->p.dev, page->dma_addr, + pool->p.offset, dma_sync_size, + pool->p.dma_dir); +} + /* slow path */ noinline static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, @@ -159,6 +184,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, } page_pool_set_dma_addr(page, dma); + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + skip_dma_map: /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -292,8 +320,8 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; } -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct) +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -305,6 +333,10 @@ void __page_pool_put_page(struct page_pool *pool, pool_page_reusable(pool, page))) { /* Read barrier done in page_ref_count / READ_ONCE */ + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, + dma_sync_size); + if (allow_direct && in_serving_softirq()) if (__page_pool_recycle_direct(page, pool)) return; -- Gitee From 7e766fc66570603fbeca0c7ae9eae890e88f6cb2 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 27 Dec 2019 18:13:18 +0100 Subject: [PATCH 07/33] page_pool: handle page recycle for NUMA_NO_NODE condition The check in pool_page_reusable (page_to_nid(page) == pool->p.nid) is not valid if page_pool was configured with pool->p.nid = NUMA_NO_NODE. The goal of the NUMA changes in commit d5394610b1ba ("page_pool: Don't recycle non-reusable pages"), were to have RX-pages that belongs to the same NUMA node as the CPU processing RX-packet during softirq/NAPI. As illustrated by the performance measurements. This patch moves the NAPI checks out of fast-path, and at the same time solves the NUMA_NO_NODE issue. First realize that alloc_pages_node() with pool->p.nid = NUMA_NO_NODE will lookup current CPU nid (Numa ID) via numa_mem_id(), which is used as the the preferred nid. It is only in rare situations, where e.g. NUMA zone runs dry, that page gets doesn't get allocated from preferred nid. The page_pool API allows drivers to control the nid themselves via controlling pool->p.nid. 
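In other words, the preferred node is resolved roughly as in the hypothetical sketch below (a behavioural summary of the existing allocation path, not code added by this patch):

    /* Sketch: how the pool's nid setting translates into the node
     * actually preferred by the page allocator.
     */
    static struct page *pool_alloc_sketch(struct page_pool *pool, gfp_t gfp)
    {
            int nid = pool->p.nid;

            if (nid == NUMA_NO_NODE)
                    nid = numa_mem_id();    /* node of the CPU running NAPI */

            return alloc_pages_node(nid, gfp, pool->p.order);
    }
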
This patch moves the NAPI check to when alloc cache is refilled, via dequeuing/consuming pages from the ptr_ring. Thus, we can allow placing pages from remote NUMA into the ptr_ring, as the dequeue/consume step will check the NUMA node. All current drivers using page_pool will alloc/refill RX-ring from same CPU running softirq/NAPI process. Drivers that control the nid explicitly, also use page_pool_update_nid when changing nid runtime. To speed up transision to new nid the alloc cache is now flushed on nid changes. This force pages to come from ptr_ring, which does the appropate nid check. For the NUMA_NO_NODE case, when a NIC IRQ is moved to another NUMA node, we accept that transitioning the alloc cache doesn't happen immediately. The preferred nid change runtime via consulting numa_mem_id() based on the CPU processing RX-packets. Notice, to avoid stressing the page buddy allocator and avoid doing too much work under softirq with preempt disabled, the NUMA check at ptr_ring dequeue will break the refill cycle, when detecting a NUMA mismatch. This will cause a slower transition, but its done on purpose. Fixes: d5394610b1ba ("page_pool: Don't recycle non-reusable pages") Reported-by: Li RongQing Reported-by: Yunsheng Lin Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: hongrongxuan --- net/core/page_pool.c | 80 +++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 19 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 96365f98fa95..984d4df84203 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -96,10 +96,60 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); +static void __page_pool_return_page(struct page_pool *pool, struct page *page); + +noinline +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, + bool refill) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + int pref_nid; /* preferred NUMA node */ + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Softirq guarantee CPU and thus NUMA node is stable. This, + * assumes CPU refilling driver RX-ring will also run RX-NAPI. + */ + pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; + + /* Slower-path: Get pages from locked ring queue */ + spin_lock(&r->consumer_lock); + + /* Refill alloc array, but only if NUMA match */ + do { + page = __ptr_ring_consume(r); + if (unlikely(!page)) + break; + + if (likely(page_to_nid(page) == pref_nid)) { + pool->alloc.cache[pool->alloc.count++] = page; + } else { + /* NUMA mismatch; + * (1) release 1 page to page-allocator and + * (2) break out to fallthrough to alloc_pages_node. + * This limit stress on page buddy alloactor. 
+ */ + __page_pool_return_page(pool, page); + page = NULL; + break; + } + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL && + refill); + + /* Return last page */ + if (likely(pool->alloc.count > 0)) + page = pool->alloc.cache[--pool->alloc.count]; + + spin_unlock(&r->consumer_lock); + return page; +} + /* fast path */ static struct page *__page_pool_get_cached(struct page_pool *pool) { - struct ptr_ring *r = &pool->ring; bool refill = false; struct page *page; @@ -113,20 +163,7 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) refill = true; } - /* Quicker fallback, avoid locks when ring is empty */ - if (__ptr_ring_empty(r)) - return NULL; - - /* Slow-path: Get page from locked ring queue, - * refill alloc array if requested. - */ - spin_lock(&r->consumer_lock); - page = __ptr_ring_consume(r); - if (refill) - pool->alloc.count = __ptr_ring_consume_batched(r, - pool->alloc.cache, - PP_ALLOC_CACHE_REFILL); - spin_unlock(&r->consumer_lock); + page = page_pool_refill_alloc_cache(pool, refill); return page; } @@ -311,13 +348,10 @@ static bool __page_pool_recycle_direct(struct page *page, /* page is NOT reusable when: * 1) allocated when system is under some pressure. (page_is_pfmemalloc) - * 2) belongs to a different NUMA node than pool->p.nid. - * - * To update pool->p.nid users must call page_pool_update_nid. */ static bool pool_page_reusable(struct page_pool *pool, struct page *page) { - return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; + return !page_is_pfmemalloc(page); } void __page_pool_put_page(struct page_pool *pool, struct page *page, @@ -475,7 +509,15 @@ EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { + struct page *page; + trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; + + /* Flush pool alloc cache, as refill will check NUMA node */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_update_nid); -- Gitee From c0bb388b726a959491072938e33c507d08c175a0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 27 Dec 2019 18:13:24 +0100 Subject: [PATCH 08/33] page_pool: help compiler remove code in case CONFIG_NUMA=n When kernel is compiled without NUMA support, then page_pool NUMA config setting (pool->p.nid) doesn't make any practical sense. The compiler cannot see that it can remove the code paths. This patch avoids reading pool->p.nid setting in case of !CONFIG_NUMA, in allocation and numa check code, which helps compiler to see the optimisation potential. It leaves update code intact to keep API the same. $ ./scripts/bloat-o-meter net/core/page_pool.o-numa-enabled \ net/core/page_pool.o-numa-disabled add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-113 (-113) Function old new delta page_pool_create 401 398 -3 __page_pool_alloc_pages_slow 439 426 -13 page_pool_refill_alloc_cache 425 328 -97 Total: Before=3611, After=3498, chg -3.13% Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: hongrongxuan --- net/core/page_pool.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 984d4df84203..c8daf4f44842 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -113,7 +113,12 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, /* Softirq guarantee CPU and thus NUMA node is stable. 
This, * assumes CPU refilling driver RX-ring will also run RX-NAPI. */ +#ifdef CONFIG_NUMA pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; +#else + /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ + pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ +#endif /* Slower-path: Get pages from locked ring queue */ spin_lock(&r->consumer_lock); @@ -200,7 +205,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, */ /* Cache was empty, do real allocation */ +#ifdef CONFIG_NUMA page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); +#else + page = alloc_pages(gfp, pool->p.order); +#endif if (!page) return NULL; -- Gitee From bc94828532550b1627e790173f8c334965f6d740 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 11 Feb 2020 10:13:44 +0800 Subject: [PATCH 09/33] page_pool: refill page when alloc.count of pool is zero "do {} while" in page_pool_refill_alloc_cache will always refill page once whether refill is true or false, and whether alloc.count of pool is less than PP_ALLOC_CACHE_REFILL or not this is wrong, and will cause overflow of pool->alloc.cache the caller of __page_pool_get_cached should provide guarantee that pool->alloc.cache is safe to access, so in_serving_softirq should be removed as suggested by Jesper Dangaard Brouer in https://patchwork.ozlabs.org/patch/1233713/ so fix this issue by calling page_pool_refill_alloc_cache() only when pool->alloc.count is zero Fixes: 44768decb7c0 ("page_pool: handle page recycle for NUMA_NO_NODE condition") Signed-off-by: Li RongQing Suggested-by: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller Signed-off-by: hongrongxuan --- net/core/page_pool.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index c8daf4f44842..1893335166d4 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -99,8 +99,7 @@ EXPORT_SYMBOL(page_pool_create); static void __page_pool_return_page(struct page_pool *pool, struct page *page); noinline -static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, - bool refill) +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; struct page *page; @@ -141,8 +140,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, page = NULL; break; } - } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL && - refill); + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) @@ -155,20 +153,16 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, /* fast path */ static struct page *__page_pool_get_cached(struct page_pool *pool) { - bool refill = false; struct page *page; - /* Test for safe-context, caller should provide this guarantee */ - if (likely(in_serving_softirq())) { - if (likely(pool->alloc.count)) { - /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; - return page; - } - refill = true; + /* Caller MUST guarantee safe non-concurrent access, e.g. 
softirq */ + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + } else { + page = page_pool_refill_alloc_cache(pool); } - page = page_pool_refill_alloc_cache(pool, refill); return page; } -- Gitee From 22af09c1fa9a931ab718943c521fe9d9efeb9060 Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Thu, 20 Feb 2020 09:41:55 +0200 Subject: [PATCH 10/33] net: page_pool: API cleanup and comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Functions starting with __ usually indicate those which are exported, but should not be called directly. Update some of those declared in the API and make it more readable. page_pool_unmap_page() and page_pool_release_page() were doing exactly the same thing calling __page_pool_clean_page(). Let's rename __page_pool_clean_page() to page_pool_release_page() and export it in order to show up on perf logs and get rid of page_pool_unmap_page(). Finally rename __page_pool_put_page() to page_pool_put_page() since we can now directly call it from drivers and rename the existing page_pool_put_page() to page_pool_put_full_page() since they do the same thing but the latter is trying to sync the full DMA area. This patch also updates netsec, mvneta and stmmac drivers which use those functions. Suggested-by: Jonathan Lemon Acked-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Signed-off-by: Ilias Apalodimas Signed-off-by: David S. Miller Signed-off-by: hongrongxuan Conflicts: drivers/net/ethernet/marvell/mvneta.c drivers/net/ethernet/socionext/netsec.c net/core/page_pool.c Abort the change in mvneta.c and netsec.c to fix the conflicts. --- drivers/net/ethernet/socionext/netsec.c | 2 +- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +- include/net/page_pool.h | 36 ++++------ net/core/page_pool.c | 70 ++++++++++--------- net/core/xdp.c | 2 +- 5 files changed, 55 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 5a672658da80..5e6ed631e9fc 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1191,7 +1191,7 @@ static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id) if (id == NETSEC_RING_RX) { struct page *page = virt_to_page(desc->addr); - page_pool_put_page(dring->page_pool, page, false); + page_pool_put_full_page(dring->page_pool, page, false); } else if (id == NETSEC_RING_TX) { dma_unmap_single(priv->dev, desc->dma_addr, desc->len, DMA_TO_DEVICE); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6a3b0f76d972..f93b9a12dfd5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1224,11 +1224,11 @@ static void stmmac_free_rx_buffer(struct stmmac_priv *priv, u32 queue, int i) struct stmmac_rx_buffer *buf = &rx_q->buf_pool[i]; if (buf->page) - page_pool_put_page(rx_q->page_pool, buf->page, false); + page_pool_put_full_page(rx_q->page_pool, buf->page, false); buf->page = NULL; if (buf->sec_page) - page_pool_put_page(rx_q->page_pool, buf->sec_page, false); + page_pool_put_full_page(rx_q->page_pool, buf->sec_page, false); buf->sec_page = NULL; } diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 02b018469bb9..cae58b248445 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -149,6 +149,7 @@ struct page_pool *page_pool_create(const struct page_pool_params 
*params); #ifdef CONFIG_PAGE_POOL void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); +void page_pool_release_page(struct page_pool *pool, struct page *page); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -158,41 +159,32 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) { } +static inline void page_pool_release_page(struct page_pool *pool, + struct page *page) +{ +} #endif -/* Never call this directly, use helpers below */ -void __page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct); +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct); -static inline void page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct) +/* Same as above but will try to sync the entire area pool->max_len */ +static inline void page_pool_put_full_page(struct page_pool *pool, + struct page *page, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, -1, allow_direct); + page_pool_put_page(pool, page, -1, allow_direct); #endif } -/* Very limited use-cases allow recycle direct */ + +/* Same as above but the caller must guarantee safe context. e.g NAPI */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { - __page_pool_put_page(pool, page, -1, true); -} - -/* Disconnects a page (from a page_pool). API users can have a need - * to disconnect a page (from a page_pool), to allow it to be used as - * a regular page (that will eventually be returned to the normal - * page-allocator via put_page). - */ -void page_pool_unmap_page(struct page_pool *pool, struct page *page); -static inline void page_pool_release_page(struct page_pool *pool, - struct page *page) -{ -#ifdef CONFIG_PAGE_POOL - page_pool_unmap_page(pool, page); -#endif + page_pool_put_full_page(pool, page, true); } static inline dma_addr_t page_pool_get_dma_addr(struct page *page) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1893335166d4..6e8cee85429b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -96,7 +96,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); -static void __page_pool_return_page(struct page_pool *pool, struct page *page); +static void page_pool_return_page(struct page_pool *pool, struct page *page); noinline static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) @@ -136,7 +136,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); page = NULL; break; } @@ -274,18 +274,25 @@ static s32 page_pool_inflight(struct page_pool *pool) return inflight; } -/* Cleanup page_pool state from page */ -static void __page_pool_clean_page(struct page_pool *pool, - struct page *page) +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). 
+ */ +void page_pool_release_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + /* Always account for inflight pages, even if we didn't + * map them + */ goto skip_dma_unmap; dma = page_pool_get_dma_addr(page); - /* DMA unmap */ + + /* When page is unmapped, it cannot be returned our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); @@ -297,21 +304,12 @@ static void __page_pool_clean_page(struct page_pool *pool, count = atomic_inc_return(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); } - -/* unmap the page and clean our state */ -void page_pool_unmap_page(struct page_pool *pool, struct page *page) -{ - /* When page is unmapped, this implies page will not be - * returned to page_pool. - */ - __page_pool_clean_page(pool, page); -} -EXPORT_SYMBOL(page_pool_unmap_page); +EXPORT_SYMBOL(page_pool_release_page); /* Return a page to the page allocator, cleaning up our state */ -static void __page_pool_return_page(struct page_pool *pool, struct page *page) +static void page_pool_return_page(struct page_pool *pool, struct page *page) { - __page_pool_clean_page(pool, page); + page_pool_release_page(pool, page); put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) @@ -320,8 +318,7 @@ static void __page_pool_return_page(struct page_pool *pool, struct page *page) */ } -static bool __page_pool_recycle_into_ring(struct page_pool *pool, - struct page *page) +static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) { int ret; /* BH protection not needed if current is serving softirq */ @@ -338,7 +335,7 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool, * * Caller must provide appropriate safe context. */ -static bool __page_pool_recycle_direct(struct page *page, +static bool page_pool_recycle_in_cache(struct page *page, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) @@ -357,8 +354,14 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) return !page_is_pfmemalloc(page); } -void __page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +/* If the page refcnt == 1, this will try to recycle the page. + * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * the configured size min(dma_sync_size, pool->max_len). + * If the page refcnt != 1, then the page will be returned to memory + * subsystem. + */ +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -375,12 +378,12 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page, dma_sync_size); if (allow_direct && in_serving_softirq()) - if (__page_pool_recycle_direct(page, pool)) + if (page_pool_recycle_in_cache(page, pool)) return; - if (!__page_pool_recycle_into_ring(pool, page)) { + if (!page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } return; } @@ -397,12 +400,13 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page, * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. 
*/ - __page_pool_clean_page(pool, page); + /* Do not replace this with page_pool_return_page() */ + page_pool_release_page(pool, page); put_page(page); } -EXPORT_SYMBOL(__page_pool_put_page); +EXPORT_SYMBOL(page_pool_put_page); -static void __page_pool_empty_ring(struct page_pool *pool) +static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; @@ -413,7 +417,7 @@ static void __page_pool_empty_ring(struct page_pool *pool) pr_crit("%s() page_pool refcnt %d violation\n", __func__, page_ref_count(page)); - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } } @@ -440,13 +444,13 @@ static void page_pool_scrub(struct page_pool *pool) */ while (pool->alloc.count) { page = pool->alloc.cache[--pool->alloc.count]; - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } /* No more consumers should exist, but producers could still * be in-flight. */ - __page_pool_empty_ring(pool); + page_pool_empty_ring(pool); } static int page_pool_release(struct page_pool *pool) @@ -520,7 +524,7 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { page = pool->alloc.cache[--pool->alloc.count]; - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } } EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/xdp.c b/net/core/xdp.c index b3f463c6543f..0e6490d8f740 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -377,7 +377,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); napi_direct &= !xdp_return_frame_no_direct(); - page_pool_put_page(xa->page_pool, page, napi_direct); + page_pool_put_full_page(xa->page_pool, page, napi_direct); rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: -- Gitee From 16189bc79408313b852c8701cbe69c70be58ca21 Mon Sep 17 00:00:00 2001 From: Mateusz Nosek Date: Tue, 13 Oct 2020 16:55:51 -0700 Subject: [PATCH 11/33] mm/page_alloc.c: clean code by merging two functions finalise_ac() is just 'epilogue' for 'prepare_alloc_pages'. Therefore there is no need to keep them both so 'finalise_ac' content can be merged into prepare_alloc_pages() code. It would make __alloc_pages_nodemask() cleaner when it comes to readability. 
Signed-off-by: Mateusz Nosek Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Mel Gorman Cc: Mike Rapoport Link: https://lkml.kernel.org/r/20200916110118.6537-1-mateusznosek0@gmail.com Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan Conflicts: mm/page_alloc.c --- mm/page_alloc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf0f620d0e63..05fa976fd7fc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4899,12 +4899,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) *alloc_flags |= ALLOC_CMA; - return true; -} - -/* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) -{ /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4915,6 +4909,8 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); + + return true; } extern unsigned int vm_memcg_latency_histogram; @@ -4968,8 +4964,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, rcu_read_unlock(); #endif - finalise_ac(gfp_mask, &ac); - /* * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. -- Gitee From 509b42d6d0ad55be5aff3d88485c363d68ab35b3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Apr 2021 23:01:42 -0700 Subject: [PATCH 12/33] mm/page_alloc: rename alloced to allocated Patch series "Introduce a bulk order-0 page allocator with two in-tree users", v6. This series introduces a bulk order-0 page allocator with sunrpc and the network page pool being the first users. The implementation is not efficient as semantics needed to be ironed out first. If no other semantic changes are needed, it can be made more efficient. Despite that, this is a performance-related for users that require multiple pages for an operation without multiple round-trips to the page allocator. Quoting the last patch for the high-speed networking use-case Kernel XDP stats CPU pps Delta Baseline XDP-RX CPU total 3,771,046 n/a List XDP-RX CPU total 3,940,242 +4.49% Array XDP-RX CPU total 4,249,224 +12.68% Via the SUNRPC traces of svc_alloc_arg() Single page: 25.007 us per call over 532,571 calls Bulk list: 6.258 us per call over 517,034 calls Bulk array: 4.590 us per call over 517,442 calls Both potential users in this series are corner cases (NFS and high-speed networks) so it is unlikely that most users will see any benefit in the short term. Other potential other users are batch allocations for page cache readahead, fault around and SLUB allocations when high-order pages are unavailable. It's unknown how much benefit would be seen by converting multiple page allocation calls to a single batch or what difference it may make to headline performance. Light testing of my own running dbench over NFS passed. Chuck and Jesper conducted their own tests and details are included in the changelogs. 
Patch 1 renames a variable name that is particularly unpopular Patch 2 adds a bulk page allocator Patch 3 adds an array-based version of the bulk allocator Patches 4-5 adds micro-optimisations to the implementation Patches 6-7 SUNRPC user Patches 8-9 Network page_pool user This patch (of 9): Review feedback of the bulk allocator twice found problems with "alloced" being a counter for pages allocated. The naming was based on the API name "alloc" and was based on the idea that verbal communication about malloc tends to use the fake word "malloced" instead of the fake word mallocated. To be consistent, this preparation patch renames alloced to allocated in rmqueue_bulk so the bulk allocator and per-cpu allocator use similar names when the bulk allocator is introduced. Link: https://lkml.kernel.org/r/20210325114228.27719-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210325114228.27719-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Alexander Lobakin Acked-by: Vlastimil Babka Cc: Chuck Lever Cc: Jesper Dangaard Brouer Cc: Christoph Hellwig Cc: Alexander Duyck Cc: Ilias Apalodimas Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05fa976fd7fc..6ebd6331ed13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2759,7 +2759,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { - int i, alloced = 0; + int i, allocated = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2782,7 +2782,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->lru, list); - alloced++; + allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -2791,12 +2791,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * i pages were removed from the buddy list even if some leak due * to check_pcp_refill failing so adjust NR_FREE_PAGES based - * on i. Do not confuse with 'alloced' which is the number of + * on i. Do not confuse with 'allocated' which is the number of * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return alloced; + return allocated; } #ifdef CONFIG_NUMA -- Gitee From fd99065cd13cc86c87f4c11b5549947fc31fcce6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Apr 2021 23:01:45 -0700 Subject: [PATCH 13/33] mm/page_alloc: add a bulk page allocator This patch adds a new page allocator interface via alloc_pages_bulk, and __alloc_pages_bulk_nodemask. A caller requests a number of pages to be allocated and added to a list. The API is not guaranteed to return the requested number of pages and may fail if the preferred allocation zone has limited free memory, the cpuset changes during the allocation or page debugging decides to fail an allocation. It's up to the caller to request more pages in batch if necessary. Note that this implementation is not very efficient and could be improved but it would require refactoring. The intent is to make it available early to determine what semantics are required by different callers. Once the full semantics are nailed down, it can be refactored. 
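A hypothetical caller, showing the contract that fewer pages than requested may be returned and that the caller is expected to make up the difference itself; my_fill_page_list() and its error handling are illustrative only. (A later patch in this series renames the list helper to alloc_pages_bulk_list().)

    #include <linux/gfp.h>
    #include <linux/list.h>

    /* Hypothetical: gather 'want' order-0 pages onto 'list', topping up
     * one page at a time if the bulk call comes back short.
     */
    static int my_fill_page_list(struct list_head *list, unsigned long want)
    {
            unsigned long got;

            INIT_LIST_HEAD(list);

            /* May return anywhere between 0 and 'want' pages. */
            got = alloc_pages_bulk(GFP_KERNEL, want, list);

            while (got < want) {
                    struct page *page = alloc_page(GFP_KERNEL);

                    if (!page)
                            return -ENOMEM; /* caller frees what is on 'list' */
                    list_add(&page->lru, list);
                    got++;
            }
            return 0;
    }
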
[mgorman@techsingularity.net: fix alloc_pages_bulk() return type, per Matthew] Link: https://lkml.kernel.org/r/20210325123713.GQ3697@techsingularity.net [mgorman@techsingularity.net: fix uninit var warning] Link: https://lkml.kernel.org/r/20210330114847.GX3697@techsingularity.net [mgorman@techsingularity.net: fix comment, per Vlastimil] Link: https://lkml.kernel.org/r/20210412110255.GV3697@techsingularity.net Link: https://lkml.kernel.org/r/20210325114228.27719-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Reviewed-by: Alexander Lobakin Tested-by: Colin Ian King Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- include/linux/gfp.h | 11 +++++ mm/page_alloc.c | 118 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 61f2f6ff9467..aff225a63582 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -496,6 +496,17 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); } +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *list); + +/* Bulk allocate order-0 pages */ +static inline unsigned long +alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list); +} + /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6ebd6331ed13..2eeff326521e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4915,6 +4915,124 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, extern unsigned int vm_memcg_latency_histogram; +/* + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list + * @gfp: GFP flags for the allocation + * @preferred_nid: The preferred NUMA node ID to allocate from + * @nodemask: Set of nodes to allocate from, may be NULL + * @nr_pages: The number of pages desired on the list + * @page_list: List to store the allocated pages + * + * This is a batched version of the page allocator that attempts to + * allocate nr_pages quickly and add them to a list. + * + * Returns the number of pages on the list. + */ +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list) +{ + struct page *page; + unsigned long flags; + struct zone *zone; + struct zoneref *z; + struct per_cpu_pages *pcp; + struct list_head *pcp_list; + struct alloc_context ac; + gfp_t alloc_gfp; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + int allocated = 0; + + if (WARN_ON_ONCE(nr_pages <= 0)) + return 0; + + /* Use the single page allocator for one page. */ + if (nr_pages == 1) + goto failed; + + /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ + gfp &= gfp_allowed_mask; + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + return 0; + gfp = alloc_gfp; + + /* Find an allowed local zone that meets the low watermark. 
*/ + for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.high_zoneidx, ac.nodemask) { + unsigned long mark; + + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp)) { + continue; + } + + if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && + zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + goto failed; + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + if (zone_watermark_fast(zone, 0, mark, + zonelist_zone_idx(ac.preferred_zoneref), + alloc_flags, gfp)) { + break; + } + } + + /* + * If there are no allowed local zones that meets the watermarks then + * try to allocate a single page and reclaim if necessary. + */ + if (!zone) + goto failed; + + /* Attempt the batch allocation */ + local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp_list = &pcp->lists[ac.migratetype]; + + while (allocated < nr_pages) { + page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + pcp, pcp_list); + if (!page) { + /* Try and get at least one page */ + if (!allocated) + goto failed_irq; + break; + } + + /* + * Ideally this would be batched but the best way to do + * that cheaply is to first convert zone_statistics to + * be inaccurate per-cpu counter like vm_events to avoid + * a RMW cycle then do the accounting with IRQs enabled. + */ + __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); + zone_statistics(ac.preferred_zoneref->zone, zone); + + prep_new_page(page, 0, gfp, 0); + list_add(&page->lru, page_list); + allocated++; + } + + local_irq_restore(flags); + + return allocated; + +failed_irq: + local_irq_restore(flags); + +failed: + page = __alloc_pages_nodemask(gfp, 0, preferred_nid, nodemask); + if (page) { + list_add(&page->lru, page_list); + allocated = 1; + } + + return allocated; +} +EXPORT_SYMBOL_GPL(__alloc_pages_bulk); + /* * This is the 'heart' of the zoned buddy allocator. */ -- Gitee From e7cbe6be9a6dbd853b7a666f219f961050f03f46 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 29 Apr 2021 23:01:48 -0700 Subject: [PATCH 14/33] mm/page_alloc: add an array-based interface to the bulk page allocator The proposed callers for the bulk allocator store pages from the bulk allocator in an array. This patch adds an array-based interface to the API to avoid multiple list iterations. The page list interface is preserved to avoid requiring all users of the bulk API to allocate and manage enough storage to store the pages. 
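A hypothetical array-based caller, loosely modelled on the sunrpc use case later in the series: only NULL slots are (re)filled, so the same array can be topped up repeatedly; my_refill_page_array() is illustrative only.

    #include <linux/gfp.h>

    /* Hypothetical: keep a fixed array of order-0 pages fully populated. */
    static int my_refill_page_array(struct page **pages, unsigned long nr)
    {
            unsigned long filled;

            /* NULL elements are populated; slots that already hold a page
             * are skipped but still counted in the return value.
             */
            filled = alloc_pages_bulk_array(GFP_KERNEL, nr, pages);

            return filled == nr ? 0 : -EAGAIN;      /* caller may retry later */
    }
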
[akpm@linux-foundation.org: remove now unused local `allocated'] Link: https://lkml.kernel.org/r/20210325114228.27719-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Acked-by: Vlastimil Babka Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- include/linux/gfp.h | 13 +++++++--- mm/page_alloc.c | 60 +++++++++++++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index aff225a63582..467545cae440 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -498,13 +498,20 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *list); + struct list_head *page_list, + struct page **page_array); /* Bulk allocate order-0 pages */ static inline unsigned long -alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) { - return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list); + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); +} + +static inline unsigned long +alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2eeff326521e..066d9eaf145d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4916,21 +4916,29 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, extern unsigned int vm_memcg_latency_histogram; /* - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array * @gfp: GFP flags for the allocation * @preferred_nid: The preferred NUMA node ID to allocate from * @nodemask: Set of nodes to allocate from, may be NULL - * @nr_pages: The number of pages desired on the list - * @page_list: List to store the allocated pages + * @nr_pages: The number of pages desired on the list or array + * @page_list: Optional list to store the allocated pages + * @page_array: Optional array to store the pages * * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly and add them to a list. + * allocate nr_pages quickly. Pages are added to page_list if page_list + * is not NULL, otherwise it is assumed that the page_array is valid. * - * Returns the number of pages on the list. + * For lists, nr_pages is the number of pages that should be allocated. + * + * For arrays, only NULL elements are populated with pages and nr_pages + * is the maximum number of pages that will be stored in the array. + * + * Returns the number of pages on the list or array. 
*/ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list) + struct list_head *page_list, + struct page **page_array) { struct page *page; unsigned long flags; @@ -4941,13 +4949,20 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct alloc_context ac; gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; - int allocated = 0; + int nr_populated = 0; if (WARN_ON_ONCE(nr_pages <= 0)) return 0; + /* + * Skip populated array elements to determine if any pages need + * to be allocated before disabling IRQs. + */ + while (page_array && page_array[nr_populated] && nr_populated < nr_pages) + nr_populated++; + /* Use the single page allocator for one page. */ - if (nr_pages == 1) + if (nr_pages - nr_populated == 1) goto failed; /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ @@ -4991,12 +5006,19 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, pcp = &this_cpu_ptr(zone->pageset)->pcp; pcp_list = &pcp->lists[ac.migratetype]; - while (allocated < nr_pages) { + while (nr_populated < nr_pages) { + + /* Skip existing pages */ + if (page_array && page_array[nr_populated]) { + nr_populated++; + continue; + } + page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, pcp, pcp_list); if (!page) { /* Try and get at least one page */ - if (!allocated) + if (!nr_populated) goto failed_irq; break; } @@ -5011,13 +5033,16 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, zone_statistics(ac.preferred_zoneref->zone, zone); prep_new_page(page, 0, gfp, 0); - list_add(&page->lru, page_list); - allocated++; + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; } local_irq_restore(flags); - return allocated; + return nr_populated; failed_irq: local_irq_restore(flags); @@ -5025,11 +5050,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, failed: page = __alloc_pages_nodemask(gfp, 0, preferred_nid, nodemask); if (page) { - list_add(&page->lru, page_list); - allocated = 1; + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; } - return allocated; + return nr_populated; } EXPORT_SYMBOL_GPL(__alloc_pages_bulk); -- Gitee From 64cb102c5d10600f089bf82f51a2ad8ca1479e34 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 24 Jun 2021 18:40:04 -0700 Subject: [PATCH 15/33] mm/page_alloc: __alloc_pages_bulk(): do bounds check before accessing array In the event that somebody would call this with an already fully populated page_array, the last loop iteration would do an access beyond the end of page_array. It's of course extremely unlikely that would ever be done, but this triggers my internal static analyzer. Also, if it really is not supposed to be invoked this way (i.e., with no NULL entries in page_array), the nr_populated Acked-by: Mel Gorman Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 066d9eaf145d..f5e7051b23e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4958,7 +4958,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, * Skip populated array elements to determine if any pages need * to be allocated before disabling IRQs. 
*/ - while (page_array && page_array[nr_populated] && nr_populated < nr_pages) + while (page_array && nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; /* Use the single page allocator for one page. */ -- Gitee From e9898011879a6bf84d7f43f9889c4c96bef44699 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 24 Jun 2021 18:40:07 -0700 Subject: [PATCH 16/33] mm/page_alloc: do bulk array bounds check after checking populated elements Dan Carpenter reported the following The patch 0f87d9d30f21: "mm/page_alloc: add an array-based interface to the bulk page allocator" from Apr 29, 2021, leads to the following static checker warning: mm/page_alloc.c:5338 __alloc_pages_bulk() warn: potentially one past the end of array 'page_array[nr_populated]' The problem can occur if an array is passed in that is fully populated. That potentially ends up allocating a single page and storing it past the end of the array. This patch returns 0 if the array is fully populated. Link: https://lkml.kernel.org/r/20210618125102.GU30378@techsingularity.net Fixes: 0f87d9d30f21 ("mm/page_alloc: add an array-based interface to the bulk page allocator") Signed-off-by: Mel Gorman Reported-by: Dan Carpenter Cc: Jesper Dangaard Brouer Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5e7051b23e4..bf3663fafb34 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4961,6 +4961,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, while (page_array && nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; + /* Already populated array? */ + if (unlikely(page_array && nr_pages - nr_populated == 0)) + return 0; + /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) goto failed; -- Gitee From 2c18a3d7a8d7180120009d7112b18f94454d4118 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:33:29 -0700 Subject: [PATCH 17/33] mm/page_alloc: correct return value of populated elements if bulk array is populated Dave Jones reported the following This made it into 5.13 final, and completely breaks NFSD for me (Serving tcp v3 mounts). Existing mounts on clients hang, as do new mounts from new clients. Rebooting the server back to rc7 everything recovers. The commit b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") returns the wrong value if the array is already populated which is interpreted as an allocation failure. Dave reported this fixes his problem and it also passed a test running dbench over NFS. Link: https://lkml.kernel.org/r/20210628150219.GC3840@techsingularity.net Fixes: b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") Signed-off-by: Mel Gorman Reported-by: Dave Jones Tested-by: Dave Jones Cc: Dan Carpenter Cc: Jesper Dangaard Brouer Cc: Vlastimil Babka Cc: [5.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bf3663fafb34..a15f36464f31 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4963,7 +4963,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Already populated array? 
*/ if (unlikely(page_array && nr_pages - nr_populated == 0)) - return 0; + return nr_populated; /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) -- Gitee From 85b8158058d6b6c5ce2a8e126defc3e4d322cc84 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:01:51 -0700 Subject: [PATCH 18/33] mm/page_alloc: optimize code layout for __alloc_pages_bulk Looking at perf-report and ASM-code for __alloc_pages_bulk() it is clear that the code activated is suboptimal. The compiler guesses wrong and places unlikely code at the beginning. Due to the use of WARN_ON_ONCE() macro the UD2 asm instruction is added to the code, which confuse the I-cache prefetcher in the CPU. [mgorman@techsingularity.net: minor changes and rebasing] Link: https://lkml.kernel.org/r/20210325114228.27719-5-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Acked-By: Vlastimil Babka Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a15f36464f31..4b5fac1fca3e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4951,7 +4951,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, unsigned int alloc_flags = ALLOC_WMARK_LOW; int nr_populated = 0; - if (WARN_ON_ONCE(nr_pages <= 0)) + if (unlikely(nr_pages <= 0)) return 0; /* @@ -5002,7 +5002,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, * If there are no allowed local zones that meets the watermarks then * try to allocate a single page and reclaim if necessary. */ - if (!zone) + if (unlikely(!zone)) goto failed; /* Attempt the batch allocation */ @@ -5020,7 +5020,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, pcp, pcp_list); - if (!page) { + if (unlikely(!page)) { /* Try and get at least one page */ if (!nr_populated) goto failed_irq; -- Gitee From e0ce48e1aabca70f93920c5670e234705e117254 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:01:55 -0700 Subject: [PATCH 19/33] mm/page_alloc: inline __rmqueue_pcplist When __alloc_pages_bulk() got introduced two callers of __rmqueue_pcplist exist and the compiler chooses to not inline this function. 
./scripts/bloat-o-meter vmlinux-before vmlinux-inline__rmqueue_pcplist add/remove: 0/1 grow/shrink: 2/0 up/down: 164/-125 (39) Function old new delta rmqueue 2197 2296 +99 __alloc_pages_bulk 1921 1986 +65 __rmqueue_pcplist 125 - -125 Total: Before=19374127, After=19374166, chg +0.00% modprobe page_bench04_bulk loops=$((10**7)) Type:time_bulk_page_alloc_free_array - Per elem: 106 cycles(tsc) 29.595 ns (step:64) - (measurement period time:0.295955434 sec time_interval:295955434) - (invoke count:10000000 tsc_interval:1065447105) Before: - Per elem: 110 cycles(tsc) 30.633 ns (step:64) Link: https://lkml.kernel.org/r/20210325114228.27719-6-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Acked-by: Vlastimil Babka Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4b5fac1fca3e..5bb102f17a5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3245,7 +3245,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) } /* Remove page from the per-cpu list, caller must protect the list */ -static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +static inline +struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) -- Gitee From 3d15be78e4a89722c40bddc4f5bed79938cdb586 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:02:04 -0700 Subject: [PATCH 20/33] net: page_pool: refactor dma_map into own function page_pool_dma_map In preparation for next patch, move the dma mapping into its own function, as this will make it easier to follow the changes. [ilias.apalodimas: make page_pool_dma_map return boolean] Link: https://lkml.kernel.org/r/20210325114228.27719-9-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Ilias Apalodimas Reviewed-by: Alexander Lobakin Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- net/core/page_pool.c | 45 +++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 6e8cee85429b..4bf9c4c9ca2b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -176,14 +176,37 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool, pool->p.dma_dir); } +static bool page_pool_dma_map(struct page_pool *pool, struct page *page) +{ + dma_addr_t dma; + + /* Setup DMA mapping: use 'struct page' area for storing DMA-addr + * since dma_addr_t can be either 32 or 64 bits and does not always fit + * into page private data (i.e 32bit cpu with 64bit DMA caps) + * This mapping is kept for lifetime of page, until leaving pool. 
+ */ + dma = dma_map_page_attrs(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); + if (dma_mapping_error(pool->p.dev, dma)) + return false; + + page->dma_addr = dma; + + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + + return true; +} + /* slow path */ noinline static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, gfp_t _gfp) { + unsigned int pp_flags = pool->p.flags; struct page *page; gfp_t gfp = _gfp; - dma_addr_t dma; /* We could always set __GFP_COMP, and avoid this branch, as * prep_new_page() can handle order-0 with __GFP_COMP. @@ -207,30 +230,14 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, if (!page) return NULL; - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) - goto skip_dma_map; - - /* Setup DMA mapping: use 'struct page' area for storing DMA-addr - * since dma_addr_t can be either 32 or 64 bits and does not always fit - * into page private data (i.e 32bit cpu with 64bit DMA caps) - * This mapping is kept for lifetime of page, until leaving pool. - */ - dma = dma_map_page_attrs(pool->p.dev, page, 0, - (PAGE_SIZE << pool->p.order), - pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); - if (dma_mapping_error(pool->p.dev, dma)) { + if ((pp_flags & PP_FLAG_DMA_MAP) && + unlikely(!page_pool_dma_map(pool, page))) { put_page(page); return NULL; } - page_pool_set_dma_addr(page, dma); - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, pool->p.max_len); - -skip_dma_map: /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); /* When page just alloc'ed is should/must have refcnt 1. */ -- Gitee From a1f6aeef9baabf607eba368dd9a7d6c02a735efb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 29 Apr 2021 23:02:07 -0700 Subject: [PATCH 21/33] net: page_pool: use alloc_pages_bulk in refill code path There are cases where the page_pool need to refill with pages from the page allocator. Some workloads cause the page_pool to release pages instead of recycling these pages. For these workload it can improve performance to bulk alloc pages from the page-allocator to refill the alloc cache. For XDP-redirect workload with 100G mlx5 driver (that use page_pool) redirecting xdp_frame packets into a veth, that does XDP_PASS to create an SKB from the xdp_frame, which then cannot return the page to the page_pool. Performance results under GitHub xdp-project[1]: [1] https://github.com/xdp-project/xdp-project/blob/master/areas/mem/page_pool06_alloc_pages_bulk.org Mel: The patch "net: page_pool: convert to use alloc_pages_bulk_array variant" was squashed with this patch. From the test page, the array variant was superior with one of the test results as follows. 
Kernel XDP stats CPU pps Delta Baseline XDP-RX CPU total 3,771,046 n/a List XDP-RX CPU total 3,940,242 +4.49% Array XDP-RX CPU total 4,249,224 +12.68% Link: https://lkml.kernel.org/r/20210325114228.27719-10-mgorman@techsingularity.net Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Mel Gorman Reviewed-by: Alexander Lobakin Cc: Alexander Duyck Cc: Christoph Hellwig Cc: Chuck Lever Cc: David Miller Cc: Ilias Apalodimas Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: hongrongxuan --- include/net/page_pool.h | 2 +- net/core/page_pool.c | 82 ++++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index cae58b248445..1a5eb84c4e69 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -65,7 +65,7 @@ #define PP_ALLOC_CACHE_REFILL 64 struct pp_alloc_cache { u32 count; - void *cache[PP_ALLOC_CACHE_SIZE]; + struct page *cache[PP_ALLOC_CACHE_SIZE]; }; struct page_pool_params { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 4bf9c4c9ca2b..4c7457eb7894 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -199,38 +199,17 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) return true; } -/* slow path */ -noinline -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, - gfp_t _gfp) +static struct page *__page_pool_alloc_page_order(struct page_pool *pool, + gfp_t gfp) { - unsigned int pp_flags = pool->p.flags; struct page *page; - gfp_t gfp = _gfp; - - /* We could always set __GFP_COMP, and avoid this branch, as - * prep_new_page() can handle order-0 with __GFP_COMP. - */ - if (pool->p.order) - gfp |= __GFP_COMP; - - /* FUTURE development: - * - * Current slow-path essentially falls back to single page - * allocations, which doesn't improve performance. This code - * need bulk allocation support from the page allocator code. 
- */ - /* Cache was empty, do real allocation */ -#ifdef CONFIG_NUMA + gfp |= __GFP_COMP; page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); -#else - page = alloc_pages(gfp, pool->p.order); -#endif - if (!page) + if (unlikely(!page)) return NULL; - if ((pp_flags & PP_FLAG_DMA_MAP) && + if ((pool->p.flags & PP_FLAG_DMA_MAP) && unlikely(!page_pool_dma_map(pool, page))) { put_page(page); return NULL; @@ -239,6 +218,57 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t gfp) +{ + const int bulk = PP_ALLOC_CACHE_REFILL; + unsigned int pp_flags = pool->p.flags; + unsigned int pp_order = pool->p.order; + struct page *page; + int i, nr_pages; + + /* Don't support bulk alloc for high-order pages */ + if (unlikely(pp_order)) + return __page_pool_alloc_page_order(pool, gfp); + + /* Unnecessary as alloc cache is empty, but guarantees zero count */ + if (unlikely(pool->alloc.count > 0)) + return pool->alloc.cache[--pool->alloc.count]; + + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); + + nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache); + if (unlikely(!nr_pages)) + return NULL; + + /* Pages have been filled into alloc.cache array, but count is zero and + * page element have not been (possibly) DMA mapped. + */ + for (i = 0; i < nr_pages; i++) { + page = pool->alloc.cache[i]; + if ((pp_flags & PP_FLAG_DMA_MAP) && + unlikely(!page_pool_dma_map(pool, page))) { + put_page(page); + continue; + } + pool->alloc.cache[pool->alloc.count++] = page; + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, page, + pool->pages_state_hold_cnt); + } + + /* Return last page */ + if (likely(pool->alloc.count > 0)) + page = pool->alloc.cache[--pool->alloc.count]; + else + page = NULL; /* When page just alloc'ed is should/must have refcnt 1. */ return page; -- Gitee From 4dc88cbfa8b8ef1d29a5aa219d1fe0e8ba434736 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 11 Nov 2021 16:29:06 +0800 Subject: [PATCH 22/33] net: page_pool: Add bulk support for ptr_ring Introduce the capability to batch page_pool ptr_ring refill since it is usually run inside the driver NAPI tx completion loop. 
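A hedged sketch of the intended consumer pattern (the helper name and the
chunk size of 16 are assumptions for illustration; the original patch's
user is the XDP frame bulk-return path in net/core/xdp.c, whose hunks are
dropped from this backport as noted below):

  /* Collect the payload addresses of completed TX buffers and return
   * them to the pool in chunks, taking the ptr_ring producer lock once
   * per chunk instead of once per page.
   */
  static void example_bulk_return(struct page_pool *pool,
                                  void **frames, int nr_frames)
  {
          int i;

          for (i = 0; i < nr_frames; i += 16) {
                  int n = min(nr_frames - i, 16);

                  page_pool_put_page_bulk(pool, &frames[i], n);
          }
  }
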
Suggested-by: Jesper Dangaard Brouer Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Ilias Apalodimas Link: https://lore.kernel.org/bpf/08dd249c9522c001313f520796faa777c4089e1c.1605267335.git.lorenzo@kernel.org Reviewed-by: Yongxin Li (fix conflicts: delete modifications to net/core/xdp.c) Signed-off-by: hongrongxuan --- include/net/page_pool.h | 26 +++++++++++++++ net/core/page_pool.c | 70 +++++++++++++++++++++++++++++++++++------ 2 files changed, 86 insertions(+), 10 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1a5eb84c4e69..c904bede4d7b 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -150,6 +150,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); void page_pool_release_page(struct page_pool *pool, struct page *page); +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -163,6 +165,11 @@ static inline void page_pool_release_page(struct page_pool *pool, struct page *page) { } + +static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ +} #endif void page_pool_put_page(struct page_pool *pool, struct page *page, @@ -223,4 +230,23 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } + +static inline void page_pool_ring_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); +} + +static inline void page_pool_ring_unlock(struct page_pool *pool) + __releases(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 4c7457eb7894..9735608f33b5 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,8 @@ #include #include +#include + #include #include #include @@ -397,8 +399,9 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) * If the page refcnt != 1, then the page will be returned to memory * subsystem. 
*/ -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -414,15 +417,12 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq()) - if (page_pool_recycle_in_cache(page, pool)) - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; - if (!page_pool_recycle_in_ring(pool, page)) { - /* Cache full, fallback to free pages */ - page_pool_return_page(pool, page); - } - return; + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. * @@ -440,9 +440,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_put_page); +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; -- Gitee From cad7de47fc70038e2259c1d9f9fff5d7cf3ab0d5 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Wed, 6 Jan 2021 14:18:36 -0800 Subject: [PATCH 23/33] skbuff: Call skb_zcopy_clear() before unref'ing fragments RX zerocopy fragment pages which are not allocated from the system page pool require special handling. Give the callback in skb_zcopy_clear() a chance to process them first. 
Signed-off-by: Jonathan Lemon Signed-off-by: Jakub Kicinski Signed-off-by: hongrongxuan --- net/core/skbuff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 80ae7756ffdb..0bdbb4062092 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -619,13 +619,14 @@ static void skb_release_data(struct sk_buff *skb) &shinfo->dataref)) return; + skb_zcopy_clear(skb, true); + for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); - skb_zcopy_clear(skb, true); skb_free_head(skb); } -- Gitee From ba932be33d334abba0ec4f9ff0072ad4c6f1604e Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 2 Feb 2021 13:31:46 +0000 Subject: [PATCH 24/33] net: page_pool: simplify page recycling condition tests pool_page_reusable() is a leftover from pre-NUMA-aware times. For now, this function is just a redundant wrapper over page_is_pfmemalloc(), so inline it into its sole call site. Signed-off-by: Alexander Lobakin Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Reviewed-by: Jesse Brandeburg Acked-by: David Rientjes Signed-off-by: Jakub Kicinski Signed-off-by: hongrongxuan --- net/core/page_pool.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9735608f33b5..ead428442521 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -385,14 +385,6 @@ static bool page_pool_recycle_in_cache(struct page *page, return true; } -/* page is NOT reusable when: - * 1) allocated when system is under some pressure. (page_is_pfmemalloc) - */ -static bool pool_page_reusable(struct page_pool *pool, struct page *page) -{ - return !page_is_pfmemalloc(page); -} - /* If the page refcnt == 1, this will try to recycle the page. * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). @@ -408,9 +400,11 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. + * + * page is NOT reusable when allocated when system is under + * some pressure. (page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1 && - pool_page_reusable(pool, page))) { + if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { /* Read barrier done in page_ref_count / READ_ONCE */ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) -- Gitee From d974f3cb109885535cf73ee6bd549706cd962dc8 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 7 Jun 2021 21:02:36 +0200 Subject: [PATCH 25/33] mm: add a signature in struct page This is needed by the page_pool to avoid recycling a page not allocated via page_pool. The page->signature field is aliased to page->lru.next and page->compound_head, but it can't be set by mistake because the signature value is a bad pointer, and can't trigger a false positive in PageTail() because the last bit is 0. Co-developed-by: Matthew Wilcox (Oracle) Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Matteo Croce Signed-off-by: David S. 
Miller Signed-off-by: hongrongxuan --- include/linux/mm.h | 11 ++++++----- include/linux/mm_types.h | 7 +++++++ include/linux/poison.h | 3 +++ net/core/page_pool.c | 6 ++++++ 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d48010d709b8..8ff12971a0d2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1440,10 +1440,11 @@ struct address_space *page_mapping_file(struct page *page); static inline bool page_is_pfmemalloc(const struct page *page) { /* - * Page index cannot be this large so this must be - * a pfmemalloc page. + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. */ - return page->index == -1UL; + return (uintptr_t)page->lru.next & BIT(1); } /* @@ -1452,12 +1453,12 @@ static inline bool page_is_pfmemalloc(const struct page *page) */ static inline void set_page_pfmemalloc(struct page *page) { - page->index = -1UL; + page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { - page->index = 0; + page->lru.next = NULL; } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d71a92cfc059..5525c8ae2bed 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -95,6 +95,13 @@ struct page { unsigned long private; }; struct { /* page_pool used by netstack */ + /** + * @pp_magic: magic value to avoid recycling non + * page_pool allocated pages. + */ + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; /** * @dma_addr: might require a 64-bit value on * 32-bit architectures. diff --git a/include/linux/poison.h b/include/linux/poison.h index df34330b4e34..000bd3a5acf6 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -86,4 +86,7 @@ /********** security/ **********/ #define KEY_DESTROY 0xbd +/********** net/core/page_pool.c **********/ +#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) + #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ead428442521..0492d6ae0716 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -17,6 +17,7 @@ #include #include #include /* for __put_page() */ +#include #include @@ -217,6 +218,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } + page->pp_magic |= PP_SIGNATURE; + /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); @@ -259,6 +262,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } + page->pp_magic |= PP_SIGNATURE; pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -337,6 +341,8 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: + page->pp_magic = 0; + /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ -- Gitee From 19100feff078a3b40188cbaedd6dd2a77983b9fc Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 7 Jun 2021 21:02:37 +0200 Subject: [PATCH 26/33] skbuff: add a parameter to __skb_frag_unref This is a prerequisite patch, the next one is enabling recycling of skbs and fragments. Add an extra argument on __skb_frag_unref() to handle recycling, and update the current users of the function with that. 
Signed-off-by: Matteo Croce Signed-off-by: David S. Miller Signed-off-by: hongrongxuan --- drivers/net/ethernet/marvell/sky2.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- include/linux/skbuff.h | 8 +++++--- net/core/skbuff.c | 4 ++-- net/tls/tls_device.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index df7c23cd3360..93f38aca37a5 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -2501,7 +2501,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, if (length == 0) { /* don't need this page */ - __skb_frag_unref(frag); + __skb_frag_unref(frag, false); --skb_shinfo(skb)->nr_frags; } else { size = min(length, (unsigned) PAGE_SIZE); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index f9797e503884..6ad3ca63ee2d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -526,7 +526,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, fail: while (nr > 0) { nr--; - __skb_frag_unref(skb_shinfo(skb)->frags + nr); + __skb_frag_unref(skb_shinfo(skb)->frags + nr, false); } return 0; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 357b1abe8695..faf2975a48a8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3018,10 +3018,12 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment + * @recycle: recycle the page if allocated via page_pool * - * Releases a reference on the paged fragment @frag. + * Releases a reference on the paged fragment @frag + * or recycles the page via the page_pool API. 
*/ -static inline void __skb_frag_unref(skb_frag_t *frag) +static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { put_page(skb_frag_page(frag)); } @@ -3035,7 +3037,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag) */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { - __skb_frag_unref(&skb_shinfo(skb)->frags[f]); + __skb_frag_unref(&skb_shinfo(skb)->frags[f], false); } /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0bdbb4062092..04447561ea98 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -622,7 +622,7 @@ static void skb_release_data(struct sk_buff *skb) skb_zcopy_clear(skb, true); for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i]); + __skb_frag_unref(&shinfo->frags[i], false); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); @@ -3438,7 +3438,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom); + __skb_frag_unref(fragfrom, false); } /* Reposition in the original skb */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 2c3cf47d730b..4b9c3c1460f5 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -128,7 +128,7 @@ static void destroy_record(struct tls_record_info *record) int i; for (i = 0; i < record->num_frags; i++) - __skb_frag_unref(&record->frags[i]); + __skb_frag_unref(&record->frags[i], false); kfree(record); } -- Gitee From 60b401eed377ff34b4a9ab0e4d1e155016120f71 Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Mon, 7 Jun 2021 21:02:38 +0200 Subject: [PATCH 27/33] page_pool: Allow drivers to hint on SKB recycling Up to now several high speed NICs have custom mechanisms of recycling the allocated memory they use for their payloads. Our page_pool API already has recycling capabilities that are always used when we are running in 'XDP mode'. So let's tweak the API and the kernel network stack slightly and allow the recycling to happen even during the standard operation. The API doesn't take into account 'split page' policies used by those drivers currently, but can be extended once we have users for that. The idea is to be able to intercept the packet on skb_release_data(). If it's a buffer coming from our page_pool API recycle it back to the pool for further usage or just release the packet entirely. To achieve that we introduce a bit in struct sk_buff (pp_recycle:1) and a field in struct page (page->pp) to store the page_pool pointer. Storing the information in page->pp allows us to recycle both SKBs and their fragments. We could have skipped the skb bit entirely, since identical information can bederived from struct page. However, in an effort to affect the free path as less as possible, reading a single bit in the skb which is already in cache, is better that trying to derive identical information for the page stored data. The driver or page_pool has to take care of the sync operations on it's own during the buffer recycling since the buffer is, after opting-in to the recycling, never unmapped. Since the gain on the drivers depends on the architecture, we are not enabling recycling by default if the page_pool API is used on a driver. In order to enable recycling the driver must call skb_mark_for_recycle() to store the information we need for recycling in page->pp and enabling the recycling bit, or page_pool_store_mem_info() for a fragment. 
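A minimal driver-RX sketch of the opt-in (assumes CONFIG_PAGE_POOL=y; the
function and variable names are illustrative, and a real driver would also
handle headroom, truesize and DMA syncing):

  static struct sk_buff *example_rx_build_skb(struct page_pool *pool,
                                              struct page *page,
                                              unsigned int len)
  {
          struct sk_buff *skb;

          /* page was obtained from page_pool_dev_alloc_pages(pool) */
          skb = build_skb(page_address(page), PAGE_SIZE);
          if (!skb)
                  return NULL;

          skb_put(skb, len);
          /* Record the pool in page->pp and set skb->pp_recycle so
           * that skb_release_data()/__skb_frag_unref() can hand the
           * memory back to the pool instead of freeing it.
           */
          skb_mark_for_recycle(skb, page, pool);
          return skb;
  }
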
Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Co-developed-by: Matteo Croce Signed-off-by: Matteo Croce Signed-off-by: Ilias Apalodimas Signed-off-by: David S. Miller Signed-off-by: hongrongxuan Conflicts: include/linux/skbuff.h --- include/linux/skbuff.h | 33 ++++++++++++++++++++++++++++++--- include/net/page_pool.h | 9 +++++++++ net/core/page_pool.c | 22 ++++++++++++++++++++++ net/core/skbuff.c | 20 ++++++++++++++++---- 4 files changed, 77 insertions(+), 7 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index faf2975a48a8..a5920a81fca3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -37,6 +37,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif @@ -652,6 +653,8 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves + * @pp_recycle: mark the packet for recycling instead of freeing (implies + * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed @@ -760,10 +763,12 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, - pfmemalloc:1; + pfmemalloc:1, + pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif + /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ @@ -3025,7 +3030,13 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - put_page(skb_frag_page(frag)); + struct page *page = skb_frag_page(frag); + +#ifdef CONFIG_PAGE_POOL + if (recycle && page_pool_return_skb_page(page)) + return; +#endif + put_page(page); } /** @@ -3037,7 +3048,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { - __skb_frag_unref(&skb_shinfo(skb)->frags[f], false); + __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle); } /** @@ -4606,5 +4617,21 @@ static inline bool skb_csum_is_sctp(struct sk_buff *skb) return skb->csum_not_inet; } +#ifdef CONFIG_PAGE_POOL +static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page, + struct page_pool *pp) +{ + skb->pp_recycle = 1; + page_pool_store_mem_info(page, pp); +} +#endif + +static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) +{ + if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) + return false; + return page_pool_return_skb_page(virt_to_page(data)); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/page_pool.h b/include/net/page_pool.h index c904bede4d7b..0106d6059938 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -144,6 +144,8 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) return pool->p.dma_dir; } +bool page_pool_return_skb_page(struct page *page); + struct page_pool *page_pool_create(const struct page_pool_params *params); #ifdef CONFIG_PAGE_POOL @@ -249,4 +251,11 @@ static inline void page_pool_ring_unlock(struct page_pool *pool) spin_unlock_bh(&pool->ring.producer_lock); } +/* Store mem_info on struct page and use it while recycling skb frags */ +static inline +void page_pool_store_mem_info(struct page *page, struct page_pool *pp) +{ + page->pp = pp; +} + 
#endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 0492d6ae0716..33cdf55907b3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -615,3 +615,25 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool page_pool_return_skb_page(struct page *page) +{ + struct page_pool *pp; + + page = compound_head(page); + if (unlikely(page->pp_magic != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page->pp = NULL; + page_pool_put_full_page(pp, page, false); + + return true; +} +EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 04447561ea98..c4a3c8d983b9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -603,10 +604,13 @@ static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; - if (skb->head_frag) + if (skb->head_frag) { + if (skb_pp_recycle(skb, head)) + return; skb_free_frag(head); - else + } else { kfree(head); + } } static void skb_release_data(struct sk_buff *skb) @@ -622,7 +626,7 @@ static void skb_release_data(struct sk_buff *skb) skb_zcopy_clear(skb, true); for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i], false); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); @@ -1021,6 +1025,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->nohdr = 0; n->peeked = 0; C(pfmemalloc); + C(pp_recycle); n->destructor = NULL; C(tail); C(end); @@ -3438,7 +3443,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom, false); + __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ @@ -5093,6 +5098,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; + /* The page pool signature of struct page will eventually figure out + * which pages can be recycled or not but for now let's prohibit slab + * allocated and page_pool allocated SKBs from being coalesced. + */ + if (to->pp_recycle != from->pp_recycle) + return false; + if (len <= skb_tailroom(to)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); -- Gitee From 15e99e06c5ce3e5db81531716329250030e6b1c6 Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Thu, 11 Nov 2021 16:29:17 +0800 Subject: [PATCH 28/33] skbuff: Fix a potential race while recycling page_pool packets mainline inclusion from mainline-v5.14-rc3 commit 2cc3aeb5eccc category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4CVS3 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2cc3aeb5eccc ---------------------------------------------------------------------- As Alexander points out, when we are trying to recycle a cloned/expanded SKB we might trigger a race. The recycling code relies on the pp_recycle bit to trigger, which we carry over to cloned SKBs. 
If that cloned SKB gets expanded or if we get references to the frags, call skb_release_data() and overwrite skb->head, we are creating separate instances accessing the same page frags. Since the skb_release_data() will first try to recycle the frags, there's a potential race between the original and cloned SKB, since both will have the pp_recycle bit set. Fix this by explicitly those SKBs not recyclable. The atomic_sub_return effectively limits us to a single release case, and when we are calling skb_release_data we are also releasing the option to perform the recycling, or releasing the pages from the page pool. Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reported-by: Alexander Duyck Suggested-by: Alexander Duyck Reviewed-by: Alexander Duyck Acked-by: Jesper Dangaard Brouer Signed-off-by: Ilias Apalodimas Signed-off-by: David S. Miller Reviewed-by: Yongxin Li Signed-off-by: Junxin Chen Signed-off-by: Zheng Zengkai Signed-off-by: hongrongxuan --- net/core/skbuff.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c4a3c8d983b9..2d4899325cba 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -621,7 +621,7 @@ static void skb_release_data(struct sk_buff *skb) if (skb->cloned && atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, &shinfo->dataref)) - return; + goto exit; skb_zcopy_clear(skb, true); @@ -632,6 +632,17 @@ static void skb_release_data(struct sk_buff *skb) kfree_skb_list(shinfo->frag_list); skb_free_head(skb); +exit: + /* When we clone an SKB we copy the reycling bit. The pp_recycle + * bit is only set on the head though, so in order to avoid races + * while trying to recycle fragments on __skb_frag_unref() we need + * to make one SKB responsible for triggering the recycle path. + * So disable the recycling bit if an SKB is cloned and we have + * additional references to to the fragmented part of the SKB. + * Eventually the last SKB will have the recycling bit set and it's + * dataref set to 0, which will trigger the recycling + */ + skb->pp_recycle = 0; } /* -- Gitee From 155f4e45fbc78322bf8e5b06e96dab4c53d98280 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 6 Aug 2021 09:39:07 +0800 Subject: [PATCH 29/33] page_pool: mask the page->signature before the checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As mentioned in commit c07aea3ef4d4 ("mm: add a signature in struct page"): "The page->signature field is aliased to page->lru.next and page->compound_head." And as the comment in page_is_pfmemalloc(): "lru.next has bit 1 set if the page is allocated from the pfmemalloc reserves. Callers may simply overwrite it if they do not need to preserve that information." The page->signature is OR’ed with PP_SIGNATURE when a page is allocated in page pool, see __page_pool_alloc_pages_slow(), and page->signature is checked directly with PP_SIGNATURE in page_pool_return_skb_page(), which might cause resoure leaking problem for a page from page pool if bit 1 of lru.next is set for a pfmemalloc page. What happens here is that the original pp->signature is OR'ed with PP_SIGNATURE after the allocation in order to preserve any existing bits(such as the bit 1, used to indicate a pfmemalloc page), so when those bits are present, those page is not considered to be from page pool and the DMA mapping of those pages will be left stale. 
As bit 0 is for page->compound_head, So mask both bit 0/1 before the checking in page_pool_return_skb_page(). And we will return those pfmemalloc pages back to the page allocator after cleaning up the DMA mapping. Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reviewed-by: Ilias Apalodimas Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller Signed-off-by: hongrongxuan --- net/core/page_pool.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 33cdf55907b3..9f1002ee22ba 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -621,7 +621,15 @@ bool page_pool_return_skb_page(struct page *page) struct page_pool *pp; page = compound_head(page); - if (unlikely(page->pp_magic != PP_SIGNATURE)) + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) return false; pp = page->pp; -- Gitee From 7f5aee52d53d0ec25f484e4ea1b907877dcbc562 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 6 Aug 2021 10:46:19 +0800 Subject: [PATCH 30/33] page_pool: keep pp info as long as page pool owns the page Currently, page->pp is cleared and set everytime the page is recycled, which is unnecessary. So only set the page->pp when the page is added to the page pool and only clear it when the page is released from the page pool. This is also a preparation to support allocating frag page in page pool. 
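With the pool now recorded in page->pp for as long as the pool owns the
page, the driver-side opt-in from the earlier sketch shrinks to a single
call (illustrative fragment):

  /* page and pool no longer need to be passed; page->pp is set by the
   * new page_pool_set_pp_info() helper at allocation time.
   */
  skb_mark_for_recycle(skb);
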
Reviewed-by: Ilias Apalodimas Signed-off-by: Yunsheng Lin Signed-off-by: Jakub Kicinski Signed-off-by: hongrongxuan --- include/linux/skbuff.h | 4 +--- include/net/page_pool.h | 7 ------- net/core/page_pool.c | 21 +++++++++++++++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a5920a81fca3..baf5790e3ada 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4618,11 +4618,9 @@ static inline bool skb_csum_is_sctp(struct sk_buff *skb) } #ifdef CONFIG_PAGE_POOL -static inline void skb_mark_for_recycle(struct sk_buff *skb, struct page *page, - struct page_pool *pp) +static inline void skb_mark_for_recycle(struct sk_buff *skb) { skb->pp_recycle = 1; - page_pool_store_mem_info(page, pp); } #endif diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 0106d6059938..42030381cae7 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -251,11 +251,4 @@ static inline void page_pool_ring_unlock(struct page_pool *pool) spin_unlock_bh(&pool->ring.producer_lock); } -/* Store mem_info on struct page and use it while recycling skb frags */ -static inline -void page_pool_store_mem_info(struct page *page, struct page_pool *pp) -{ - page->pp = pp; -} - #endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9f1002ee22ba..c36cbdd7153e 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -202,6 +202,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) return true; } +static void page_pool_set_pp_info(struct page_pool *pool, + struct page *page) +{ + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; +} + +static void page_pool_clear_pp_info(struct page *page) +{ + page->pp_magic = 0; + page->pp = NULL; +} + static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -218,7 +231,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } - page->pp_magic |= PP_SIGNATURE; + page_pool_set_pp_info(pool, page); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -262,7 +275,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } - page->pp_magic |= PP_SIGNATURE; + + page_pool_set_pp_info(pool, page); pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -341,7 +355,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: - page->pp_magic = 0; + page_pool_clear_pp_info(page); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. @@ -639,7 +653,6 @@ bool page_pool_return_skb_page(struct page *page) * The page will be returned to the pool here regardless of the * 'flipped' fragment being in use or not. */ - page->pp = NULL; page_pool_put_full_page(pp, page, false); return true; -- Gitee From 7758d43a336f3808e82b4769f8f7e3c3f6ca4006 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 6 Aug 2021 10:46:20 +0800 Subject: [PATCH 31/33] page_pool: add interface to manipulate frag count in page pool For 32 bit systems with 64 bit dma, dma_addr[1] is used to store the upper 32 bit dma addr, those system should be rare those days. 
For normal system, the dma_addr[1] in 'struct page' is not used, so we can reuse dma_addr[1] for storing frag count, which means how many frags this page might be splited to. In order to simplify the page frag support in the page pool, the PAGE_POOL_DMA_USE_PP_FRAG_COUNT macro is added to indicate the 32 bit systems with 64 bit dma, and the page frag support in page pool is disabled for such system. The newly added page_pool_set_frag_count() is called to reserve the maximum frag count before any page frag is passed to the user. The page_pool_atomic_sub_frag_count_return() is called when user is done with the page frag. Signed-off-by: Yunsheng Lin Signed-off-by: Jakub Kicinski Signed-off-by: hongrongxuan --- include/linux/mm_types.h | 18 +++++++++++----- include/net/page_pool.h | 46 ++++++++++++++++++++++++++++++++++------ net/core/page_pool.c | 4 ++++ 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5525c8ae2bed..c2a97c460197 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -102,11 +102,19 @@ struct page { unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad; - /** - * @dma_addr: might require a 64-bit value on - * 32-bit architectures. - */ - unsigned long dma_addr[2]; + unsigned long dma_addr; + union { + /** + * dma_addr_upper: might require a 64-bit + * value on 32-bit architectures. + */ + unsigned long dma_addr_upper; + /** + * For frag page support, not supported in + * 32-bit architectures with 64-bit DMA. + */ + atomic_long_t pp_frag_count; + }; }; struct { /* slab, slob and slub */ union { diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 42030381cae7..fb2bf3a1bfc1 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -45,7 +45,10 @@ * Please note DMA-sync-for-CPU is still * device driver responsibility */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) +#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ + PP_FLAG_DMA_SYNC_DEV |\ + PP_FLAG_PAGE_FRAG) /* * Fast allocation side cache array/stack @@ -196,19 +199,48 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } +#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ + (sizeof(dma_addr_t) > sizeof(unsigned long)) + static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - dma_addr_t ret = page->dma_addr[0]; - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16; + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; + return ret; } static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { - page->dma_addr[0] = addr; - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - page->dma_addr[1] = upper_32_bits(addr); + page->dma_addr = addr; + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + page->dma_addr_upper = upper_32_bits(addr); +} + +static inline void page_pool_set_frag_count(struct page *page, long nr) +{ + atomic_long_set(&page->pp_frag_count, nr); +} + +static inline long page_pool_atomic_sub_frag_count_return(struct page *page, + long nr) +{ + long ret; + + /* As suggested by Alexander, atomic_long_read() may cover up the + * reference count errors, so avoid calling atomic_long_read() in + * the cases of freeing or draining the page_frags, where we would + * not expect it to match or that are slowpath anyway. 
+	 */
+	if (__builtin_constant_p(nr) &&
+	    atomic_long_read(&page->pp_frag_count) == nr)
+		return 0;
+
+	ret = atomic_long_sub_return(nr, &page->pp_frag_count);
+	WARN_ON(ret < 0);
+	return ret;
 }
 
 static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index c36cbdd7153e..c0a879ee1c95 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -65,6 +65,10 @@ static int page_pool_init(struct page_pool *pool,
 		 */
 	}
 
+	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
+	    pool->p.flags & PP_FLAG_PAGE_FRAG)
+		return -EINVAL;
+
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
--
Gitee

From 4619d359f695963361bfc7ad4116c2f19036e501 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin
Date: Fri, 6 Aug 2021 10:46:21 +0800
Subject: [PATCH 32/33] page_pool: add frag page recycling support in page pool

Currently the page pool only supports page recycling when there is only
one user of the page, and the split-page reuse implemented in most
drivers cannot use the page pool, as the ping-pong style of reuse
requires multi-user support in the page pool.

Such reuse or recycling currently has the following limitations:

1. A page from the page pool can only be used by one user for page
   recycling to happen.

2. The ping-pong style of reuse in most drivers does not support
   multiple descriptors using different parts of the same page in order
   to save memory.

So add multi-user support and frag page recycling to the page pool to
overcome the above limitations.

Signed-off-by: Yunsheng Lin
Signed-off-by: Jakub Kicinski
Signed-off-by: hongrongxuan
---
 include/net/page_pool.h | 15 +++++++
 net/core/page_pool.c    | 87 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index fb2bf3a1bfc1..2b1da328370c 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -91,6 +91,9 @@ struct page_pool {
 	unsigned long defer_warn;
 
 	u32 pages_state_hold_cnt;
+	unsigned int frag_offset;
+	struct page *frag_page;
+	long frag_users;
 
 	/*
 	 * Data structure for allocation side
@@ -138,6 +141,18 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 	return page_pool_alloc_pages(pool, gfp);
 }
 
+struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
+				  unsigned int size, gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
+						    unsigned int *offset,
+						    unsigned int size)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc_frag(pool, offset, size, gfp);
+}
+
 /* get the stored dma direction. A driver might decide to treat this locally and
  * avoid the extra cache line from page_pool to determine the direction
  */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index c0a879ee1c95..bde7a3076751 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -24,6 +24,8 @@
 #define DEFER_TIME (msecs_to_jiffies(1000))
 #define DEFER_WARN_INTERVAL (60 * HZ)
 
+#define BIAS_MAX	LONG_MAX
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -419,6 +421,11 @@ static __always_inline struct page *
 __page_pool_put_page(struct page_pool *pool, struct page *page,
 		     unsigned int dma_sync_size, bool allow_direct)
 {
+	/* It is not the last user for the page frag case */
+	if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
+	    page_pool_atomic_sub_frag_count_return(page, 1))
+		return NULL;
+
 	/* This allocator is optimized for the XDP mode that uses
 	 * one-frame-per-page, but have fallbacks that act like the
 	 * regular page allocator APIs.
@@ -511,6 +518,84 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 }
 EXPORT_SYMBOL(page_pool_put_page_bulk);
 
+static struct page *page_pool_drain_frag(struct page_pool *pool,
+					 struct page *page)
+{
+	long drain_count = BIAS_MAX - pool->frag_users;
+
+	/* Some user is still using the page frag */
+	if (likely(page_pool_atomic_sub_frag_count_return(page,
+							  drain_count)))
+		return NULL;
+
+	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
+		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+			page_pool_dma_sync_for_device(pool, page, -1);
+
+		return page;
+	}
+
+	page_pool_return_page(pool, page);
+	return NULL;
+}
+
+static void page_pool_free_frag(struct page_pool *pool)
+{
+	long drain_count = BIAS_MAX - pool->frag_users;
+	struct page *page = pool->frag_page;
+
+	pool->frag_page = NULL;
+
+	if (!page ||
+	    page_pool_atomic_sub_frag_count_return(page, drain_count))
+		return;
+
+	page_pool_return_page(pool, page);
+}
+
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+				  unsigned int *offset,
+				  unsigned int size, gfp_t gfp)
+{
+	unsigned int max_size = PAGE_SIZE << pool->p.order;
+	struct page *page = pool->frag_page;
+
+	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+		    size > max_size))
+		return NULL;
+
+	size = ALIGN(size, dma_get_cache_alignment());
+	*offset = pool->frag_offset;
+
+	if (page && *offset + size > max_size) {
+		page = page_pool_drain_frag(pool, page);
+		if (page)
+			goto frag_reset;
+	}
+
+	if (!page) {
+		page = page_pool_alloc_pages(pool, gfp);
+		if (unlikely(!page)) {
+			pool->frag_page = NULL;
+			return NULL;
+		}
+
+		pool->frag_page = page;
+
+frag_reset:
+		pool->frag_users = 1;
+		*offset = 0;
+		pool->frag_offset = size;
+		page_pool_set_frag_count(page, BIAS_MAX);
+		return page;
+	}
+
+	pool->frag_users++;
+	pool->frag_offset = *offset + size;
+	return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
 static void page_pool_empty_ring(struct page_pool *pool)
 {
 	struct page *page;
@@ -607,6 +692,8 @@ void page_pool_destroy(struct page_pool *pool)
 	if (!page_pool_put(pool))
 		return;
 
+	page_pool_free_frag(pool);
+
 	if (!page_pool_release(pool))
 		return;
 
--
Gitee

From c23d33908b5aa8b6bd7e568dfabdfaf8297125c5 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin
Date: Tue, 24 Aug 2021 17:06:49 +0800
Subject: [PATCH 33/33] page_pool: use relaxed atomic for release side accounting

There is no need to synchronize the accounting update, so use a relaxed
atomic to avoid memory barriers in the data path.
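(As a rough userspace analogy, illustrative only and using C11 atomics
rather than the kernel atomic API: a pure statistics counter needs
atomicity but no ordering, which is exactly what a relaxed increment
provides.)

  #include <stdatomic.h>
  #include <stdio.h>

  static atomic_long release_cnt;

  static long bump_release_cnt(void)
  {
          /* Nothing is published or consumed through this counter, so
           * memory_order_relaxed is sufficient; fetch_add returns the
           * old value, hence the +1 to mimic an inc-and-return.
           */
          return atomic_fetch_add_explicit(&release_cnt, 1,
                                           memory_order_relaxed) + 1;
  }

  int main(void)
  {
          printf("count=%ld\n", bump_release_cnt());
          return 0;
  }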
Acked-by: Jesper Dangaard Brouer
Signed-off-by: Yunsheng Lin
Acked-by: Ilias Apalodimas
Signed-off-by: David S. Miller
Signed-off-by: hongrongxuan
---
 net/core/page_pool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index bde7a3076751..381705fed4db 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -366,7 +366,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
 	 */
-	count = atomic_inc_return(&pool->pages_state_release_cnt);
+	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
 	trace_page_pool_state_release(pool, page, count);
 }
 EXPORT_SYMBOL(page_pool_release_page);
--
Gitee
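The sketch below is illustrative only and not part of any patch above; it
shows roughly how a driver might fill an rx descriptor from the frag API
added in [PATCH 32/33]. The names my_desc, my_alloc_rx_buffer and
rx_buf_len are hypothetical, the pool is assumed to have been created with
PP_FLAG_PAGE_FRAG (and PP_FLAG_DMA_MAP), and real drivers add DMA syncing
and more careful error handling.

  #include <net/page_pool.h>

  struct my_desc {
          dma_addr_t addr;
          u32 len;
  };

  static int my_alloc_rx_buffer(struct page_pool *pool, struct my_desc *desc,
                                unsigned int rx_buf_len)
  {
          unsigned int offset;
          struct page *page;

          /* Several descriptors may share one page at different offsets;
           * the pool tracks the per-page frag count internally.
           */
          page = page_pool_dev_alloc_frag(pool, &offset, rx_buf_len);
          if (!page)
                  return -ENOMEM;

          desc->addr = page_pool_get_dma_addr(page) + offset;
          desc->len = rx_buf_len;
          return 0;
  }

When such a buffer is consumed or dropped, it still goes back through the
normal page_pool put path; the frag count added in [PATCH 31/33] then
decides whether the page is actually recycled.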