diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst index 40adbe815bb4c1d04c67254c676c1a5874ef844d..30e71739233ff67a9bb17d1a3c9d09466695417f 100644 --- a/Documentation/arm64/cpu-feature-registers.rst +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -193,6 +193,8 @@ infrastructure: +------------------------------+---------+---------+ | Name | bits | visible | +------------------------------+---------+---------+ + | DGH | [51-48] | y | + +------------------------------+---------+---------+ | GPI | [31-28] | y | +------------------------------+---------+---------+ | GPA | [27-24] | y | diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index 6afc1d12c74d891d61c1452a0a82284ad16b5ec2..ecebe0b3ae4601ab310269873a7be1e50bb67bb3 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -206,6 +206,9 @@ HWCAP2_ECV Functionality implied by ID_AA64MMFR0_EL1.ECV == 0b0001. +HWCAP2_DGH + Functionality implied by ID_AA64ISAR1_EL1.DGH == 0b0001. + 4. Unused AT_HWCAP bits ----------------------- diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 1adbb8a371c71b401a77f2f4857d79964a4a8480..04c1e99c16248e0be51d54f8b127f553bb832b36 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1937,6 +1937,14 @@ There are some more advanced barrier functions: information on consistent memory. + (*) io_stop_wc(); + + For memory accesses with write-combining attributes (e.g. those returned + by ioremap_wc(), the CPU may wait for prior accesses to be merged with + subsequent ones. io_stop_wc() can be used to prevent the merging of + write-combining memory accesses before this macro with those after it when + such wait has performance implications. + =============================== IMPLICIT KERNEL MEMORY BARRIERS =============================== diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 0fcd854fc95f3be7759754e0ee0736d3d2062f34..37c17bcd00af1cee321213fda55af4487fc4f574 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -28,6 +28,13 @@ #define spec_bar() asm volatile(ALTERNATIVE("dsb nsh\nisb\n", \ SB_BARRIER_INSN"nop\n", \ ARM64_HAS_SB)) +/* + * Data Gathering Hint: + * This instruction prevents merging memory accesses with Normal-NC or + * Device-GRE attributes before the hint instruction with any memory accesses + * appearing after the hint instruction. + */ +#define dgh() asm volatile("hint #6" : : : "memory") #define mb() dsb(sy) #define rmb() dsb(ld) @@ -36,6 +43,8 @@ #define dma_rmb() dmb(oshld) #define dma_wmb() dmb(oshst) +#define io_stop_wc() dgh() + /* * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz * and 0 otherwise. diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a700f877f4c8854930653cc96208cc461852e018..98b9359ceff1d8563e1d7562ba48a46091d39eea 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -87,6 +87,7 @@ #define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2) #define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT) #define KERNEL_HWCAP_ECV __khwcap2_feature(ECV) +#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 49e89d5409747c47213ac6a7b112b8a656245ede..60c6affb91355fdab320ac63402527b7893d2806 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -567,6 +567,7 @@ #define ID_AA64ISAR0_TLB_RANGE 0x2 /* id_aa64isar1 */ +#define ID_AA64ISAR1_DGH_SHIFT 48 #define ID_AA64ISAR1_SB_SHIFT 36 #define ID_AA64ISAR1_FRINTTS_SHIFT 32 #define ID_AA64ISAR1_GPI_SHIFT 28 diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 365a7feb43be5d6c3e5fbb3b215b0fbc1050730a..f36f9eb8dac29e718df80a0896980692f9358eb5 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -66,5 +66,6 @@ #define HWCAP2_FLAGM2 (1 << 7) #define HWCAP2_FRINT (1 << 8) #define HWCAP2_ECV (1 << 9) +#define HWCAP2_DGH (1 << 10) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index fffdc0e74ca54664049dec09c8e1da5ca72c164f..f394047685df5a3dce83cd8ee6ecc15b96d8ee06 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -140,6 +140,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FRINTTS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), @@ -1714,6 +1715,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FRINTTS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DGH_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DGH), HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, KERNEL_HWCAP_SVE), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index f1b246b48a47ef4bf704f3ca79e7c03507a8bcda..7f3a91ba07b502931e3f883823a898155c2a09dc 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -87,6 +87,7 @@ static const char *const hwcap_str[] = { "flagm2", "frint", "ecv", + "dgh", NULL }; diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index df7c23cd33600a4e0f0eba0b626185fc383c8343..93f38aca37a51ea143cf697dae8fd148e64ebede 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -2501,7 +2501,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, if (length == 0) { /* don't need this page */ - __skb_frag_unref(frag); + __skb_frag_unref(frag, false); --skb_shinfo(skb)->nr_frags; } else { size = min(length, (unsigned) PAGE_SIZE); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index f9797e5038841ed70b944ed478b499aa1eb3133e..6ad3ca63ee2d7ecd03773a9e36c501ad354fce19 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -526,7 +526,7 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, fail: while (nr > 0) { nr--; - __skb_frag_unref(skb_shinfo(skb)->frags + nr); + __skb_frag_unref(skb_shinfo(skb)->frags + nr, false); } return 0; } diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 5a672658da80e33edc28a8491d0afe73cf14df99..5e6ed631e9fcddc41c0d340a514ca830361618c2 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1191,7 +1191,7 @@ static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id) if (id == NETSEC_RING_RX) { struct page *page = virt_to_page(desc->addr); - page_pool_put_page(dring->page_pool, page, false); + page_pool_put_full_page(dring->page_pool, page, false); } else if (id == NETSEC_RING_TX) { dma_unmap_single(priv->dev, desc->dma_addr, desc->len, DMA_TO_DEVICE); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6a3b0f76d9729a9252fa98ca276749a24f42b648..f93b9a12dfd5ee7a3b875ac06aa05a03dd204a7a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1224,11 +1224,11 @@ static void stmmac_free_rx_buffer(struct stmmac_priv *priv, u32 queue, int i) struct stmmac_rx_buffer *buf = &rx_q->buf_pool[i]; if (buf->page) - page_pool_put_page(rx_q->page_pool, buf->page, false); + page_pool_put_full_page(rx_q->page_pool, buf->page, false); buf->page = NULL; if (buf->sec_page) - page_pool_put_page(rx_q->page_pool, buf->sec_page, false); + page_pool_put_full_page(rx_q->page_pool, buf->sec_page, false); buf->sec_page = NULL; } diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 85b28eb80b11fc4c72d2b88c9fd79e71f42cbdbf..888f35d02a13d1e1d68bcb32db1131263db33d82 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -257,5 +257,16 @@ do { \ }) #endif +/* + * ioremap_wc() maps I/O memory as memory with write-combining attributes. For + * this kind of memory accesses, the CPU may wait for prior accesses to be + * merged with subsequent ones. In some situation, such wait is bad for the + * performance. io_stop_wc() can be used to prevent the merging of + * write-combining memory accesses before this macro with those after it. + */ +#ifndef io_stop_wc +#define io_stop_wc do { } while (0) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 61f2f6ff94673b0a74d2d7705d87373bfba74fe5..467545cae440fddc226f99037cb926c4691386c4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -496,6 +496,24 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); } +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array); + +/* Bulk allocate order-0 pages */ +static inline unsigned long +alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); +} + +static inline unsigned long +alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) +{ + return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); +} + /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). diff --git a/include/linux/mm.h b/include/linux/mm.h index d48010d709b8ced1b2ae3f10193863615e3bd2fc..8ff12971a0d232ed7e97c73427bad8db1af06bc9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1440,10 +1440,11 @@ struct address_space *page_mapping_file(struct page *page); static inline bool page_is_pfmemalloc(const struct page *page) { /* - * Page index cannot be this large so this must be - * a pfmemalloc page. + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. */ - return page->index == -1UL; + return (uintptr_t)page->lru.next & BIT(1); } /* @@ -1452,12 +1453,12 @@ static inline bool page_is_pfmemalloc(const struct page *page) */ static inline void set_page_pfmemalloc(struct page *page) { - page->index = -1UL; + page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { - page->index = 0; + page->lru.next = NULL; } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d71a92cfc059b9100211aec8225181bcc575993a..c2a97c4601972507882ac5af2e6b04fd9f62151e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,10 +96,25 @@ struct page { }; struct { /* page_pool used by netstack */ /** - * @dma_addr: might require a 64-bit value on - * 32-bit architectures. + * @pp_magic: magic value to avoid recycling non + * page_pool allocated pages. */ - unsigned long dma_addr[2]; + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; + unsigned long dma_addr; + union { + /** + * dma_addr_upper: might require a 64-bit + * value on 32-bit architectures. + */ + unsigned long dma_addr_upper; + /** + * For frag page support, not supported in + * 32-bit architectures with 64-bit DMA. + */ + atomic_long_t pp_frag_count; + }; }; struct { /* slab, slob and slub */ union { diff --git a/include/linux/poison.h b/include/linux/poison.h index df34330b4e347243dab6c3f7d1c3f7113b6dd6c7..000bd3a5acf653f3b9645b280e6a95630f744620 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -86,4 +86,7 @@ /********** security/ **********/ #define KEY_DESTROY 0xbd +/********** net/core/page_pool.c **********/ +#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) + #endif diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 73f573b334652b3a15ee8776aa13531d474ff20f..30438213ec5b23d3ae3358f54045dcd3501d6a1a 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -284,6 +284,18 @@ enum { SFF8024_ID_QSFP_8438 = 0x0c, SFF8024_ID_QSFP_8436_8636 = 0x0d, SFF8024_ID_QSFP28_8636 = 0x11, + SFF8024_ID_CXP2 = 0x12, + SFF8024_ID_CDFP = 0x13, + SFF8024_ID_HD4X_FANOUT = 0x14, + SFF8024_ID_HD8X_FANOUT = 0x15, + SFF8024_ID_CDFP_S3 = 0x16, + SFF8024_ID_MICRO_QSFP = 0x17, + SFF8024_ID_QSFP_DD = 0x18, + SFF8024_ID_OSFP = 0x19, + SFF8024_ID_DSFP = 0x1B, + SFF8024_ID_QSFP_PLUS_CMIS = 0x1E, + SFF8024_ID_SFP_DD_CMIS = 0x1F, + SFF8024_ID_SFP_PLUS_CMIS = 0x20, SFF8024_ENCODING_UNSPEC = 0x00, SFF8024_ENCODING_8B10B = 0x01, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 357b1abe8695fe1ae1f3d9492e02d3131a6f8cbe..baf5790e3ada425a5386209e147bd8567a5e7d92 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -37,6 +37,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif @@ -652,6 +653,8 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves + * @pp_recycle: mark the packet for recycling instead of freeing (implies + * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed @@ -760,10 +763,12 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, - pfmemalloc:1; + pfmemalloc:1, + pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif + /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ @@ -3018,12 +3023,20 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment + * @recycle: recycle the page if allocated via page_pool * - * Releases a reference on the paged fragment @frag. + * Releases a reference on the paged fragment @frag + * or recycles the page via the page_pool API. */ -static inline void __skb_frag_unref(skb_frag_t *frag) +static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - put_page(skb_frag_page(frag)); + struct page *page = skb_frag_page(frag); + +#ifdef CONFIG_PAGE_POOL + if (recycle && page_pool_return_skb_page(page)) + return; +#endif + put_page(page); } /** @@ -3035,7 +3048,7 @@ static inline void __skb_frag_unref(skb_frag_t *frag) */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { - __skb_frag_unref(&skb_shinfo(skb)->frags[f]); + __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle); } /** @@ -4604,5 +4617,19 @@ static inline bool skb_csum_is_sctp(struct sk_buff *skb) return skb->csum_not_inet; } +#ifdef CONFIG_PAGE_POOL +static inline void skb_mark_for_recycle(struct sk_buff *skb) +{ + skb->pp_recycle = 1; +} +#endif + +static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) +{ + if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) + return false; + return page_pool_return_skb_page(virt_to_page(data)); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/page_pool.h b/include/net/page_pool.h index cf086e13bd253e43c9bed7b5f57c0cbfde745d63..2b1da328370c293f4bb3ec53133f14139facfe9a 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -34,8 +34,21 @@ #include #include -#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ -#define PP_FLAG_ALL PP_FLAG_DMA_MAP +#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA + * map/unmap + */ +#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets + * from page_pool will be + * DMA-synced-for-device according to + * the length provided by the device + * driver. + * Please note DMA-sync-for-CPU is still + * device driver responsibility + */ +#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ + PP_FLAG_DMA_SYNC_DEV |\ + PP_FLAG_PAGE_FRAG) /* * Fast allocation side cache array/stack @@ -55,7 +68,7 @@ #define PP_ALLOC_CACHE_REFILL 64 struct pp_alloc_cache { u32 count; - void *cache[PP_ALLOC_CACHE_SIZE]; + struct page *cache[PP_ALLOC_CACHE_SIZE]; }; struct page_pool_params { @@ -65,6 +78,8 @@ struct page_pool_params { int nid; /* Numa node id to allocate from pages from */ struct device *dev; /* device, for DMA pre-mapping purposes */ enum dma_data_direction dma_dir; /* DMA mapping direction */ + unsigned int max_len; /* max DMA sync memory size */ + unsigned int offset; /* DMA addr offset */ }; struct page_pool { @@ -76,6 +91,9 @@ struct page_pool { unsigned long defer_warn; u32 pages_state_hold_cnt; + unsigned int frag_offset; + struct page *frag_page; + long frag_users; /* * Data structure for allocation side @@ -123,6 +141,18 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) return page_pool_alloc_pages(pool, gfp); } +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_frag(pool, offset, size, gfp); +} + /* get the stored dma direction. A driver might decide to treat this locally and * avoid the extra cache line from page_pool to determine the direction */ @@ -132,11 +162,16 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) return pool->p.dma_dir; } +bool page_pool_return_skb_page(struct page *page); + struct page_pool *page_pool_create(const struct page_pool_params *params); #ifdef CONFIG_PAGE_POOL void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); +void page_pool_release_page(struct page_pool *pool, struct page *page); +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -146,56 +181,81 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) { } +static inline void page_pool_release_page(struct page_pool *pool, + struct page *page) +{ +} + +static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ +} #endif -/* Never call this directly, use helpers below */ -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct); +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct); -static inline void page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct) +/* Same as above but will try to sync the entire area pool->max_len */ +static inline void page_pool_put_full_page(struct page_pool *pool, + struct page *page, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, allow_direct); + page_pool_put_page(pool, page, -1, allow_direct); #endif } -/* Very limited use-cases allow recycle direct */ + +/* Same as above but the caller must guarantee safe context. e.g NAPI */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { - __page_pool_put_page(pool, page, true); + page_pool_put_full_page(pool, page, true); } -/* Disconnects a page (from a page_pool). API users can have a need - * to disconnect a page (from a page_pool), to allow it to be used as - * a regular page (that will eventually be returned to the normal - * page-allocator via put_page). - */ -void page_pool_unmap_page(struct page_pool *pool, struct page *page); -static inline void page_pool_release_page(struct page_pool *pool, - struct page *page) -{ -#ifdef CONFIG_PAGE_POOL - page_pool_unmap_page(pool, page); -#endif -} +#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ + (sizeof(dma_addr_t) > sizeof(unsigned long)) static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - dma_addr_t ret = page->dma_addr[0]; - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16; + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; + return ret; } static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { - page->dma_addr[0] = addr; - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - page->dma_addr[1] = upper_32_bits(addr); + page->dma_addr = addr; + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + page->dma_addr_upper = upper_32_bits(addr); +} + +static inline void page_pool_set_frag_count(struct page *page, long nr) +{ + atomic_long_set(&page->pp_frag_count, nr); +} + +static inline long page_pool_atomic_sub_frag_count_return(struct page *page, + long nr) +{ + long ret; + + /* As suggested by Alexander, atomic_long_read() may cover up the + * reference count errors, so avoid calling atomic_long_read() in + * the cases of freeing or draining the page_frags, where we would + * not expect it to match or that are slowpath anyway. + */ + if (__builtin_constant_p(nr) && + atomic_long_read(&page->pp_frag_count) == nr) + return 0; + + ret = atomic_long_sub_return(nr, &page->pp_frag_count); + WARN_ON(ret < 0); + return ret; } static inline bool is_page_pool_compiled_in(void) @@ -212,4 +272,30 @@ static inline bool page_pool_put(struct page_pool *pool) return refcount_dec_and_test(&pool->user_cnt); } +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid); +static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) +{ + if (unlikely(pool->p.nid != new_nid)) + page_pool_update_nid(pool, new_nid); +} + +static inline void page_pool_ring_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); +} + +static inline void page_pool_ring_unlock(struct page_pool *pool) + __releases(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index 47b5ee880aa9fe2c1da42bb9a664570bed455596..b58b6a3a3e57996310ccc55814f3af53c09ec96e 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -81,6 +81,28 @@ TRACE_EVENT(page_pool_state_hold, __entry->pool, __entry->page, __entry->hold) ); +TRACE_EVENT(page_pool_update_nid, + + TP_PROTO(const struct page_pool *pool, int new_nid), + + TP_ARGS(pool, new_nid), + + TP_STRUCT__entry( + __field(const struct page_pool *, pool) + __field(int, pool_nid) + __field(int, new_nid) + ), + + TP_fast_assign( + __entry->pool = pool; + __entry->pool_nid = pool->p.nid; + __entry->new_nid = new_nid; + ), + + TP_printk("page_pool=%p pool_nid=%d new_nid=%d", + __entry->pool, __entry->pool_nid, __entry->new_nid) +); + #endif /* _TRACE_PAGE_POOL_H */ /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf0f620d0e637cff1686e485533b3ca0829ad5d3..5bb102f17a5a36b6cf864ac6976a006ecca1de56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2759,7 +2759,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { - int i, alloced = 0; + int i, allocated = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2782,7 +2782,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->lru, list); - alloced++; + allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -2791,12 +2791,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * i pages were removed from the buddy list even if some leak due * to check_pcp_refill failing so adjust NR_FREE_PAGES based - * on i. Do not confuse with 'alloced' which is the number of + * on i. Do not confuse with 'allocated' which is the number of * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return alloced; + return allocated; } #ifdef CONFIG_NUMA @@ -3245,7 +3245,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) } /* Remove page from the per-cpu list, caller must protect the list */ -static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +static inline +struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -4899,12 +4900,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) *alloc_flags |= ALLOC_CMA; - return true; -} - -/* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) -{ /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4915,10 +4910,162 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); + + return true; } extern unsigned int vm_memcg_latency_histogram; +/* + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * @gfp: GFP flags for the allocation + * @preferred_nid: The preferred NUMA node ID to allocate from + * @nodemask: Set of nodes to allocate from, may be NULL + * @nr_pages: The number of pages desired on the list or array + * @page_list: Optional list to store the allocated pages + * @page_array: Optional array to store the pages + * + * This is a batched version of the page allocator that attempts to + * allocate nr_pages quickly. Pages are added to page_list if page_list + * is not NULL, otherwise it is assumed that the page_array is valid. + * + * For lists, nr_pages is the number of pages that should be allocated. + * + * For arrays, only NULL elements are populated with pages and nr_pages + * is the maximum number of pages that will be stored in the array. + * + * Returns the number of pages on the list or array. + */ +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array) +{ + struct page *page; + unsigned long flags; + struct zone *zone; + struct zoneref *z; + struct per_cpu_pages *pcp; + struct list_head *pcp_list; + struct alloc_context ac; + gfp_t alloc_gfp; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + int nr_populated = 0; + + if (unlikely(nr_pages <= 0)) + return 0; + + /* + * Skip populated array elements to determine if any pages need + * to be allocated before disabling IRQs. + */ + while (page_array && nr_populated < nr_pages && page_array[nr_populated]) + nr_populated++; + + /* Already populated array? */ + if (unlikely(page_array && nr_pages - nr_populated == 0)) + return nr_populated; + + /* Use the single page allocator for one page. */ + if (nr_pages - nr_populated == 1) + goto failed; + + /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ + gfp &= gfp_allowed_mask; + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + return 0; + gfp = alloc_gfp; + + /* Find an allowed local zone that meets the low watermark. */ + for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.high_zoneidx, ac.nodemask) { + unsigned long mark; + + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp)) { + continue; + } + + if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && + zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + goto failed; + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + if (zone_watermark_fast(zone, 0, mark, + zonelist_zone_idx(ac.preferred_zoneref), + alloc_flags, gfp)) { + break; + } + } + + /* + * If there are no allowed local zones that meets the watermarks then + * try to allocate a single page and reclaim if necessary. + */ + if (unlikely(!zone)) + goto failed; + + /* Attempt the batch allocation */ + local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp_list = &pcp->lists[ac.migratetype]; + + while (nr_populated < nr_pages) { + + /* Skip existing pages */ + if (page_array && page_array[nr_populated]) { + nr_populated++; + continue; + } + + page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + pcp, pcp_list); + if (unlikely(!page)) { + /* Try and get at least one page */ + if (!nr_populated) + goto failed_irq; + break; + } + + /* + * Ideally this would be batched but the best way to do + * that cheaply is to first convert zone_statistics to + * be inaccurate per-cpu counter like vm_events to avoid + * a RMW cycle then do the accounting with IRQs enabled. + */ + __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); + zone_statistics(ac.preferred_zoneref->zone, zone); + + prep_new_page(page, 0, gfp, 0); + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + local_irq_restore(flags); + + return nr_populated; + +failed_irq: + local_irq_restore(flags); + +failed: + page = __alloc_pages_nodemask(gfp, 0, preferred_nid, nodemask); + if (page) { + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + return nr_populated; +} +EXPORT_SYMBOL_GPL(__alloc_pages_bulk); + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -4968,8 +5115,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, rcu_read_unlock(); #endif - finalise_ac(gfp_mask, &ac); - /* * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 335f68eaaa05c9f14d8377cc940f38f7b48b14ff..381705fed4db9bd63d78980afaabb795d2f06ba9 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,16 +11,21 @@ #include #include +#include + #include #include #include #include /* for __put_page() */ +#include #include #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) +#define BIAS_MAX LONG_MAX + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -47,6 +52,25 @@ static int page_pool_init(struct page_pool *pool, (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + /* In order to request DMA-sync-for-device the page + * needs to be mapped + */ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + + if (!pool->p.max_len) + return -EINVAL; + + /* pool->p.offset has to be set according to the address + * offset used by the DMA engine to start copying rx data + */ + } + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && + pool->p.flags & PP_FLAG_PAGE_FRAG) + return -EINVAL; + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -81,69 +105,89 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); -/* fast path */ -static struct page *__page_pool_get_cached(struct page_pool *pool) +static void page_pool_return_page(struct page_pool *pool, struct page *page); + +noinline +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; - bool refill = false; struct page *page; - - /* Test for safe-context, caller should provide this guarantee */ - if (likely(in_serving_softirq())) { - if (likely(pool->alloc.count)) { - /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; - return page; - } - refill = true; - } + int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if (__ptr_ring_empty(r)) return NULL; - /* Slow-path: Get page from locked ring queue, - * refill alloc array if requested. + /* Softirq guarantee CPU and thus NUMA node is stable. This, + * assumes CPU refilling driver RX-ring will also run RX-NAPI. */ +#ifdef CONFIG_NUMA + pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; +#else + /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ + pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ +#endif + + /* Slower-path: Get pages from locked ring queue */ spin_lock(&r->consumer_lock); - page = __ptr_ring_consume(r); - if (refill) - pool->alloc.count = __ptr_ring_consume_batched(r, - pool->alloc.cache, - PP_ALLOC_CACHE_REFILL); + + /* Refill alloc array, but only if NUMA match */ + do { + page = __ptr_ring_consume(r); + if (unlikely(!page)) + break; + + if (likely(page_to_nid(page) == pref_nid)) { + pool->alloc.cache[pool->alloc.count++] = page; + } else { + /* NUMA mismatch; + * (1) release 1 page to page-allocator and + * (2) break out to fallthrough to alloc_pages_node. + * This limit stress on page buddy alloactor. + */ + page_pool_return_page(pool, page); + page = NULL; + break; + } + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); + + /* Return last page */ + if (likely(pool->alloc.count > 0)) + page = pool->alloc.cache[--pool->alloc.count]; + spin_unlock(&r->consumer_lock); return page; } -/* slow path */ -noinline -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, - gfp_t _gfp) +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) { struct page *page; - gfp_t gfp = _gfp; - dma_addr_t dma; - /* We could always set __GFP_COMP, and avoid this branch, as - * prep_new_page() can handle order-0 with __GFP_COMP. - */ - if (pool->p.order) - gfp |= __GFP_COMP; + /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + } else { + page = page_pool_refill_alloc_cache(pool); + } - /* FUTURE development: - * - * Current slow-path essentially falls back to single page - * allocations, which doesn't improve performance. This code - * need bulk allocation support from the page allocator code. - */ + return page; +} - /* Cache was empty, do real allocation */ - page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); - if (!page) - return NULL; +static void page_pool_dma_sync_for_device(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size) +{ + dma_sync_size = min(dma_sync_size, pool->p.max_len); + dma_sync_single_range_for_device(pool->p.dev, page->dma_addr, + pool->p.offset, dma_sync_size, + pool->p.dma_dir); +} - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) - goto skip_dma_map; +static bool page_pool_dma_map(struct page_pool *pool, struct page *page) +{ + dma_addr_t dma; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -153,17 +197,104 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, dma = dma_map_page_attrs(pool->p.dev, page, 0, (PAGE_SIZE << pool->p.order), pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); - if (dma_mapping_error(pool->p.dev, dma)) { + if (dma_mapping_error(pool->p.dev, dma)) + return false; + + page->dma_addr = dma; + + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + + return true; +} + +static void page_pool_set_pp_info(struct page_pool *pool, + struct page *page) +{ + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; +} + +static void page_pool_clear_pp_info(struct page *page) +{ + page->pp_magic = 0; + page->pp = NULL; +} + +static struct page *__page_pool_alloc_page_order(struct page_pool *pool, + gfp_t gfp) +{ + struct page *page; + + gfp |= __GFP_COMP; + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (unlikely(!page)) + return NULL; + + if ((pool->p.flags & PP_FLAG_DMA_MAP) && + unlikely(!page_pool_dma_map(pool, page))) { put_page(page); return NULL; } - page_pool_set_dma_addr(page, dma); -skip_dma_map: + page_pool_set_pp_info(pool, page); + /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t gfp) +{ + const int bulk = PP_ALLOC_CACHE_REFILL; + unsigned int pp_flags = pool->p.flags; + unsigned int pp_order = pool->p.order; + struct page *page; + int i, nr_pages; + + /* Don't support bulk alloc for high-order pages */ + if (unlikely(pp_order)) + return __page_pool_alloc_page_order(pool, gfp); + + /* Unnecessary as alloc cache is empty, but guarantees zero count */ + if (unlikely(pool->alloc.count > 0)) + return pool->alloc.cache[--pool->alloc.count]; + + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); + + nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache); + if (unlikely(!nr_pages)) + return NULL; + + /* Pages have been filled into alloc.cache array, but count is zero and + * page element have not been (possibly) DMA mapped. + */ + for (i = 0; i < nr_pages; i++) { + page = pool->alloc.cache[i]; + if ((pp_flags & PP_FLAG_DMA_MAP) && + unlikely(!page_pool_dma_map(pool, page))) { + put_page(page); + continue; + } + + page_pool_set_pp_info(pool, page); + pool->alloc.cache[pool->alloc.count++] = page; + /* Track how many pages are held 'in-flight' */ + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, page, + pool->pages_state_hold_cnt); + } + + /* Return last page */ + if (likely(pool->alloc.count > 0)) + page = pool->alloc.cache[--pool->alloc.count]; + else + page = NULL; /* When page just alloc'ed is should/must have refcnt 1. */ return page; @@ -206,44 +337,44 @@ static s32 page_pool_inflight(struct page_pool *pool) return inflight; } -/* Cleanup page_pool state from page */ -static void __page_pool_clean_page(struct page_pool *pool, - struct page *page) +/* Disconnects a page (from a page_pool). API users can have a need + * to disconnect a page (from a page_pool), to allow it to be used as + * a regular page (that will eventually be returned to the normal + * page-allocator via put_page). + */ +void page_pool_release_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + /* Always account for inflight pages, even if we didn't + * map them + */ goto skip_dma_unmap; dma = page_pool_get_dma_addr(page); - /* DMA unmap */ + + /* When page is unmapped, it cannot be returned our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: + page_pool_clear_pp_info(page); + /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ - count = atomic_inc_return(&pool->pages_state_release_cnt); + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); } - -/* unmap the page and clean our state */ -void page_pool_unmap_page(struct page_pool *pool, struct page *page) -{ - /* When page is unmapped, this implies page will not be - * returned to page_pool. - */ - __page_pool_clean_page(pool, page); -} -EXPORT_SYMBOL(page_pool_unmap_page); +EXPORT_SYMBOL(page_pool_release_page); /* Return a page to the page allocator, cleaning up our state */ -static void __page_pool_return_page(struct page_pool *pool, struct page *page) +static void page_pool_return_page(struct page_pool *pool, struct page *page) { - __page_pool_clean_page(pool, page); + page_pool_release_page(pool, page); put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) @@ -252,8 +383,7 @@ static void __page_pool_return_page(struct page_pool *pool, struct page *page) */ } -static bool __page_pool_recycle_into_ring(struct page_pool *pool, - struct page *page) +static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) { int ret; /* BH protection not needed if current is serving softirq */ @@ -270,7 +400,7 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool, * * Caller must provide appropriate safe context. */ -static bool __page_pool_recycle_direct(struct page *page, +static bool page_pool_recycle_in_cache(struct page *page, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) @@ -281,27 +411,43 @@ static bool __page_pool_recycle_direct(struct page *page, return true; } -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct) +/* If the page refcnt == 1, this will try to recycle the page. + * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * the configured size min(dma_sync_size, pool->max_len). + * If the page refcnt != 1, then the page will be returned to memory + * subsystem. + */ +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { + /* It is not the last user for the page frag case */ + if (pool->p.flags & PP_FLAG_PAGE_FRAG && + page_pool_atomic_sub_frag_count_return(page, 1)) + return NULL; + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. + * + * page is NOT reusable when allocated when system is under + * some pressure. (page_is_pfmemalloc) */ - if (likely(page_ref_count(page) == 1)) { + if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) { /* Read barrier done in page_ref_count / READ_ONCE */ - if (allow_direct && in_serving_softirq()) - if (__page_pool_recycle_direct(page, pool)) - return; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, + dma_sync_size); - if (!__page_pool_recycle_into_ring(pool, page)) { - /* Cache full, fallback to free pages */ - __page_pool_return_page(pool, page); - } - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; + + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. * @@ -316,12 +462,141 @@ void __page_pool_put_page(struct page_pool *pool, * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. */ - __page_pool_clean_page(pool, page); + /* Do not replace this with page_pool_return_page() */ + page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } -EXPORT_SYMBOL(__page_pool_put_page); +EXPORT_SYMBOL(page_pool_put_page); -static void __page_pool_empty_ring(struct page_pool *pool) +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + +static struct page *page_pool_drain_frag(struct page_pool *pool, + struct page *page) +{ + long drain_count = BIAS_MAX - pool->frag_users; + + /* Some user is still using the page frag */ + if (likely(page_pool_atomic_sub_frag_count_return(page, + drain_count))) + return NULL; + + if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, -1); + + return page; + } + + page_pool_return_page(pool, page); + return NULL; +} + +static void page_pool_free_frag(struct page_pool *pool) +{ + long drain_count = BIAS_MAX - pool->frag_users; + struct page *page = pool->frag_page; + + pool->frag_page = NULL; + + if (!page || + page_pool_atomic_sub_frag_count_return(page, drain_count)) + return; + + page_pool_return_page(pool, page); +} + +struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + struct page *page = pool->frag_page; + + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || + size > max_size)) + return NULL; + + size = ALIGN(size, dma_get_cache_alignment()); + *offset = pool->frag_offset; + + if (page && *offset + size > max_size) { + page = page_pool_drain_frag(pool, page); + if (page) + goto frag_reset; + } + + if (!page) { + page = page_pool_alloc_pages(pool, gfp); + if (unlikely(!page)) { + pool->frag_page = NULL; + return NULL; + } + + pool->frag_page = page; + +frag_reset: + pool->frag_users = 1; + *offset = 0; + pool->frag_offset = size; + page_pool_set_frag_count(page, BIAS_MAX); + return page; + } + + pool->frag_users++; + pool->frag_offset = *offset + size; + return page; +} +EXPORT_SYMBOL(page_pool_alloc_frag); + +static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; @@ -332,7 +607,7 @@ static void __page_pool_empty_ring(struct page_pool *pool) pr_crit("%s() page_pool refcnt %d violation\n", __func__, page_ref_count(page)); - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } } @@ -359,13 +634,13 @@ static void page_pool_scrub(struct page_pool *pool) */ while (pool->alloc.count) { page = pool->alloc.cache[--pool->alloc.count]; - __page_pool_return_page(pool, page); + page_pool_return_page(pool, page); } /* No more consumers should exist, but producers could still * be in-flight. */ - __page_pool_empty_ring(pool); + page_pool_empty_ring(pool); } static int page_pool_release(struct page_pool *pool) @@ -417,6 +692,8 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return; + page_pool_free_frag(pool); + if (!page_pool_release(pool)) return; @@ -427,3 +704,48 @@ void page_pool_destroy(struct page_pool *pool) schedule_delayed_work(&pool->release_dw, DEFER_TIME); } EXPORT_SYMBOL(page_pool_destroy); + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid) +{ + struct page *page; + + trace_page_pool_update_nid(pool, new_nid); + pool->p.nid = new_nid; + + /* Flush pool alloc cache, as refill will check NUMA node */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_page(pool, page); + } +} +EXPORT_SYMBOL(page_pool_update_nid); + +bool page_pool_return_skb_page(struct page *page) +{ + struct page_pool *pp; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page_pool_put_full_page(pp, page, false); + + return true; +} +EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 80ae7756ffdb89c612cf0e7bb44f34070ba89afe..2d4899325cba1ea87c989282c2e229c1ce1c60dc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -603,10 +604,13 @@ static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; - if (skb->head_frag) + if (skb->head_frag) { + if (skb_pp_recycle(skb, head)) + return; skb_free_frag(head); - else + } else { kfree(head); + } } static void skb_release_data(struct sk_buff *skb) @@ -617,16 +621,28 @@ static void skb_release_data(struct sk_buff *skb) if (skb->cloned && atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, &shinfo->dataref)) - return; + goto exit; + + skb_zcopy_clear(skb, true); for (i = 0; i < shinfo->nr_frags; i++) - __skb_frag_unref(&shinfo->frags[i]); + __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); - skb_zcopy_clear(skb, true); skb_free_head(skb); +exit: + /* When we clone an SKB we copy the reycling bit. The pp_recycle + * bit is only set on the head though, so in order to avoid races + * while trying to recycle fragments on __skb_frag_unref() we need + * to make one SKB responsible for triggering the recycle path. + * So disable the recycling bit if an SKB is cloned and we have + * additional references to to the fragmented part of the SKB. + * Eventually the last SKB will have the recycling bit set and it's + * dataref set to 0, which will trigger the recycling + */ + skb->pp_recycle = 0; } /* @@ -1020,6 +1036,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->nohdr = 0; n->peeked = 0; C(pfmemalloc); + C(pp_recycle); n->destructor = NULL; C(tail); C(end); @@ -3437,7 +3454,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragto = &skb_shinfo(tgt)->frags[merge]; skb_frag_size_add(fragto, skb_frag_size(fragfrom)); - __skb_frag_unref(fragfrom); + __skb_frag_unref(fragfrom, skb->pp_recycle); } /* Reposition in the original skb */ @@ -5092,6 +5109,13 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; + /* The page pool signature of struct page will eventually figure out + * which pages can be recycled or not but for now let's prohibit slab + * allocated and page_pool allocated SKBs from being coalesced. + */ + if (to->pp_recycle != from->pp_recycle) + return false; + if (len <= skb_tailroom(to)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); diff --git a/net/core/xdp.c b/net/core/xdp.c index b3f463c6543fe1ab92a43f8c8ac71d31fc9d3a41..0e6490d8f7406ca8bbc67f9bb6d9bb2894526f06 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -377,7 +377,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); napi_direct &= !xdp_return_frame_no_direct(); - page_pool_put_page(xa->page_pool, page, napi_direct); + page_pool_put_full_page(xa->page_pool, page, napi_direct); rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 2c3cf47d730bb4b44b2fe77e9455710159ba5e9a..4b9c3c1460f522c0743a32081f6c8599c0e677ee 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -128,7 +128,7 @@ static void destroy_record(struct tls_record_info *record) int i; for (i = 0; i < record->num_frags; i++) - __skb_frag_unref(&record->frags[i]); + __skb_frag_unref(&record->frags[i], false); kfree(record); }