diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5db8c263b0c67c18ed60e30b3958912747f44132
--- /dev/null
+++ b/Documentation/networking/page_pool.rst
@@ -0,0 +1,223 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Page Pool API
+=============
+
+The page_pool allocator is optimized for the XDP mode that uses one frame
+per page, but it can fall back to the regular page allocator APIs.
+
+Basic use involves replacing alloc_pages() calls with page_pool_alloc_pages()
+and dev_alloc_pages() calls with page_pool_dev_alloc_pages().
+
+The API keeps track of in-flight pages in order to let API users know when it
+is safe to free a page_pool object.  Thus, API users must call
+page_pool_release_page() when a page leaves the page_pool, or call
+page_pool_put_page() where appropriate, in order to maintain correct
+accounting.
+
+API users must call page_pool_put_page() exactly once per page: it will
+either recycle the page or, if the refcnt is > 1, release the DMA mapping
+and the in-flight state accounting.
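+
+A minimal sketch of that substitution in an Rx path follows; ``rxq`` and its
+``page_pool`` member are illustrative driver state, not part of the API:
+
+.. code-block:: c
+
+    /* was: page = dev_alloc_pages(0); ... put_page(page); */
+    struct page *page;
+
+    page = page_pool_dev_alloc_pages(rxq->page_pool);
+    if (!page)
+        return -ENOMEM;
+
+    /* ... hand the page to the device, later build an skb from it ... */
+
+    /* exactly one put per page: recycles it or releases the DMA mapping */
+    page_pool_put_full_page(rxq->page_pool, page, false);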
+
+Architecture overview
+=====================
+
+.. code-block:: none
+
+    +------------------+
+    |      Driver      |
+    +------------------+
+             ^
+             |
+             |
+             |
+             v
+    +--------------------------------------------+
+    |               request memory               |
+    +--------------------------------------------+
+        ^                                 ^
+        |                                 |
+        | Pool empty                      | Pool has entries
+        |                                 |
+        v                                 v
+    +-----------------------+     +------------------------+
+    | alloc (and map) pages |     |  get page from cache   |
+    +-----------------------+     +------------------------+
+                                    ^                    ^
+                                    |                    |
+                                    | cache available    | No entries, refill
+                                    |                    | from ptr-ring
+                                    |                    |
+                                    v                    v
+                          +-----------------+     +------------------+
+                          |   Fast cache    |     |  ptr-ring cache  |
+                          +-----------------+     +------------------+
+
+API interface
+=============
+The number of pools created **must** match the number of hardware queues
+unless hardware restrictions make that impossible.  Anything else would
+defeat the purpose of page pool, which is to allocate pages quickly from the
+cache without locking.  This lockless guarantee naturally comes from running
+under a NAPI softirq.  The protection doesn't strictly have to be NAPI; any
+guarantee that allocating a page will cause no race conditions is enough.
+
+* page_pool_create(): Create a pool.
+    * flags:      PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV
+    * order:      2^order pages on allocation
+    * pool_size:  size of the ptr_ring
+    * nid:        preferred NUMA node for allocation
+    * dev:        struct device. Used on DMA operations
+    * dma_dir:    DMA direction
+    * max_len:    max DMA sync memory size
+    * offset:     DMA address offset
+
+* page_pool_put_page(): The outcome of this depends on the page refcnt. If the
+  driver bumps the refcnt > 1 this will unmap the page. If the page refcnt is
+  1 the allocator owns the page and will try to recycle it in one of the pool
+  caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device
+  using dma_sync_single_range_for_device().
+
+* page_pool_put_full_page(): Similar to page_pool_put_page(), but will DMA
+  sync for the entire memory area configured in pool->max_len.
+
+* page_pool_recycle_direct(): Similar to page_pool_put_full_page() but caller
+  must guarantee safe context (e.g. NAPI), since it will recycle the page
+  directly into the pool fast cache.
+
+* page_pool_release_page(): Unmap the page (if mapped) and account for it on
+  in-flight counters.
+
+* page_pool_dev_alloc_pages(): Get a page from the page allocator or page_pool
+  caches.
+
+* page_pool_get_dma_addr(): Retrieve the stored DMA address.
+
+* page_pool_get_dma_dir(): Retrieve the stored DMA direction.
+
+* page_pool_put_page_bulk(): Tries to refill a number of pages into the
+  ptr_ring cache while holding the ptr_ring producer lock. If the ptr_ring is
+  full, page_pool_put_page_bulk() will release leftover pages to the page
+  allocator. page_pool_put_page_bulk() is suitable to be run inside the driver
+  NAPI tx completion loop for the XDP_REDIRECT use case (a bulk-return sketch
+  follows this list).
+  Please note the caller must not use the data array after running
+  page_pool_put_page_bulk(), as this function overwrites it.
+
+* page_pool_get_stats(): Retrieve statistics about the page_pool. This API
+  is only available if the kernel has been configured with
+  ``CONFIG_PAGE_POOL_STATS=y``. A pointer to a caller-allocated ``struct
+  page_pool_stats`` structure is passed to this API, which fills it in. The
+  caller can then report those stats to the user (perhaps via ethtool,
+  debugfs, etc.). See below for an example usage of this API.
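+
+A minimal bulk-return sketch for an XDP_REDIRECT Tx completion path follows;
+``BULK_SIZE``, ``tx_ring`` and tx_descriptor_is_done() are illustrative
+driver-private names, not part of the page_pool API:
+
+.. code-block:: c
+
+    /* XDP_REDIRECT Tx completion (sketch) */
+    void *pages[BULK_SIZE];
+    int nr = 0;
+
+    while (nr < BULK_SIZE && tx_descriptor_is_done(tx_ring)) {
+        /* the driver stored the struct page pointer at xmit time */
+        pages[nr++] = tx_ring->pages[tx_ring->next_to_clean++];
+    }
+
+    /* refills the pool's ptr_ring under the producer lock; leftover pages
+     * go back to the page allocator and the contents of pages[] must not
+     * be used afterwards
+     */
+    page_pool_put_page_bulk(page_pool, pages, nr);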
+
+Stats API and structures
+------------------------
+If the kernel is configured with ``CONFIG_PAGE_POOL_STATS=y``, the API
+``page_pool_get_stats()`` and structures described below are available. It
+takes a pointer to a ``struct page_pool`` and a pointer to a ``struct
+page_pool_stats`` allocated by the caller.
+
+The API will fill in the provided ``struct page_pool_stats`` with
+statistics about the page_pool.
+
+The stats structure has the following fields::
+
+    struct page_pool_stats {
+        struct page_pool_alloc_stats alloc_stats;
+        struct page_pool_recycle_stats recycle_stats;
+    };
+
+
+The ``struct page_pool_alloc_stats`` has the following fields:
+  * ``fast``: successful fast path allocations
+  * ``slow``: slow path order-0 allocations
+  * ``slow_high_order``: slow path high order allocations
+  * ``empty``: ptr ring is empty, so a slow path allocation was forced.
+  * ``refill``: an allocation which triggered a refill of the cache
+  * ``waive``: pages obtained from the ptr ring that cannot be added to
+    the cache due to a NUMA mismatch.
+
+The ``struct page_pool_recycle_stats`` has the following fields:
+  * ``cached``: recycling placed page in the page pool cache
+  * ``cache_full``: page pool cache was full
+  * ``ring``: page placed into the ptr ring
+  * ``ring_full``: page released from page pool because the ptr ring was full
+  * ``released_refcnt``: page released (and not recycled) because refcnt > 1
+
+Coding examples
+===============
+
+Registration
+------------
+
+.. code-block:: c
+
+    /* Page pool registration */
+    struct page_pool_params pp_params = { 0 };
+    struct xdp_rxq_info xdp_rxq;
+    int err;
+
+    pp_params.order = 0;
+    /* internal DMA mapping in page_pool */
+    pp_params.flags = PP_FLAG_DMA_MAP;
+    pp_params.pool_size = DESC_NUM;
+    pp_params.nid = NUMA_NO_NODE;
+    pp_params.dev = priv->dev;
+    pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
+    page_pool = page_pool_create(&pp_params);
+    if (IS_ERR(page_pool)) {
+        err = PTR_ERR(page_pool);
+        goto err_out;
+    }
+
+    err = xdp_rxq_info_reg(&xdp_rxq, ndev, 0);
+    if (err)
+        goto err_out;
+
+    err = xdp_rxq_info_reg_mem_model(&xdp_rxq, MEM_TYPE_PAGE_POOL, page_pool);
+    if (err)
+        goto err_out;
+
+NAPI poller
+-----------
+
+.. code-block:: c
+
+    /* NAPI Rx poller */
+    enum dma_data_direction dma_dir;
+
+    dma_dir = page_pool_get_dma_dir(dring->page_pool);
+    while (done < budget) {
+        if (some error)
+            page_pool_recycle_direct(page_pool, page);
+        if (packet_is_xdp) {
+            if XDP_DROP:
+                page_pool_recycle_direct(page_pool, page);
+        } else if (packet_is_skb) {
+            page_pool_release_page(page_pool, page);
+            new_page = page_pool_dev_alloc_pages(page_pool);
+        }
+    }
+
+Stats
+-----
+
+.. code-block:: c
+
+    #ifdef CONFIG_PAGE_POOL_STATS
+    /* retrieve stats */
+    struct page_pool_stats stats = { 0 };
+    if (page_pool_get_stats(page_pool, &stats)) {
+        /* perhaps the driver reports statistics with ethtool */
+        ethtool_print_allocation_stats(&stats.alloc_stats);
+        ethtool_print_recycle_stats(&stats.recycle_stats);
+    }
+    #endif
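+
+Stats and ethtool
+-----------------
+
+A sketch of wiring the ``page_pool_ethtool_stats_get*()`` helpers into a
+driver's ethtool callbacks; ``mydrv_priv``, its ``rxq`` array and the
+callback names are illustrative driver code, not part of the page_pool API.
+Because page_pool_get_stats() accumulates into the caller's structure, a
+single ``struct page_pool_stats`` can collect the counters of every RX
+queue's pool before reporting:
+
+.. code-block:: c
+
+    #ifdef CONFIG_PAGE_POOL_STATS
+    int mydrv_get_sset_count(struct net_device *dev, int sset)
+    {
+        if (sset == ETH_SS_STATS)
+            return page_pool_ethtool_stats_get_count();
+        return 0;
+    }
+
+    void mydrv_get_strings(struct net_device *dev, u32 sset, u8 *data)
+    {
+        if (sset == ETH_SS_STATS)
+            data = page_pool_ethtool_stats_get_strings(data);
+    }
+
+    void mydrv_get_ethtool_stats(struct net_device *dev,
+                                 struct ethtool_stats *stats, u64 *data)
+    {
+        struct mydrv_priv *priv = netdev_priv(dev);
+        struct page_pool_stats pp_stats = { 0 };
+        int i;
+
+        /* accumulate the counters of all RX queue pools */
+        for (i = 0; i < priv->num_rx_queues; i++)
+            page_pool_get_stats(priv->rxq[i].page_pool, &pp_stats);
+
+        data = page_pool_ethtool_stats_get(data, &pp_stats);
+    }
+    #endif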
+
+Driver unload
+-------------
+
+.. code-block:: c
+
+    /* Driver unload */
+    page_pool_put_full_page(page_pool, page, false);
+    xdp_rxq_info_unreg(&xdp_rxq);
diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config
index 5d120d0c8b213b1ebcc0e76ee8b0a44d01806a58..4c00e34e77c3a4f8b57df0f328744cac0d807ebd 100644
--- a/arch/arm64/configs/tencent.config
+++ b/arch/arm64/configs/tencent.config
@@ -1455,3 +1455,4 @@ CONFIG_ASYNC_RAID6_TEST=m
 CONFIG_TEST_KSTRTOX=y
 CONFIG_TEST_BPF=m
 CONFIG_BUG_ON_DATA_CORRUPTION=y
+CONFIG_PAGE_POOL_STATS=y
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 467545cae440fddc226f99037cb926c4691386c4..4564ebb291855548fe523473fd3a61c4f6ab168a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -514,6 +514,15 @@ alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_arr
 	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
 }
 
+static inline unsigned long
+alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array)
+{
+	if (nid == NUMA_NO_NODE)
+		nid = numa_mem_id();
+
+	return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
  * online. For more general interface, see alloc_pages_node().
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 2b1da328370c293f4bb3ec53133f14139facfe9a..c93d6271207d635ad4e7d1ee0add18994e24c01c 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -82,6 +82,69 @@ struct page_pool_params {
 	unsigned int	offset;  /* DMA addr offset */
 };
 
+#ifdef CONFIG_PAGE_POOL_STATS
+struct page_pool_alloc_stats {
+	u64 fast; /* fast path allocations */
+	u64 slow; /* slow-path order 0 allocations */
+	u64 slow_high_order; /* slow-path high order allocations */
+	u64 empty; /* failed refills due to empty ptr ring, forcing
+		    * slow path allocation
+		    */
+	u64 refill; /* allocations via successful refill */
+	u64 waive; /* failed refills due to numa zone mismatch */
+};
+
+struct page_pool_recycle_stats {
+	u64 cached; /* recycling placed page in the cache. */
+	u64 cache_full; /* cache was full */
+	u64 ring; /* recycling placed page back into ptr ring */
+	u64 ring_full; /* page was released from page-pool because
+			* PTR ring was full
+			*/
+	u64 released_refcnt; /* page released because of elevated
+			      * refcnt
+			      */
+};
+
+/* This struct wraps the above stats structs so users of the
+ * page_pool_get_stats API can pass a single argument when requesting the
+ * stats for the page pool.
+ */
+struct page_pool_stats {
+	struct page_pool_alloc_stats alloc_stats;
+	struct page_pool_recycle_stats recycle_stats;
+};
+
+int page_pool_ethtool_stats_get_count(void);
+u8 *page_pool_ethtool_stats_get_strings(u8 *data);
+u64 *page_pool_ethtool_stats_get(u64 *data, void *stats);
+
+/*
+ * Drivers that wish to harvest page pool stats and report them to users
+ * (perhaps via ethtool, debugfs, or another mechanism) can allocate a
+ * struct page_pool_stats and call page_pool_get_stats() to get stats for
+ * the specified pool.
+ */
+bool page_pool_get_stats(struct page_pool *pool,
+			 struct page_pool_stats *stats);
+#else
+
+static inline int page_pool_ethtool_stats_get_count(void)
+{
+	return 0;
+}
+
+static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+	return data;
+}
+
+static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+{
+	return data;
+}
+
+#endif
+
 struct page_pool {
 	struct page_pool_params p;
 
@@ -95,6 +158,11 @@ struct page_pool {
 	struct page *frag_page;
 	long frag_users;
 
+#ifdef CONFIG_PAGE_POOL_STATS
+	/* these stats are incremented while in softirq context */
+	struct page_pool_alloc_stats alloc_stats;
+#endif
+
 	/*
 	 * Data structure for allocation side
 	 *
@@ -123,6 +191,10 @@ struct page_pool {
 	 */
 	struct ptr_ring ring;
 
+#ifdef CONFIG_PAGE_POOL_STATS
+	/* recycle stats are per-cpu to avoid locking */
+	struct page_pool_recycle_stats __percpu *recycle_stats;
+#endif
 	atomic_t pages_state_release_cnt;
 
 	/* A page_pool is strictly tied to a single RX-queue being
diff --git a/net/Kconfig b/net/Kconfig
index a34f4585218892533057c32880b9179987581ba8..1d4211802ebdd29393e465f2ccb9a9bae54016ea 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -440,6 +440,19 @@ config NET_DEVLINK
 config PAGE_POOL
 	bool
 
+config PAGE_POOL_STATS
+	default n
+	bool "Page pool stats"
+	depends on PAGE_POOL
+	help
+	  Enable page pool statistics to track page allocation and recycling
+	  in page pools. This option incurs additional CPU cost in allocation
+	  and recycle paths and additional memory cost to store the statistics.
+	  These statistics are only available if this option is enabled and if
+	  the driver using the page pool supports exporting this data.
+
+	  If unsure, say N.
+
 config FAILOVER
 	tristate "Generic failover module"
 	help
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 381705fed4db9bd63d78980afaabb795d2f06ba9..1983035f5ca1d66437ed102c2a65ece56469c5b3 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -18,6 +18,7 @@
 #include
 #include /* for __put_page() */
 #include
+#include
 
 #include
 
@@ -26,6 +27,112 @@
 
 #define BIAS_MAX	LONG_MAX
 
+#ifdef CONFIG_PAGE_POOL_STATS
+/* alloc_stat_inc is intended to be used in softirq context */
+#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
+/* recycle_stat_inc is safe to use when preemption is possible. */
+#define recycle_stat_inc(pool, __stat)						\
+	do {									\
+		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+		this_cpu_inc(s->__stat);					\
+	} while (0)
+
+#define recycle_stat_add(pool, __stat, val)					\
+	do {									\
+		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+		this_cpu_add(s->__stat, val);					\
+	} while (0)
+
+static const char pp_stats[][ETH_GSTRING_LEN] = {
+	"rx_pp_alloc_fast",
+	"rx_pp_alloc_slow",
+	"rx_pp_alloc_slow_ho",
+	"rx_pp_alloc_empty",
+	"rx_pp_alloc_refill",
+	"rx_pp_alloc_waive",
+	"rx_pp_recycle_cached",
+	"rx_pp_recycle_cache_full",
+	"rx_pp_recycle_ring",
+	"rx_pp_recycle_ring_full",
+	"rx_pp_recycle_released_ref",
+};
+
+bool page_pool_get_stats(struct page_pool *pool,
+			 struct page_pool_stats *stats)
+{
+	int cpu = 0;
+
+	if (!stats)
+		return false;
+
+	/* The caller is responsible for initializing stats. */
+	stats->alloc_stats.fast += pool->alloc_stats.fast;
+	stats->alloc_stats.slow += pool->alloc_stats.slow;
+	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
+	stats->alloc_stats.empty += pool->alloc_stats.empty;
+	stats->alloc_stats.refill += pool->alloc_stats.refill;
+	stats->alloc_stats.waive += pool->alloc_stats.waive;
+
+	for_each_possible_cpu(cpu) {
+		const struct page_pool_recycle_stats *pcpu =
+			per_cpu_ptr(pool->recycle_stats, cpu);
+
+		stats->recycle_stats.cached += pcpu->cached;
+		stats->recycle_stats.cache_full += pcpu->cache_full;
+		stats->recycle_stats.ring += pcpu->ring;
+		stats->recycle_stats.ring_full += pcpu->ring_full;
+		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(page_pool_get_stats);
+
+u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
+		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
+		data += ETH_GSTRING_LEN;
+	}
+
+	return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
+
+int page_pool_ethtool_stats_get_count(void)
+{
+	return ARRAY_SIZE(pp_stats);
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
+
+u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+{
+	struct page_pool_stats *pool_stats = stats;
+
+	*data++ = pool_stats->alloc_stats.fast;
+	*data++ = pool_stats->alloc_stats.slow;
+	*data++ = pool_stats->alloc_stats.slow_high_order;
+	*data++ = pool_stats->alloc_stats.empty;
+	*data++ = pool_stats->alloc_stats.refill;
+	*data++ = pool_stats->alloc_stats.waive;
+	*data++ = pool_stats->recycle_stats.cached;
+	*data++ = pool_stats->recycle_stats.cache_full;
+	*data++ = pool_stats->recycle_stats.ring;
+	*data++ = pool_stats->recycle_stats.ring_full;
+	*data++ = pool_stats->recycle_stats.released_refcnt;
+
+	return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get);
+
+#else
+#define alloc_stat_inc(pool, __stat)
+#define recycle_stat_inc(pool, __stat)
+#define recycle_stat_add(pool, __stat, val)
+#endif
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -71,6 +178,12 @@ static int page_pool_init(struct page_pool *pool,
 	    pool->p.flags & PP_FLAG_PAGE_FRAG)
 		return -EINVAL;
 
+#ifdef CONFIG_PAGE_POOL_STATS
+	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+	if (!pool->recycle_stats)
+		return -ENOMEM;
+#endif
+
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
@@ -115,8 +228,10 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
 	int pref_nid; /* preferred NUMA node */
 
 	/* Quicker fallback, avoid locks when ring is empty */
-	if (__ptr_ring_empty(r))
+	if (__ptr_ring_empty(r)) {
+		alloc_stat_inc(pool, empty);
 		return NULL;
+	}
 
 	/* Softirq guarantee CPU and thus NUMA node is stable. This,
 	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
@@ -146,14 +261,17 @@
 			 * This limit stress on page buddy alloactor.
 			 */
 			page_pool_return_page(pool, page);
+			alloc_stat_inc(pool, waive);
 			page = NULL;
 			break;
 		}
 	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
 
 	/* Return last page */
-	if (likely(pool->alloc.count > 0))
+	if (likely(pool->alloc.count > 0)) {
 		page = pool->alloc.cache[--pool->alloc.count];
+		alloc_stat_inc(pool, refill);
+	}
 
 	spin_unlock(&r->consumer_lock);
 	return page;
@@ -168,6 +286,7 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
 	if (likely(pool->alloc.count)) {
 		/* Fast-path */
 		page = pool->alloc.cache[--pool->alloc.count];
+		alloc_stat_inc(pool, fast);
 	} else {
 		page = page_pool_refill_alloc_cache(pool);
 	}
@@ -237,6 +356,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 		return NULL;
 	}
 
+	alloc_stat_inc(pool, slow_high_order);
 	page_pool_set_pp_info(pool, page);
 
 	/* Track how many pages are held 'in-flight' */
@@ -267,7 +387,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
 	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
 
-	nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache);
+	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
+					       pool->alloc.cache);
 	if (unlikely(!nr_pages))
 		return NULL;
 
@@ -291,10 +412,12 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	}
 
 	/* Return last page */
-	if (likely(pool->alloc.count > 0))
+	if (likely(pool->alloc.count > 0)) {
 		page = pool->alloc.cache[--pool->alloc.count];
-	else
+		alloc_stat_inc(pool, slow);
+	} else {
 		page = NULL;
+	}
 
 	/* When page just alloc'ed is should/must have refcnt 1. */
 	return page;
@@ -392,7 +515,12 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 	else
 		ret = ptr_ring_produce_bh(&pool->ring, page);
 
-	return (ret == 0) ? true : false;
+	if (!ret) {
+		recycle_stat_inc(pool, ring);
+		return true;
+	}
+
+	return false;
 }
 
 /* Only allow direct recycling in special circumstances, into the
@@ -403,11 +531,14 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 static bool page_pool_recycle_in_cache(struct page *page,
 				       struct page_pool *pool)
 {
-	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
+		recycle_stat_inc(pool, cache_full);
 		return false;
+	}
 
 	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
 	pool->alloc.cache[pool->alloc.count++] = page;
+	recycle_stat_inc(pool, cached);
 	return true;
 }
 
@@ -462,6 +593,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 	 * doing refcnt based recycle tricks, meaning another process
 	 * will be invoking put_page.
 	 */
+	recycle_stat_inc(pool, released_refcnt);
 	/* Do not replace this with page_pool_return_page() */
 	page_pool_release_page(pool, page);
 	put_page(page);
@@ -475,6 +607,7 @@ void page_pool_put_page(struct page_pool *pool, struct page *page,
 	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 	if (page && !page_pool_recycle_in_ring(pool, page)) {
 		/* Cache full, fallback to free pages */
+		recycle_stat_inc(pool, ring_full);
 		page_pool_return_page(pool, page);
 	}
 }
@@ -501,9 +634,13 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 	/* Bulk producer into ptr_ring page_pool cache */
 	page_pool_ring_lock(pool);
 	for (i = 0; i < bulk_len; i++) {
-		if (__ptr_ring_produce(&pool->ring, data[i]))
-			break; /* ring full */
+		if (__ptr_ring_produce(&pool->ring, data[i])) {
+			/* ring full */
+			recycle_stat_inc(pool, ring_full);
+			break;
+		}
 	}
+	recycle_stat_add(pool, ring, i);
 	page_pool_ring_unlock(pool);
 
 	/* Hopefully all pages was return into ptr_ring */
@@ -569,8 +706,10 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
 
 	if (page && *offset + size > max_size) {
 		page = page_pool_drain_frag(pool, page);
-		if (page)
+		if (page) {
+			alloc_stat_inc(pool, fast);
 			goto frag_reset;
+		}
 	}
 
 	if (!page) {
@@ -592,6 +731,7 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
 
 	pool->frag_users++;
 	pool->frag_offset = *offset + size;
+	alloc_stat_inc(pool, fast);
 	return page;
 }
 EXPORT_SYMBOL(page_pool_alloc_frag);
@@ -621,6 +761,9 @@ static void page_pool_free(struct page_pool *pool)
 	if (pool->p.flags & PP_FLAG_DMA_MAP)
 		put_device(pool->p.dev);
 
+#ifdef CONFIG_PAGE_POOL_STATS
+	free_percpu(pool->recycle_stats);
+#endif
 	kfree(pool);
 }