[RFC PATCH net-next v5 07/14] page_pool: devmem support

From: Mina Almasry
Date: Sun Dec 17 2023 - 21:44:12 EST


Convert netmem to be a union of struct page and struct netmem. Overload
the LSB of struct netmem* to indicate that it's a net_iov, otherwise
it's a page.

Currently these entries in struct page are rented by the page_pool and
used exclusively by the net stack:

struct {
unsigned long pp_magic;
struct page_pool *pp;
unsigned long _pp_mapping_pad;
unsigned long dma_addr;
atomic_long_t pp_ref_count;
};

Mirror these (and only these) entries into struct net_iov and implement
netmem helpers that can access these common fields regardless of
whether the underlying type is page or net_iov.

Implement checks for net_iov in netmem helpers which delegate to mm
APIs, to ensure net_iov are never passed to the mm stack.

Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx>

---

RFCv5:
- Use netmem instead of page* with LSB set.
- Use pp_ref_count for refcounting net_iov.
- Removed many of the custom checks for netmem.

v1:
- Disable fragmentation support for iov properly.
- fix napi_pp_put_page() path (Yunsheng).
- Use pp_frag_count for devmem refcounting.

---
include/net/netmem.h | 145 ++++++++++++++++++++++++++++++--
include/net/page_pool/helpers.h | 25 +++---
net/core/page_pool.c | 26 +++---
net/core/skbuff.c | 9 +-
4 files changed, 164 insertions(+), 41 deletions(-)

diff --git a/include/net/netmem.h b/include/net/netmem.h
index 31f338f19da0..7557aecc0f78 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -12,11 +12,47 @@

/* net_iov */

+DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
+
+/* We overload the LSB of the struct page pointer to indicate whether it's
+ * a page or net_iov.
+ */
+#define NET_IOV 0x01UL
+
struct net_iov {
+ unsigned long __unused_padding;
+ unsigned long pp_magic;
+ struct page_pool *pp;
struct dmabuf_genpool_chunk_owner *owner;
unsigned long dma_addr;
+ atomic_long_t pp_ref_count;
};

+/* These fields in struct page are used by the page_pool and net stack:
+ *
+ * struct {
+ * unsigned long pp_magic;
+ * struct page_pool *pp;
+ * unsigned long _pp_mapping_pad;
+ * unsigned long dma_addr;
+ * atomic_long_t pp_ref_count;
+ * };
+ *
+ * We mirror the page_pool fields here so the page_pool can access these fields
+ * without worrying whether the underlying fields belong to a page or net_iov.
+ *
+ * The non-net stack fields of struct page are private to the mm stack and must
+ * never be mirrored to net_iov.
+ */
+#define NET_IOV_ASSERT_OFFSET(pg, iov) \
+ static_assert(offsetof(struct page, pg) == \
+ offsetof(struct net_iov, iov))
+NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic);
+NET_IOV_ASSERT_OFFSET(pp, pp);
+NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
+NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
+#undef NET_IOV_ASSERT_OFFSET
+
static inline struct dmabuf_genpool_chunk_owner *
net_iov_owner(const struct net_iov *niov)
{
@@ -47,19 +83,25 @@ net_iov_binding(const struct net_iov *niov)
struct netmem {
union {
struct page page;
-
- /* Stub to prevent compiler implicitly converting from page*
- * to netmem_t* and vice versa.
- *
- * Other memory type(s) net stack would like to support
- * can be added to this union.
- */
- void *addr;
+ struct net_iov niov;
};
};

+static inline bool netmem_is_net_iov(const struct netmem *netmem)
+{
+#ifdef CONFIG_PAGE_POOL
+ return static_branch_unlikely(&page_pool_mem_providers) &&
+ (unsigned long)netmem & NET_IOV;
+#else
+ return false;
+#endif
+}
+
static inline struct page *netmem_to_page(struct netmem *netmem)
{
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return NULL;
+
return &netmem->page;
}

@@ -70,17 +112,104 @@ static inline struct netmem *page_to_netmem(struct page *page)

static inline int netmem_ref_count(struct netmem *netmem)
{
+ /* The non-pp refcount of netmem is always 1. On netmem, we only support
+ * pp refcounting.
+ */
+ if (netmem_is_net_iov(netmem))
+ return 1;
+
return page_ref_count(netmem_to_page(netmem));
}

static inline unsigned long netmem_to_pfn(struct netmem *netmem)
{
+ if (netmem_is_net_iov(netmem))
+ return 0;
+
return page_to_pfn(netmem_to_page(netmem));
}

+static inline struct net_iov *__netmem_clear_lsb(struct netmem *netmem)
+{
+ return (struct net_iov *)((unsigned long)netmem & ~NET_IOV);
+}
+
+static inline unsigned long netmem_get_pp_magic(struct netmem *netmem)
+{
+ return __netmem_clear_lsb(netmem)->pp_magic;
+}
+
+static inline void netmem_or_pp_magic(struct netmem *netmem,
+ unsigned long pp_magic)
+{
+ __netmem_clear_lsb(netmem)->pp_magic |= pp_magic;
+}
+
+static inline void netmem_clear_pp_magic(struct netmem *netmem)
+{
+ __netmem_clear_lsb(netmem)->pp_magic = 0;
+}
+
+static inline struct page_pool *netmem_get_pp(struct netmem *netmem)
+{
+ return __netmem_clear_lsb(netmem)->pp;
+}
+
+static inline void netmem_set_pp(struct netmem *netmem, struct page_pool *pool)
+{
+ __netmem_clear_lsb(netmem)->pp = pool;
+}
+
+static inline unsigned long netmem_get_dma_addr(struct netmem *netmem)
+{
+ return __netmem_clear_lsb(netmem)->dma_addr;
+}
+
+static inline void netmem_set_dma_addr(struct netmem *netmem,
+ unsigned long dma_addr)
+{
+ __netmem_clear_lsb(netmem)->dma_addr = dma_addr;
+}
+
+static inline atomic_long_t *netmem_get_pp_ref_count_ref(struct netmem *netmem)
+{
+ return &__netmem_clear_lsb(netmem)->pp_ref_count;
+}
+
+static inline bool netmem_is_pref_nid(struct netmem *netmem, int pref_nid)
+{
+ /* Assume net_iov are on the preferred node without actually
+ * checking...
+ *
+ * This check is only used to check for recycling memory in the page
+ * pool's fast paths. Currently the only implementation of net_iov
+ * is dmabuf device memory. It's a deliberate decision by the user to
+ * bind a certain dmabuf to a certain netdev, and the netdev rx queue
+ * would not be able to reallocate memory from another dmabuf that
+ * exists on the preferred node, so, this check doesn't make much sense
+ * in this case. Assume all net_iovs can be recycled for now.
+ */
+ if (netmem_is_net_iov(netmem))
+ return true;
+
+ return page_to_nid(netmem_to_page(netmem)) == pref_nid;
+}
+
static inline struct netmem *netmem_compound_head(struct netmem *netmem)
{
+ /* niov are never compounded */
+ if (netmem_is_net_iov(netmem))
+ return netmem;
+
return page_to_netmem(compound_head(netmem_to_page(netmem)));
}

+static inline void *netmem_address(struct netmem *netmem)
+{
+ if (netmem_is_net_iov(netmem))
+ return NULL;
+
+ return page_address(netmem_to_page(netmem));
+}
+
#endif /* _NET_NETMEM_H */
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index c71969279284..8827518379a8 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -215,7 +215,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)

static inline void page_pool_fragment_netmem(struct netmem *netmem, long nr)
{
- atomic_long_set(&netmem_to_page(netmem)->pp_ref_count, nr);
+ atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr);
}

/**
@@ -243,7 +243,7 @@ static inline void page_pool_fragment_page(struct page *page, long nr)

static inline long page_pool_unref_netmem(struct netmem *netmem, long nr)
{
- struct page *page = netmem_to_page(netmem);
+ atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem);
long ret;

/* If nr == pp_ref_count then we have cleared all remaining
@@ -260,19 +260,19 @@ static inline long page_pool_unref_netmem(struct netmem *netmem, long nr)
* initially, and only overwrite it when the page is partitioned into
* more than one piece.
*/
- if (atomic_long_read(&page->pp_ref_count) == nr) {
+ if (atomic_long_read(pp_ref_count) == nr) {
/* As we have ensured nr is always one for constant case using
* the BUILD_BUG_ON(), only need to handle the non-constant case
* here for pp_ref_count draining, which is a rare case.
*/
BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
if (!__builtin_constant_p(nr))
- atomic_long_set(&page->pp_ref_count, 1);
+ atomic_long_set(pp_ref_count, 1);

return 0;
}

- ret = atomic_long_sub_return(nr, &page->pp_ref_count);
+ ret = atomic_long_sub_return(nr, pp_ref_count);
WARN_ON(ret < 0);

/* We are the last user here too, reset pp_ref_count back to 1 to
@@ -281,7 +281,7 @@ static inline long page_pool_unref_netmem(struct netmem *netmem, long nr)
* page_pool_unref_page() currently.
*/
if (unlikely(!ret))
- atomic_long_set(&page->pp_ref_count, 1);
+ atomic_long_set(pp_ref_count, 1);

return ret;
}
@@ -391,9 +391,7 @@ static inline void page_pool_free_va(struct page_pool *pool, void *va,

static inline dma_addr_t page_pool_get_dma_addr_netmem(struct netmem *netmem)
{
- struct page *page = netmem_to_page(netmem);
-
- dma_addr_t ret = page->dma_addr;
+ dma_addr_t ret = netmem_get_dma_addr(netmem);

if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
ret <<= PAGE_SHIFT;
@@ -416,18 +414,17 @@ static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
static inline bool page_pool_set_dma_addr_netmem(struct netmem *netmem,
dma_addr_t addr)
{
- struct page *page = netmem_to_page(netmem);
-
if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
- page->dma_addr = addr >> PAGE_SHIFT;
+ netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT);

/* We assume page alignment to shave off bottom bits,
* if this "compression" doesn't work we need to drop.
*/
- return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT;
+ return addr != (dma_addr_t)netmem_get_dma_addr(netmem)
+ << PAGE_SHIFT;
}

- page->dma_addr = addr;
+ netmem_set_dma_addr(netmem, addr);
return false;
}

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 965a7bc0a407..173158a3dd61 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -25,7 +25,7 @@

#include "page_pool_priv.h"

-static DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
+DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)
@@ -334,7 +334,7 @@ page_pool_refill_alloc_cache(struct page_pool *pool)
if (unlikely(!netmem))
break;

- if (likely(page_to_nid(netmem_to_page(netmem)) == pref_nid)) {
+ if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
pool->alloc.cache[pool->alloc.count++] = netmem;
} else {
/* NUMA mismatch;
@@ -421,10 +421,8 @@ static bool page_pool_dma_map(struct page_pool *pool, struct netmem *netmem)

static void page_pool_set_pp_info(struct page_pool *pool, struct netmem *netmem)
{
- struct page *page = netmem_to_page(netmem);
-
- page->pp = pool;
- page->pp_magic |= PP_SIGNATURE;
+ netmem_set_pp(netmem, pool);
+ netmem_or_pp_magic(netmem, PP_SIGNATURE);

/* Ensuring all pages have been split into one fragment initially:
* page_pool_set_pp_info() is only called once for every page when it
@@ -439,10 +437,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, struct netmem *netmem)

static void page_pool_clear_pp_info(struct netmem *netmem)
{
- struct page *page = netmem_to_page(netmem);
-
- page->pp_magic = 0;
- page->pp = NULL;
+ netmem_clear_pp_magic(netmem);
+ netmem_set_pp(netmem, NULL);
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
@@ -670,8 +666,9 @@ static bool page_pool_recycle_in_cache(struct netmem *netmem,

static bool __page_pool_page_can_be_recycled(struct netmem *netmem)
{
- return page_ref_count(netmem_to_page(netmem)) == 1 &&
- !page_is_pfmemalloc(netmem_to_page(netmem));
+ return netmem_is_net_iov(netmem) ||
+ (page_ref_count(netmem_to_page(netmem)) == 1 &&
+ !page_is_pfmemalloc(netmem_to_page(netmem)));
}

/* If the page refcnt == 1, this will try to recycle the page.
@@ -693,7 +690,7 @@ __page_pool_put_page(struct page_pool *pool, struct netmem *netmem,
* refcnt == 1 means page_pool owns page, and can recycle it.
*
* page is NOT reusable when allocated when system is under
- * some pressure. (page_is_pfmemalloc)
+ * some pressure. (page_pool_page_is_pfmemalloc)
*/
if (likely(__page_pool_page_can_be_recycled(netmem))) {
/* Read barrier done in page_ref_count / READ_ONCE */
@@ -709,6 +706,7 @@ __page_pool_put_page(struct page_pool *pool, struct netmem *netmem,
/* Page found as candidate for recycling */
return netmem;
}
+
/* Fallback/non-XDP mode: API user have elevated refcnt.
*
* Many drivers split up the page into fragments, and some
@@ -906,7 +904,7 @@ static void page_pool_empty_ring(struct page_pool *pool)
/* Empty recycle ring */
while ((netmem = ptr_ring_consume_bh(&pool->ring))) {
/* Verify the refcnt invariant of cached pages */
- if (!(page_ref_count(netmem_to_page(netmem)) == 1))
+ if (!(netmem_ref_count(netmem) == 1))
pr_crit("%s() page_pool refcnt %d violation\n",
__func__, netmem_ref_count(netmem));

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ab86799b7fe4..96f85543f1dc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -901,11 +901,10 @@ static void skb_clone_fraglist(struct sk_buff *skb)
#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(struct netmem *netmem, bool napi_safe)
{
- struct page *page = netmem_to_page(netmem);
bool allow_direct = false;
struct page_pool *pp;

- page = compound_head(page);
+ netmem = netmem_compound_head(netmem);

/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
* in order to preserve any existing bits, such as bit 0 for the
@@ -914,10 +913,10 @@ bool napi_pp_put_page(struct netmem *netmem, bool napi_safe)
* and page_is_pfmemalloc() is checked in __page_pool_put_page()
* to avoid recycling the pfmemalloc page.
*/
- if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+ if (unlikely((netmem_get_pp_magic(netmem) & ~0x3UL) != PP_SIGNATURE))
return false;

- pp = page->pp;
+ pp = netmem_get_pp(netmem);

/* Allow direct recycle if we have reasons to believe that we are
* in the same context as the consumer would run, so there's
@@ -937,7 +936,7 @@ bool napi_pp_put_page(struct netmem *netmem, bool napi_safe)
* The page will be returned to the pool here regardless of the
* 'flipped' fragment being in use or not.
*/
- page_pool_put_full_netmem(pp, page_to_netmem(page), allow_direct);
+ page_pool_put_full_netmem(pp, netmem, allow_direct);

return true;
}
--
2.43.0.472.g3155946c3a-goog