Re: [PATCH net-next v7 3/8] page_pool: fix IOMMU crash when driver has already unbound

From: Jesper Dangaard Brouer
Date: Wed Jan 15 2025 - 11:30:14 EST

On 10/01/2025 14.06, Yunsheng Lin wrote:
[...]
In order not to call DMA APIs to do DMA unmapping after the driver
has already unbound, and not to stall the unloading of the networking
driver, use some pre-allocated item blocks to record inflight
pages, including the ones which are handed over to the network stack,
so the page_pool can do the DMA unmapping for those pages when
page_pool_destroy() is called. As the pre-allocated item blocks
need to be large enough to avoid performance degradation, add an
'item_fast_empty' stat to indicate the unavailability of the
pre-allocated item blocks.
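
To save other readers some digging: my reading of the item/block
bookkeeping described above is roughly the sketch below. This is my own
reconstruction; the real struct definitions live in the header changes
of this series, which are trimmed from the quote, so treat the exact
fields as a guess:

	/* one item per inflight page; page->pp gets repointed at this */
	struct page_pool_item {
		unsigned long state;	/* USED/MAPPED bits, see below */
		/* A free item sits on pool->hold_items/release_items via
		 * lentry; an in-use item records the page it tracks in
		 * pp_netmem (possibly a union in the real definition).
		 */
		struct llist_node lentry;
		netmem_ref pp_netmem;
	};

	/* page-sized carrier of items, linked on pool->item_blocks */
	struct page_pool_item_block {
		struct page_pool *pp;	/* back-pointer to the owning pool */
		struct list_head list;
		struct page_pool_item items[];	/* ITEMS_PER_PAGE entries */
	};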


[...]
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 1aa7b93bdcc8..fa7629c3ec94 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
[...]
@@ -268,6 +271,7 @@ static int page_pool_init(struct page_pool *pool,
return -ENOMEM;
}
+ spin_lock_init(&pool->item_lock);
atomic_set(&pool->pages_state_release_cnt, 0);
/* Driver calling page_pool_create() also call page_pool_destroy() */
@@ -325,6 +329,200 @@ static void page_pool_uninit(struct page_pool *pool)
#endif
}
+#define PAGE_POOL_ITEM_USED 0
+#define PAGE_POOL_ITEM_MAPPED 1
+
+#define ITEMS_PER_PAGE ((PAGE_SIZE - \
+ offsetof(struct page_pool_item_block, items)) / \
+ sizeof(struct page_pool_item))
+
+#define page_pool_item_init_state(item) \
+({ \
+ (item)->state = 0; \
+})
+
+#if defined(CONFIG_DEBUG_NET)
+#define page_pool_item_set_used(item) \
+ __set_bit(PAGE_POOL_ITEM_USED, &(item)->state)
+
+#define page_pool_item_clear_used(item) \
+ __clear_bit(PAGE_POOL_ITEM_USED, &(item)->state)
+
+#define page_pool_item_is_used(item) \
+ test_bit(PAGE_POOL_ITEM_USED, &(item)->state)
+#else
+#define page_pool_item_set_used(item)
+#define page_pool_item_clear_used(item)
+#define page_pool_item_is_used(item) false
+#endif
+
+#define page_pool_item_set_mapped(item) \
+ __set_bit(PAGE_POOL_ITEM_MAPPED, &(item)->state)
+
+/* Only clear_mapped and is_mapped need to be atomic as they can be
+ * called concurrently.
+ */
+#define page_pool_item_clear_mapped(item) \
+ clear_bit(PAGE_POOL_ITEM_MAPPED, &(item)->state)
+
+#define page_pool_item_is_mapped(item) \
+ test_bit(PAGE_POOL_ITEM_MAPPED, &(item)->state)
+
+static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
+ netmem_ref netmem,
+ bool destroyed)
+{
+ struct page_pool_item *item;
+ dma_addr_t dma;
+
+ if (!pool->dma_map)
+ /* Always account for inflight pages, even if we didn't
+ * map them
+ */
+ return;
+
+ dma = page_pool_get_dma_addr_netmem(netmem);
+ item = netmem_get_pp_item(netmem);
+
+ /* DMA unmapping is always needed when page_pool_destroy() has not
+ * been called yet.
+ */
+ DEBUG_NET_WARN_ON_ONCE(!destroyed && !page_pool_item_is_mapped(item));
+ if (unlikely(destroyed && !page_pool_item_is_mapped(item)))
+ return;
+
+ /* When page is unmapped, it cannot be returned to our pool */
+ dma_unmap_page_attrs(pool->p.dev, dma,
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+ page_pool_set_dma_addr_netmem(netmem, 0);
+ page_pool_item_clear_mapped(item);
+}
+

I have a hard time reading/reviewing/maintaining the code below without
some design description. This code needs more comments on what the
*intent* is and what design it's trying to achieve.

From the patch description, the only hint I have is:
"use some pre-allocated item blocks to record inflight pages"

E.g. Why is it needed/smart to hijack the page->pp pointer?
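
If I read it right, the hijack is what lets page_pool find the per-page
DMA-unmap state after the page has left the pool, and the pool itself is
still reachable through the item's containing block (block->pp below).
I.e. something like this sketch of the netmem-to-pool lookup, where the
helper name is mine and I'm assuming the item block is page-aligned:

	static inline struct page_pool *
	page_pool_item_to_pool(struct page_pool_item *item)
	{
		struct page_pool_item_block *block;

		/* Items are carved out of a single page, so masking the item
		 * address down to a page boundary gives the enclosing block
		 * header, which was given the pool pointer at init time.
		 */
		block = (struct page_pool_item_block *)((unsigned long)item & PAGE_MASK);

		return block->pp;
	}

If that is the intent, please spell it out in a comment.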

+static void __page_pool_item_init(struct page_pool *pool, struct page *page)
+{

The function name is confusing. At first I thought this was init'ing a
single item, but looking at the code it is iterating over ITEMS_PER_PAGE.

Maybe it should be called page_pool_item_block_init?

+ struct page_pool_item_block *block = page_address(page);
+ struct page_pool_item *items = block->items;
+ unsigned int i;
+
+ list_add(&block->list, &pool->item_blocks);
+ block->pp = pool;
+
+ for (i = 0; i < ITEMS_PER_PAGE; i++) {
+ page_pool_item_init_state(&items[i]);
+ __llist_add(&items[i].lentry, &pool->hold_items);
+ }
+}
+
+static int page_pool_item_init(struct page_pool *pool)
+{
+#define PAGE_POOL_MIN_INFLIGHT_ITEMS 512
+ struct page_pool_item_block *block;
+ int item_cnt;
+
+ INIT_LIST_HEAD(&pool->item_blocks);
+ init_llist_head(&pool->hold_items);
+ init_llist_head(&pool->release_items);
+
+ item_cnt = pool->p.pool_size * 2 + PP_ALLOC_CACHE_SIZE +
+ PAGE_POOL_MIN_INFLIGHT_ITEMS;
+ while (item_cnt > 0) {
+ struct page *page;
+
+ page = alloc_pages_node(pool->p.nid, GFP_KERNEL, 0);
+ if (!page)
+ goto err;
+
+ __page_pool_item_init(pool, page);
+ item_cnt -= ITEMS_PER_PAGE;
+ }
+
+ return 0;
+err:
+ list_for_each_entry(block, &pool->item_blocks, list)
+ put_page(virt_to_page(block));
+
+ return -ENOMEM;
+}
+
+static void page_pool_item_unmap(struct page_pool *pool,
+ struct page_pool_item *item)
+{
+ spin_lock_bh(&pool->item_lock);
+ __page_pool_release_page_dma(pool, item->pp_netmem, true);
+ spin_unlock_bh(&pool->item_lock);
+}
+
+static void page_pool_items_unmap(struct page_pool *pool)
+{
+ struct page_pool_item_block *block;
+
+ if (!pool->dma_map || pool->mp_priv)
+ return;
+
+ list_for_each_entry(block, &pool->item_blocks, list) {
+ struct page_pool_item *items = block->items;
+ int i;
+
+ for (i = 0; i < ITEMS_PER_PAGE; i++) {
+ struct page_pool_item *item = &items[i];
+
+ if (!page_pool_item_is_mapped(item))
+ continue;
+
+ page_pool_item_unmap(pool, item);
+ }
+ }
+}
+
+static void page_pool_item_uninit(struct page_pool *pool)
+{
+ while (!list_empty(&pool->item_blocks)) {
+ struct page_pool_item_block *block;
+
+ block = list_first_entry(&pool->item_blocks,
+ struct page_pool_item_block,
+ list);
+ list_del(&block->list);
+ put_page(virt_to_page(block));
+ }
+}
+
+static bool page_pool_item_add(struct page_pool *pool, netmem_ref netmem)
+{
+ struct page_pool_item *item;
+ struct llist_node *node;
+
+ if (unlikely(llist_empty(&pool->hold_items))) {
+ pool->hold_items.first = llist_del_all(&pool->release_items);
+
+ if (unlikely(llist_empty(&pool->hold_items))) {
+ alloc_stat_inc(pool, item_fast_empty);
+ return false;
+ }
+ }
+
+ node = pool->hold_items.first;
+ pool->hold_items.first = node->next;
+ item = llist_entry(node, struct page_pool_item, lentry);
+ item->pp_netmem = netmem;
+ page_pool_item_set_used(item);
+ netmem_set_pp_item(netmem, item);
+ return true;
+}
+
+static void page_pool_item_del(struct page_pool *pool, netmem_ref netmem)
+{
+ struct page_pool_item *item = netmem_get_pp_item(netmem);
+
+ DEBUG_NET_WARN_ON_ONCE(item->pp_netmem != netmem);
+ DEBUG_NET_WARN_ON_ONCE(page_pool_item_is_mapped(item));
+ DEBUG_NET_WARN_ON_ONCE(!page_pool_item_is_used(item));
+ page_pool_item_clear_used(item);
+ netmem_set_pp_item(netmem, NULL);
+ llist_add(&item->lentry, &pool->release_items);
+}
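
As far as I can tell, hold_items/release_items form a split free list:
the allocation side is the only consumer of hold_items (which is why the
non-atomic first/next manipulation in page_pool_item_add() is safe),
free/recycle contexts push onto release_items with the atomic
llist_add(), and hold_items is refilled by stealing the whole release
list when it runs dry. That also deserves a comment. The pattern in
isolation, with the page_pool bookkeeping stripped out (function names
below are mine):

	static struct page_pool_item *item_get(struct page_pool *pool)
	{
		struct llist_node *node;

		/* Single consumer: only the allocation path touches
		 * hold_items, so plain pointer updates are enough here.
		 */
		if (llist_empty(&pool->hold_items))
			pool->hold_items.first =
				llist_del_all(&pool->release_items);

		node = pool->hold_items.first;
		if (!node)
			return NULL;

		pool->hold_items.first = node->next;
		return llist_entry(node, struct page_pool_item, lentry);
	}

	static void item_put(struct page_pool *pool, struct page_pool_item *item)
	{
		/* Free/recycle paths can run concurrently with each other,
		 * hence the atomic push onto release_items.
		 */
		llist_add(&item->lentry, &pool->release_items);
	}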
+
/**
* page_pool_create_percpu() - create a page pool for a given cpu.
* @params: parameters, see struct page_pool_params
@@ -344,12 +542,18 @@ page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
if (err < 0)
goto err_free;
- err = page_pool_list(pool);
+ err = page_pool_item_init(pool);
if (err)
goto err_uninit;
+ err = page_pool_list(pool);
+ if (err)
+ goto err_item_uninit;
+
return pool;
+err_item_uninit:
+ page_pool_item_uninit(pool);
err_uninit:
page_pool_uninit(pool);
err_free:
@@ -369,7 +573,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
}
EXPORT_SYMBOL(page_pool_create);
-static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
+static void __page_pool_return_page(struct page_pool *pool, netmem_ref netmem,
+ bool destroyed);
static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
{
@@ -407,7 +612,7 @@ static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
* (2) break out to fallthrough to alloc_pages_node.
* This limits stress on the page buddy allocator.
*/
- page_pool_return_page(pool, netmem);
+ __page_pool_return_page(pool, netmem, false);
alloc_stat_inc(pool, waive);
netmem = 0;
break;
@@ -464,6 +669,7 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
{
+ struct page_pool_item *item;
dma_addr_t dma;
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
@@ -481,6 +687,9 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
if (page_pool_set_dma_addr_netmem(netmem, dma))
goto unmap_failed;
+ item = netmem_get_pp_item(netmem);
+ DEBUG_NET_WARN_ON_ONCE(page_pool_item_is_mapped(item));
+ page_pool_item_set_mapped(item);
page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
return true;
@@ -503,19 +712,24 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
if (unlikely(!page))
return NULL;
- if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
- put_page(page);
- return NULL;
- }
+ if (unlikely(!page_pool_set_pp_info(pool, page_to_netmem(page))))
+ goto err_alloc;
+
+ if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page))))
+ goto err_set_info;
alloc_stat_inc(pool, slow_high_order);
- page_pool_set_pp_info(pool, page_to_netmem(page));
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
trace_page_pool_state_hold(pool, page_to_netmem(page),
pool->pages_state_hold_cnt);
return page;
+err_set_info:
+ page_pool_clear_pp_info(pool, page_to_netmem(page));
+err_alloc:
+ put_page(page);
+ return NULL;
}
/* slow path */
@@ -550,12 +764,18 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
*/
for (i = 0; i < nr_pages; i++) {
netmem = pool->alloc.cache[i];
+
+ if (unlikely(!page_pool_set_pp_info(pool, netmem))) {
+ put_page(netmem_to_page(netmem));
+ continue;
+ }
+
if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
+ page_pool_clear_pp_info(pool, netmem);
put_page(netmem_to_page(netmem));
continue;
}
- page_pool_set_pp_info(pool, netmem);
pool->alloc.cache[pool->alloc.count++] = netmem;
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -627,9 +847,11 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict)
return inflight;
}
-void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
+bool page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
- netmem_set_pp(netmem, pool);
+ if (unlikely(!page_pool_item_add(pool, netmem)))
+ return false;
+
netmem_or_pp_magic(netmem, PP_SIGNATURE);
/* Ensuring all pages have been split into one fragment initially:
@@ -641,32 +863,14 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
page_pool_fragment_netmem(netmem, 1);
if (pool->has_init_callback)
pool->slow.init_callback(netmem, pool->slow.init_arg);
-}
-void page_pool_clear_pp_info(netmem_ref netmem)
-{
- netmem_clear_pp_magic(netmem);
- netmem_set_pp(netmem, NULL);
+ return true;
}
-static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
- netmem_ref netmem)
+void page_pool_clear_pp_info(struct page_pool *pool, netmem_ref netmem)
{
- dma_addr_t dma;
-
- if (!pool->dma_map)
- /* Always account for inflight pages, even if we didn't
- * map them
- */
- return;
-
- dma = page_pool_get_dma_addr_netmem(netmem);
-
- /* When page is unmapped, it cannot be returned to our pool */
- dma_unmap_page_attrs(pool->p.dev, dma,
- PAGE_SIZE << pool->p.order, pool->p.dma_dir,
- DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
- page_pool_set_dma_addr_netmem(netmem, 0);
+ netmem_clear_pp_magic(netmem);
+ page_pool_item_del(pool, netmem);
}
/* Disconnects a page (from a page_pool). API users can have a need
@@ -674,7 +878,8 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
* a regular page (that will eventually be returned to the normal
* page-allocator via put_page).
*/
-void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
+void __page_pool_return_page(struct page_pool *pool, netmem_ref netmem,
+ bool destroyed)
{
int count;
bool put;
@@ -683,7 +888,7 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
put = mp_dmabuf_devmem_release_page(pool, netmem);
else
- __page_pool_release_page_dma(pool, netmem);
+ __page_pool_release_page_dma(pool, netmem, destroyed);
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
@@ -692,7 +897,7 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
trace_page_pool_state_release(pool, netmem, count);
if (put) {
- page_pool_clear_pp_info(netmem);
+ page_pool_clear_pp_info(pool, netmem);
put_page(netmem_to_page(netmem));
}
/* An optimization would be to call __free_pages(page, pool->p.order)
@@ -701,6 +906,27 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
*/
}
+/* Called from the page_pool_put_*() path; needs to be synchronized with
+ * the page_pool_destroy() path.
+ */
+static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
+{
+ unsigned int destroy_cnt;
+
+ rcu_read_lock();
+
+ destroy_cnt = READ_ONCE(pool->destroy_cnt);
+ if (unlikely(destroy_cnt)) {
+ spin_lock_bh(&pool->item_lock);
+ __page_pool_return_page(pool, netmem, true);
+ spin_unlock_bh(&pool->item_lock);
+ } else {
+ __page_pool_return_page(pool, netmem, false);
+ }
+
+ rcu_read_unlock();
+}
+
static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
int ret;
@@ -963,7 +1189,7 @@ static netmem_ref page_pool_drain_frag(struct page_pool *pool,
return netmem;
}
- page_pool_return_page(pool, netmem);
+ __page_pool_return_page(pool, netmem, false);
return 0;
}
@@ -977,7 +1203,7 @@ static void page_pool_free_frag(struct page_pool *pool)
if (!netmem || page_pool_unref_netmem(netmem, drain_count))
return;
- page_pool_return_page(pool, netmem);
+ __page_pool_return_page(pool, netmem, false);
}
netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
@@ -1053,6 +1279,7 @@ static void __page_pool_destroy(struct page_pool *pool)
if (pool->disconnect)
pool->disconnect(pool);
+ page_pool_item_uninit(pool);
page_pool_unlist(pool);
page_pool_uninit(pool);
@@ -1084,7 +1311,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
static void page_pool_scrub(struct page_pool *pool)
{
page_pool_empty_alloc_cache_once(pool);
- pool->destroy_cnt++;
+ WRITE_ONCE(pool->destroy_cnt, pool->destroy_cnt + 1);
/* No more consumers should exist, but producers could still
* be in-flight.
@@ -1178,6 +1405,8 @@ void page_pool_destroy(struct page_pool *pool)
*/
synchronize_rcu();
+ page_pool_items_unmap(pool);
+
page_pool_detached(pool);
pool->defer_start = jiffies;
pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
@@ -1198,7 +1427,7 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
/* Flush pool alloc cache, as refill will check NUMA node */
while (pool->alloc.count) {
netmem = pool->alloc.cache[--pool->alloc.count];
- page_pool_return_page(pool, netmem);
+ __page_pool_return_page(pool, netmem, false);
}
}
EXPORT_SYMBOL(page_pool_update_nid);
diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h
index 57439787b9c2..5d85f862a30a 100644
--- a/net/core/page_pool_priv.h
+++ b/net/core/page_pool_priv.h
@@ -36,16 +36,18 @@ static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
}
#if defined(CONFIG_PAGE_POOL)
-void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
-void page_pool_clear_pp_info(netmem_ref netmem);
+bool page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
+void page_pool_clear_pp_info(struct page_pool *pool, netmem_ref netmem);
int page_pool_check_memory_provider(struct net_device *dev,
struct netdev_rx_queue *rxq);
#else
-static inline void page_pool_set_pp_info(struct page_pool *pool,
+static inline bool page_pool_set_pp_info(struct page_pool *pool,
netmem_ref netmem)
{
+ return true;
}
-static inline void page_pool_clear_pp_info(netmem_ref netmem)
+static inline void page_pool_clear_pp_info(struct page_pool *pool,
+ netmem_ref netmem)
{
}
static inline int page_pool_check_memory_provider(struct net_device *dev,