Re: [PATCH net-next v4 3/3] skbuff: keep track of pp page when pp_frag_count is used
From: Ilias Apalodimas
Date: Mon Oct 04 2021 - 01:52:19 EST
On Thu, Sep 30, 2021 at 04:07:47PM +0800, Yunsheng Lin wrote:
> As the skb->pp_recycle and page->pp_magic may not be enough
> to track if a frag page is from page pool after the calling
> of __skb_frag_ref(), mostly because of a data race, see:
> commit 2cc3aeb5eccc ("skbuff: Fix a potential race while
> recycling page_pool packets").
>
> There may be clone and expand head case that might lose the
> track if a frag page is from page pool or not.
>
> And not being able to keep track of pp page may cause problem
> for the skb_split() case in tso_fragment() too:
> Supposing a skb has 3 frag pages, all coming from a page pool,
> and is split to skb1 and skb2:
> skb1: first frag page + first half of second frag page
> skb2: second half of second frag page + third frag page
>
> How do we set the skb->pp_recycle of skb1 and skb2?
> 1. If we set both of them to 1, then we may have a similar
> race as the above commit for second frag page.
> 2. If we set only one of them to 1, then we may have resource
> leaking problem as both first frag page and third frag page
> are indeed from page pool.
>
> Increment the pp_frag_count of pp page frag in __skb_frag_ref(),
> and only use page->pp_magic to indicate a pp page frag in
> __skb_frag_unref() to keep track of pp page frag.
>
> Similar handling is done for the head page of a skb too.
>
> As we need the head page of a compound page to decide if it is
> from page pool at first, so __page_frag_cache_drain() and
> page_ref_inc() is used to avoid unnecessary compound_head()
> calling.
>
> Signed-off-by: Yunsheng Lin <linyunsheng@xxxxxxxxxx>
> ---
> include/linux/skbuff.h | 30 ++++++++++++++++++++----------
> include/net/page_pool.h | 24 +++++++++++++++++++++++-
> net/core/page_pool.c | 17 ++---------------
> net/core/skbuff.c | 10 ++++++++--
> 4 files changed, 53 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 841e2f0f5240..aeee150d4a04 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3073,7 +3073,19 @@ static inline struct page *skb_frag_page(const skb_frag_t *frag)
> */
> static inline void __skb_frag_ref(skb_frag_t *frag)
> {
> - get_page(skb_frag_page(frag));
> + struct page *page = skb_frag_page(frag);
> +
> + page = compound_head(page);
> +
> +#ifdef CONFIG_PAGE_POOL
> + if (page_pool_is_pp_page(page) &&
> + page_pool_is_pp_page_frag(page)) {
> + page_pool_atomic_inc_frag_count(page);
> + return;
> + }
> +#endif
> +
> + page_ref_inc(page);
There's a BUG_ON we are now ignoring on get_page.
> }
>
> /**
> @@ -3100,11 +3112,16 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
> {
> struct page *page = skb_frag_page(frag);
>
> + page = compound_head(page);
> +
> #ifdef CONFIG_PAGE_POOL
> - if (recycle && page_pool_return_skb_page(page))
> + if (page_pool_is_pp_page(page) &&
> + (recycle || page_pool_is_pp_page_frag(page))) {
> + page_pool_return_skb_page(page);
> return;
> + }
> #endif
> - put_page(page);
> + __page_frag_cache_drain(page, 1);
Same here, freeing the page is not the only thing put_page does.
> }
>
> /**
> @@ -4718,12 +4735,5 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
> }
> #endif
>
> -static inline bool skb_pp_recycle(struct sk_buff *skb, void *data)
> -{
> - if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
> - return false;
> - return page_pool_return_skb_page(virt_to_page(data));
> -}
> -
> #endif /* __KERNEL__ */
> #endif /* _LINUX_SKBUFF_H */
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 3855f069627f..740a8ca7f4a6 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -164,7 +164,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
> return pool->p.dma_dir;
> }
>
> -bool page_pool_return_skb_page(struct page *page);
> +void page_pool_return_skb_page(struct page *page);
>
> struct page_pool *page_pool_create(const struct page_pool_params *params);
>
> @@ -231,6 +231,28 @@ static inline void page_pool_set_frag_count(struct page *page, long nr)
> atomic_long_set(&page->pp_frag_count, nr);
> }
>
> +static inline void page_pool_atomic_inc_frag_count(struct page *page)
> +{
> + atomic_long_inc(&page->pp_frag_count);
> +}
> +
> +static inline bool page_pool_is_pp_page(struct page *page)
> +{
> + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
> + * in order to preserve any existing bits, such as bit 0 for the
> + * head page of compound page and bit 1 for pfmemalloc page, so
> + * mask those bits for freeing side when doing below checking,
> + * and page_is_pfmemalloc() is checked in __page_pool_put_page()
> + * to avoid recycling the pfmemalloc page.
> + */
> + return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
> +}
> +
> +static inline bool page_pool_is_pp_page_frag(struct page *page)
> +{
> + return !!atomic_long_read(&page->pp_frag_count);
> +}
> +
> static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
> long nr)
> {
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 2c643b72ce16..d141e00459c9 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -219,6 +219,7 @@ static void page_pool_set_pp_info(struct page_pool *pool,
> {
> page->pp = pool;
> page->pp_magic |= PP_SIGNATURE;
> + page_pool_set_frag_count(page, 0);
> }
>
> static void page_pool_clear_pp_info(struct page *page)
> @@ -736,22 +737,10 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
> }
> EXPORT_SYMBOL(page_pool_update_nid);
>
> -bool page_pool_return_skb_page(struct page *page)
> +void page_pool_return_skb_page(struct page *page)
> {
> struct page_pool *pp;
>
> - page = compound_head(page);
> -
> - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
> - * in order to preserve any existing bits, such as bit 0 for the
> - * head page of compound page and bit 1 for pfmemalloc page, so
> - * mask those bits for freeing side when doing below checking,
> - * and page_is_pfmemalloc() is checked in __page_pool_put_page()
> - * to avoid recycling the pfmemalloc page.
> - */
> - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
> - return false;
> -
> pp = page->pp;
>
> /* Driver set this to memory recycling info. Reset it on recycle.
> @@ -760,7 +749,5 @@ bool page_pool_return_skb_page(struct page *page)
> * 'flipped' fragment being in use or not.
> */
> page_pool_put_full_page(pp, page, false);
> -
> - return true;
> }
> EXPORT_SYMBOL(page_pool_return_skb_page);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 74601bbc56ac..e3691b025d30 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -646,9 +646,15 @@ static void skb_free_head(struct sk_buff *skb)
> unsigned char *head = skb->head;
>
> if (skb->head_frag) {
> - if (skb_pp_recycle(skb, head))
> + struct page *page = virt_to_head_page(head);
> +
> + if (page_pool_is_pp_page(page) &&
> + (skb->pp_recycle || page_pool_is_pp_page_frag(page))) {
> + page_pool_return_skb_page(page);
> return;
> - skb_free_frag(head);
> + }
> +
> + __page_frag_cache_drain(page, 1);
> } else {
> kfree(head);
> }
> --
> 2.33.0
>
Regardless of the comments above, providing some numbers on how the
patches affect performance (at least on hns3), would be good to have.
I'll try giving this another look. I still think having three indicators
to look at before recycling the page is not ideal.
Regards
/Ilias