Re: [net-next RFC v3 7/7] net: ravb: Allocate RX buffers via page pool

From: Paul Barker
Date: Fri Apr 19 2024 - 04:03:15 EST


On 15/04/2024 13:16, Niklas Söderlund wrote:
> Hi Paul,
>
> I think using page pool is a good idea!
>
> On 2024-04-15 10:48:04 +0100, Paul Barker wrote:
>> This patch makes multiple changes that can't be separated:
>>
>> 1) Allocate plain RX buffers via a page pool instead of allocating
>> SKBs, then use build_skb() when a packet is received.
>> 2) For GbEth IP, reduce the RX buffer size to 2kB.
>> 3) For GbEth IP, merge packets which span more than one RX descriptor
>> as SKB fragments instead of copying data.
>>
>> Implementing (1) without (2) would require the use of an order-1 page
>> pool (instead of an order-0 page pool split into page fragments) for
>> GbEth.
>>
>> Implementing (2) without (3) would leave us no space to re-assemble
>> packets which span more than one RX descriptor.
>>
>> Implementing (3) without (1) would not be possible as the network stack
>> expects to use put_page() or page_pool_put_page() to free SKB fragments
>> after an SKB is consumed.
>>
>> This patch gives the following improvements during testing with iperf3.
>>
>> * RZ/G2L:
>> * TCP RX: same bandwidth at -43% CPU load (70% -> 40%)
>> * UDP RX: same bandwidth at -17% CPU load (88% -> 74%)
>>
>> * RZ/G2UL:
>> * TCP RX: +30% bandwidth (726Mbps -> 941Mbps)
>> * UDP RX: +417% bandwidth (108Mbps -> 558Mbps)
>>
>> * RZ/G3S:
>> * TCP RX: +64% bandwidth (562Mbps -> 920Mbps)
>> * UDP RX: +420% bandwidth (90Mbps -> 468Mbps)
>>
>> * RZ/Five:
>> * TCP RX: +217% bandwidth (145Mbps -> 459Mbps)
>> * UDP RX: +470% bandwidth (20Mbps -> 114Mbps)
>>
>> There is no significant impact on bandwidth or CPU load in testing on
>> RZ/G2H or R-Car M3N.
>>
>> Signed-off-by: Paul Barker <paul.barker.ct@xxxxxxxxxxxxxx>
>> ---
>> drivers/net/ethernet/renesas/ravb.h | 10 +-
>> drivers/net/ethernet/renesas/ravb_main.c | 209 +++++++++++++----------
>> 2 files changed, 128 insertions(+), 91 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/renesas/ravb.h b/drivers/net/ethernet/renesas/ravb.h
>> index 9c6392ade2f1..4348366c3dc7 100644
>> --- a/drivers/net/ethernet/renesas/ravb.h
>> +++ b/drivers/net/ethernet/renesas/ravb.h
>> @@ -1050,8 +1050,8 @@ struct ravb_hw_info {
>> netdev_features_t net_features;
>> int stats_len;
>> u32 tccr_mask;
>> + u32 rx_buffer_size;
>> u32 rx_max_frame_size;
>> - u32 rx_max_desc_use;
>> u32 rx_desc_size;
>> unsigned aligned_tx: 1;
>> unsigned needs_irq_coalesce:1; /* Needs software IRQ coalescing */
>> @@ -1071,6 +1071,11 @@ struct ravb_hw_info {
>> unsigned half_duplex:1; /* E-MAC supports half duplex mode */
>> };
>>
>> +struct ravb_rx_buffer {
>> + struct page *page;
>> + unsigned int offset;
>> +};
>> +
>> struct ravb_private {
>> struct net_device *ndev;
>> struct platform_device *pdev;
>> @@ -1094,7 +1099,8 @@ struct ravb_private {
>> struct ravb_tx_desc *tx_ring[NUM_TX_QUEUE];
>> void *tx_align[NUM_TX_QUEUE];
>> struct sk_buff *rx_1st_skb;
>> - struct sk_buff **rx_skb[NUM_RX_QUEUE];
>> + struct page_pool *rx_pool;
>
> Don't we need a page pool per queue? Otherwise, multiple calls to
> ravb_ring_init() and ravb_ring_free() for different queues risk
> allocating over a previous queue's pool and freeing the same one
> multiple times.

Ack.

>
>> + struct ravb_rx_buffer *rx_buffers[NUM_RX_QUEUE];
>> struct sk_buff **tx_skb[NUM_TX_QUEUE];
>> u32 rx_over_errors;
>> u32 rx_fifo_errors;
>> diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
>> index 7434faf0820c..892a3eadef1e 100644
>> --- a/drivers/net/ethernet/renesas/ravb_main.c
>> +++ b/drivers/net/ethernet/renesas/ravb_main.c
>> @@ -30,6 +30,7 @@
>> #include <linux/reset.h>
>> #include <linux/math64.h>
>> #include <net/ip.h>
>> +#include <net/page_pool/helpers.h>
>>
>> #include "ravb.h"
>>
>> @@ -113,25 +114,6 @@ static void ravb_set_rate_rcar(struct net_device *ndev)
>> }
>> }
>>
>> -static struct sk_buff *
>> -ravb_alloc_skb(struct net_device *ndev, const struct ravb_hw_info *info,
>> - gfp_t gfp_mask)
>> -{
>> - struct sk_buff *skb;
>> - u32 reserve;
>> -
>> - skb = __netdev_alloc_skb(ndev, info->rx_max_frame_size + RAVB_ALIGN - 1,
>> - gfp_mask);
>> - if (!skb)
>> - return NULL;
>> -
>> - reserve = (unsigned long)skb->data & (RAVB_ALIGN - 1);
>> - if (reserve)
>> - skb_reserve(skb, RAVB_ALIGN - reserve);
>> -
>> - return skb;
>> -}
>> -
>> /* Get MAC address from the MAC address registers
>> *
>> * Ethernet AVB device doesn't have ROM for MAC address.
>> @@ -257,21 +239,10 @@ static void ravb_rx_ring_free(struct net_device *ndev, int q)
>> {
>> struct ravb_private *priv = netdev_priv(ndev);
>> unsigned int ring_size;
>> - unsigned int i;
>>
>> if (!priv->rx_ring[q].raw)
>> return;
>>
>> - for (i = 0; i < priv->num_rx_ring[q]; i++) {
>> - struct ravb_rx_desc *desc = ravb_rx_get_desc(priv, q, i);
>> -
>> - if (!dma_mapping_error(ndev->dev.parent,
>> - le32_to_cpu(desc->dptr)))
>> - dma_unmap_single(ndev->dev.parent,
>> - le32_to_cpu(desc->dptr),
>> - priv->info->rx_max_frame_size,
>> - DMA_FROM_DEVICE);
>> - }
>> ring_size = priv->info->rx_desc_size * (priv->num_rx_ring[q] + 1);
>> dma_free_coherent(ndev->dev.parent, ring_size, priv->rx_ring[q].raw,
>> priv->rx_desc_dma[q]);
>> @@ -298,13 +269,14 @@ static void ravb_ring_free(struct net_device *ndev, int q)
>> priv->tx_ring[q] = NULL;
>> }
>>
>> - /* Free RX skb ringbuffer */
>> - if (priv->rx_skb[q]) {
>> - for (i = 0; i < priv->num_rx_ring[q]; i++)
>> - dev_kfree_skb(priv->rx_skb[q][i]);
>> + /* Free RX buffers */
>> + for (i = 0; i < priv->num_rx_ring[q]; i++) {
>> + if (priv->rx_buffers[q][i].page)
>> + page_pool_put_page(priv->rx_pool, priv->rx_buffers[q][i].page, 0, true);
>> }
>> - kfree(priv->rx_skb[q]);
>> - priv->rx_skb[q] = NULL;
>> + kfree(priv->rx_buffers[q]);
>> + priv->rx_buffers[q] = NULL;
>> + page_pool_destroy(priv->rx_pool);
>>
>> /* Free aligned TX buffers */
>> kfree(priv->tx_align[q]);
>> @@ -317,35 +289,54 @@ static void ravb_ring_free(struct net_device *ndev, int q)
>> priv->tx_skb[q] = NULL;
>> }
>>
>> +static int
>> +ravb_alloc_rx_buffer(struct net_device *ndev, int q, u32 entry, gfp_t gfp_mask,
>> + __le32 *dptr)
>
> Why not pass the struct ravb_rx_desc instead of a dptr? Then the
> function can deal with the error case and fill in rx_desc->dptr and
> rx_desc->ds_cc directly, making the caller simpler.

Ack.

>
>> +{
>> + struct ravb_private *priv = netdev_priv(ndev);
>> + const struct ravb_hw_info *info = priv->info;
>> + struct ravb_rx_buffer *rx_buff = &priv->rx_buffers[q][entry];
>> + dma_addr_t dma_addr;
>> + unsigned int size;
>> +
>> + size = info->rx_buffer_size;
>> + rx_buff->page = page_pool_alloc(priv->rx_pool, &rx_buff->offset, &size,
>> + gfp_mask);
>> + if (unlikely(!rx_buff->page))
>> + return -ENOMEM;
>> +
>> + dma_addr = page_pool_get_dma_addr(rx_buff->page) + rx_buff->offset;
>> + dma_sync_single_for_device(ndev->dev.parent, dma_addr,
>> + info->rx_buffer_size, DMA_FROM_DEVICE);
>> + *dptr = cpu_to_le32(dma_addr);
>> + return 0;
>> +}
>> +
>> static u32
>> ravb_rx_ring_refill(struct net_device *ndev, int q, u32 count, gfp_t gfp_mask)
>> {
>> struct ravb_private *priv = netdev_priv(ndev);
>> const struct ravb_hw_info *info = priv->info;
>> struct ravb_rx_desc *rx_desc;
>> - dma_addr_t dma_addr;
>> u32 i, entry;
>>
>> for (i = 0; i < count; i++) {
>> entry = (priv->dirty_rx[q] + i) % priv->num_rx_ring[q];
>> rx_desc = ravb_rx_get_desc(priv, q, entry);
>> - rx_desc->ds_cc = cpu_to_le16(info->rx_max_desc_use);
>>
>> - if (!priv->rx_skb[q][entry]) {
>> - priv->rx_skb[q][entry] = ravb_alloc_skb(ndev, info, gfp_mask);
>> - if (!priv->rx_skb[q][entry])
>> - break;
>> - dma_addr = dma_map_single(ndev->dev.parent,
>> - priv->rx_skb[q][entry]->data,
>> - priv->info->rx_max_frame_size,
>> - DMA_FROM_DEVICE);
>> - skb_checksum_none_assert(priv->rx_skb[q][entry]);
>> - /* We just set the data size to 0 for a failed mapping
>> - * which should prevent DMA from happening...
>> - */
>> - if (dma_mapping_error(ndev->dev.parent, dma_addr))
>> + if (!priv->rx_buffers[q][entry].page) {
>> + if (unlikely(ravb_alloc_rx_buffer(ndev, q, entry, gfp_mask,
>> + &rx_desc->dptr))) {
>> + /* We just set the data size to 0 for a failed mapping
>> + * which should prevent DMA from happening...
>> + */
>> rx_desc->ds_cc = cpu_to_le16(0);
>> - rx_desc->dptr = cpu_to_le32(dma_addr);
>> + break;
>> + }
>> +
>> + rx_desc->ds_cc = cpu_to_le16(info->rx_buffer_size
>> + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
>> + - ETH_FCS_LEN + sizeof(__sum16));
>
> Can a comment be added to explain why we subtract and add things to the size?

Ack.

I'll address these in v4.

--
Paul Barker

Attachment: OpenPGP_0x27F4B3459F002257.asc
Description: OpenPGP public key

Attachment: OpenPGP_signature.asc
Description: OpenPGP digital signature