[PATCH 2/3] net-next/hinic: performance improvement for HiNIC

From: Xue Chaojing
Date: Thu Nov 15 2018 - 23:00:18 EST


In order to improve performance, this patch adds rx checksum offload to
the HiNIC driver and converts the divisions in the work-queue index
calculations into shifts by precomputed ilog2() values. Performance
testing (iperf) shows more than 95% improvement in TCP streams.
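
The work-queue changes rely on the WQEBB size and the number of WQEBBs
per page being powers of two, so the multiplications and divisions in
the index arithmetic can be replaced by shifts once ilog2() of those
values has been cached in the new wq fields. A minimal userspace sketch
of the divide-to-shift equivalence (illustrative only, not driver code;
the names merely mirror the driver's fields):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t wqebb_size = 64;          /* must be a power of two */
            uint32_t wqebb_size_shift = 6;     /* ilog2(64) */
            uint32_t wqe_size = 192;           /* size of one WQE in bytes */

            /* old style: round up to a WQEBB boundary, then divide */
            uint32_t aligned = (wqe_size + wqebb_size - 1) & ~(wqebb_size - 1);
            uint32_t num_wqebbs_div = aligned / wqebb_size;

            /* new style: same rounding, then shift by the cached ilog2() */
            uint32_t num_wqebbs_shift = aligned >> wqebb_size_shift;

            assert(num_wqebbs_div == num_wqebbs_shift);
            printf("wqebbs per WQE: %u\n", num_wqebbs_shift);
            return 0;
    }

The shift form avoids integer divides on the transmit and receive fast
paths.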

Signed-off-by: Xue Chaojing <xuechaojing@xxxxxxxxxx>
---
.../net/ethernet/huawei/hinic/hinic_hw_dev.h | 2 +
.../net/ethernet/huawei/hinic/hinic_hw_wq.c | 43 +++++++++++++------
.../net/ethernet/huawei/hinic/hinic_hw_wq.h | 3 +-
.../net/ethernet/huawei/hinic/hinic_hw_wqe.h | 4 ++
.../net/ethernet/huawei/hinic/hinic_main.c | 8 +++-
.../net/ethernet/huawei/hinic/hinic_port.c | 28 ++++++++++++
.../net/ethernet/huawei/hinic/hinic_port.h | 10 +++++
drivers/net/ethernet/huawei/hinic/hinic_rx.c | 24 +++++++++++
drivers/net/ethernet/huawei/hinic/hinic_rx.h | 5 +++
9 files changed, 112 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
index 097b5502603f..d1a7d2522d82 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
@@ -50,6 +50,8 @@ enum hinic_port_cmd {

HINIC_PORT_CMD_GET_LINK_STATE = 24,

+ HINIC_PORT_CMD_SET_RX_CSUM = 26,
+
HINIC_PORT_CMD_SET_PORT_STATE = 41,

HINIC_PORT_CMD_FWCTXT_INIT = 69,
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c
index f92f1bf3901a..34859502c932 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c
@@ -74,12 +74,6 @@
((void *)((cmdq_pages)->shadow_page_vaddr) \
+ (wq)->block_idx * CMDQ_BLOCK_SIZE)

-#define WQE_PAGE_OFF(wq, idx) (((idx) & ((wq)->num_wqebbs_per_page - 1)) * \
- (wq)->wqebb_size)
-
-#define WQE_PAGE_NUM(wq, idx) (((idx) / ((wq)->num_wqebbs_per_page)) \
- & ((wq)->num_q_pages - 1))
-
#define WQ_PAGE_ADDR(wq, idx) \
((wq)->shadow_block_vaddr[WQE_PAGE_NUM(wq, idx)])

@@ -93,6 +87,17 @@
(((unsigned long)(wqe) - (unsigned long)(wq)->shadow_wqe) \
/ (wq)->max_wqe_size)

+static inline int WQE_PAGE_OFF(struct hinic_wq *wq, u16 idx)
+{
+ return (((idx) & ((wq)->num_wqebbs_per_page - 1))
+ << (wq)->wqebb_size_shift);
+}
+
+static inline int WQE_PAGE_NUM(struct hinic_wq *wq, u16 idx)
+{
+ return (((idx) >> ((wq)->wqebbs_per_page_shift))
+ & ((wq)->num_q_pages - 1));
+}
/**
* queue_alloc_page - allocate page for Queue
* @hwif: HW interface for allocating DMA
@@ -513,6 +518,7 @@ int hinic_wq_allocate(struct hinic_wqs *wqs, struct hinic_wq *wq,
struct hinic_hwif *hwif = wqs->hwif;
struct pci_dev *pdev = hwif->pdev;
u16 num_wqebbs_per_page;
+ u16 wqebb_size_shift;
int err;

if (wqebb_size == 0) {
@@ -530,7 +536,9 @@ int hinic_wq_allocate(struct hinic_wqs *wqs, struct hinic_wq *wq,
return -EINVAL;
}

- num_wqebbs_per_page = ALIGN(wq_page_size, wqebb_size) / wqebb_size;
+ wqebb_size_shift = ilog2(wqebb_size);
+ num_wqebbs_per_page = ALIGN(wq_page_size, wqebb_size)
+ >> wqebb_size_shift;

if (num_wqebbs_per_page & (num_wqebbs_per_page - 1)) {
dev_err(&pdev->dev, "num wqebbs per page must be power of 2\n");
@@ -550,7 +558,8 @@ int hinic_wq_allocate(struct hinic_wqs *wqs, struct hinic_wq *wq,
wq->q_depth = q_depth;
wq->max_wqe_size = max_wqe_size;
wq->num_wqebbs_per_page = num_wqebbs_per_page;
-
+ wq->wqebbs_per_page_shift = ilog2(num_wqebbs_per_page);
+ wq->wqebb_size_shift = wqebb_size_shift;
wq->block_vaddr = WQ_BASE_VADDR(wqs, wq);
wq->shadow_block_vaddr = WQ_BASE_ADDR(wqs, wq);
wq->block_paddr = WQ_BASE_PADDR(wqs, wq);
@@ -604,7 +613,9 @@ int hinic_wqs_cmdq_alloc(struct hinic_cmdq_pages *cmdq_pages,
u16 q_depth, u16 max_wqe_size)
{
struct pci_dev *pdev = hwif->pdev;
+ u16 num_wqebbs_per_page_shift;
u16 num_wqebbs_per_page;
+ u16 wqebb_size_shift;
int i, j, err = -ENOMEM;

if (wqebb_size == 0) {
@@ -622,7 +633,9 @@ int hinic_wqs_cmdq_alloc(struct hinic_cmdq_pages *cmdq_pages,
return -EINVAL;
}

- num_wqebbs_per_page = ALIGN(wq_page_size, wqebb_size) / wqebb_size;
+ wqebb_size_shift = ilog2(wqebb_size);
+ num_wqebbs_per_page = ALIGN(wq_page_size, wqebb_size)
+ >> wqebb_size_shift;

if (num_wqebbs_per_page & (num_wqebbs_per_page - 1)) {
dev_err(&pdev->dev, "num wqebbs per page must be power of 2\n");
@@ -636,6 +649,7 @@ int hinic_wqs_cmdq_alloc(struct hinic_cmdq_pages *cmdq_pages,
dev_err(&pdev->dev, "Failed to allocate CMDQ page\n");
return err;
}
+ num_wqebbs_per_page_shift = ilog2(num_wqebbs_per_page);

for (i = 0; i < cmdq_blocks; i++) {
wq[i].hwif = hwif;
@@ -647,7 +661,8 @@ int hinic_wqs_cmdq_alloc(struct hinic_cmdq_pages *cmdq_pages,
wq[i].q_depth = q_depth;
wq[i].max_wqe_size = max_wqe_size;
wq[i].num_wqebbs_per_page = num_wqebbs_per_page;
-
+ wq[i].wqebbs_per_page_shift = num_wqebbs_per_page_shift;
+ wq[i].wqebb_size_shift = wqebb_size_shift;
wq[i].block_vaddr = CMDQ_BASE_VADDR(cmdq_pages, &wq[i]);
wq[i].shadow_block_vaddr = CMDQ_BASE_ADDR(cmdq_pages, &wq[i]);
wq[i].block_paddr = CMDQ_BASE_PADDR(cmdq_pages, &wq[i]);
@@ -741,7 +756,7 @@ struct hinic_hw_wqe *hinic_get_wqe(struct hinic_wq *wq, unsigned int wqe_size,

*prod_idx = MASKED_WQE_IDX(wq, atomic_read(&wq->prod_idx));

- num_wqebbs = ALIGN(wqe_size, wq->wqebb_size) / wq->wqebb_size;
+ num_wqebbs = ALIGN(wqe_size, wq->wqebb_size) >> wq->wqebb_size_shift;

if (atomic_sub_return(num_wqebbs, &wq->delta) <= 0) {
atomic_add(num_wqebbs, &wq->delta);
@@ -795,7 +810,8 @@ void hinic_return_wqe(struct hinic_wq *wq, unsigned int wqe_size)
**/
void hinic_put_wqe(struct hinic_wq *wq, unsigned int wqe_size)
{
- int num_wqebbs = ALIGN(wqe_size, wq->wqebb_size) / wq->wqebb_size;
+ int num_wqebbs = ALIGN(wqe_size, wq->wqebb_size)
+ >> wq->wqebb_size_shift;

atomic_add(num_wqebbs, &wq->cons_idx);

@@ -813,7 +829,8 @@ void hinic_put_wqe(struct hinic_wq *wq, unsigned int wqe_size)
struct hinic_hw_wqe *hinic_read_wqe(struct hinic_wq *wq, unsigned int wqe_size,
u16 *cons_idx)
{
- int num_wqebbs = ALIGN(wqe_size, wq->wqebb_size) / wq->wqebb_size;
+ int num_wqebbs = ALIGN(wqe_size, wq->wqebb_size)
+ >> wq->wqebb_size_shift;
u16 curr_cons_idx, end_cons_idx;
int curr_pg, end_pg;

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h
index 9b66545ba563..0a936cd6709b 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h
@@ -39,7 +39,8 @@ struct hinic_wq {
u16 q_depth;
u16 max_wqe_size;
u16 num_wqebbs_per_page;
-
+ u16 wqebbs_per_page_shift;
+ u16 wqebb_size_shift;
/* The addresses are 64 bit in the HW */
u64 block_paddr;
void **shadow_block_vaddr;
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_wqe.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_wqe.h
index 9754d6ed5f4a..138941527872 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_wqe.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_wqe.h
@@ -170,6 +170,10 @@

#define HINIC_RQ_CQE_STATUS_RXDONE_MASK 0x1

+#define HINIC_RQ_CQE_STATUS_CSUM_ERR_SHIFT 0
+
+#define HINIC_RQ_CQE_STATUS_CSUM_ERR_MASK 0xFFFFU
+
#define HINIC_RQ_CQE_STATUS_GET(val, member) \
(((val) >> HINIC_RQ_CQE_STATUS_##member##_SHIFT) & \
HINIC_RQ_CQE_STATUS_##member##_MASK)
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c
index cefd441bb117..6d48dc62a44b 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_main.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c
@@ -723,6 +723,7 @@ static void set_rx_mode(struct work_struct *work)
{
struct hinic_rx_mode_work *rx_mode_work = work_to_rx_mode_work(work);
struct hinic_dev *nic_dev = rx_mode_work_to_nic_dev(rx_mode_work);
+ struct netdev_hw_addr *ha;

netif_info(nic_dev, drv, nic_dev->netdev, "set rx mode work\n");

@@ -806,7 +807,8 @@ static const struct net_device_ops hinic_netdev_ops = {
static void netdev_features_init(struct net_device *netdev)
{
netdev->hw_features = NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_IP_CSUM |
- NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO6;
+ NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO6 |
+ NETIF_F_RXCSUM;

netdev->vlan_features = netdev->hw_features;

@@ -869,12 +871,16 @@ static int set_features(struct hinic_dev *nic_dev,
netdev_features_t features, bool force_change)
{
netdev_features_t changed = force_change ? ~0 : pre_features ^ features;
+ u32 csum_en = HINIC_RX_CSUM_OFFLOAD_EN;
int err = 0;

if (changed & NETIF_F_TSO)
err = hinic_port_set_tso(nic_dev, (features & NETIF_F_TSO) ?
HINIC_TSO_ENABLE : HINIC_TSO_DISABLE);

+ if (changed & NETIF_F_RXCSUM)
+ err = hinic_set_rx_csum_offload(nic_dev, csum_en);
+
return err;
}

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_port.c b/drivers/net/ethernet/huawei/hinic/hinic_port.c
index 7575a7d3bd9f..e9f76e904610 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_port.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_port.c
@@ -409,3 +409,31 @@ int hinic_port_set_tso(struct hinic_dev *nic_dev, enum hinic_tso_state state)

return 0;
}
+
+int hinic_set_rx_csum_offload(struct hinic_dev *nic_dev, u32 en)
+{
+ struct hinic_checksum_offload rx_csum_cfg = {0};
+ struct hinic_hwdev *hwdev = nic_dev->hwdev;
+ struct hinic_hwif *hwif = hwdev->hwif;
+ struct pci_dev *pdev = hwif->pdev;
+ u16 out_size;
+ int err;
+
+ if (!hwdev)
+ return -EINVAL;
+
+ rx_csum_cfg.func_id = HINIC_HWIF_FUNC_IDX(hwif);
+ rx_csum_cfg.rx_csum_offload = en;
+
+ err = hinic_port_msg_cmd(hwdev, HINIC_PORT_CMD_SET_RX_CSUM,
+ &rx_csum_cfg, sizeof(rx_csum_cfg),
+ &rx_csum_cfg, &out_size);
+ if (err || !out_size || rx_csum_cfg.status) {
+ dev_err(&pdev->dev,
+ "Failed to set rx csum offload, ret = %d\n",
+ rx_csum_cfg.status);
+ return -EINVAL;
+ }
+
+ return 0;
+}
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_port.h b/drivers/net/ethernet/huawei/hinic/hinic_port.h
index f6e3220fe28f..02d896eed455 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_port.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_port.h
@@ -183,6 +183,15 @@ struct hinic_tso_config {
u8 resv2[3];
};

+struct hinic_checksum_offload {
+ u8 status;
+ u8 version;
+ u8 rsvd0[6];
+
+ u16 func_id;
+ u16 rsvd1;
+ u32 rx_csum_offload;
+};
int hinic_port_add_mac(struct hinic_dev *nic_dev, const u8 *addr,
u16 vlan_id);

@@ -213,4 +222,5 @@ int hinic_port_get_cap(struct hinic_dev *nic_dev,

int hinic_port_set_tso(struct hinic_dev *nic_dev, enum hinic_tso_state state);

+int hinic_set_rx_csum_offload(struct hinic_dev *nic_dev, u32 en);
#endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
index 4c0f7eda1166..93e8f207f6da 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
@@ -89,6 +89,28 @@ static void rxq_stats_init(struct hinic_rxq *rxq)
hinic_rxq_clean_stats(rxq);
}

+static void rx_csum(struct hinic_rxq *rxq, u16 cons_idx,
+ struct sk_buff *skb)
+{
+ struct net_device *netdev = rxq->netdev;
+ struct hinic_rq_cqe *cqe;
+ struct hinic_rq *rq;
+ u32 csum_err;
+ u32 status;
+
+ rq = rxq->rq;
+ cqe = rq->cqe[cons_idx];
+ status = be32_to_cpu(cqe->status);
+ csum_err = HINIC_RQ_CQE_STATUS_GET(status, CSUM_ERR);
+
+ if (!(netdev->features & NETIF_F_RXCSUM))
+ return;
+
+ if (!csum_err)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ else
+ skb->ip_summed = CHECKSUM_NONE;
+}
/**
* rx_alloc_skb - allocate skb and map it to dma address
* @rxq: rx queue
@@ -328,6 +350,8 @@ static int rxq_recv(struct hinic_rxq *rxq, int budget)

rx_unmap_skb(rxq, hinic_sge_to_dma(&sge));

+ rx_csum(rxq, ci, skb);
+
prefetch(skb->data);

pkt_len = sge.len;
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.h b/drivers/net/ethernet/huawei/hinic/hinic_rx.h
index 27c9af4b1c12..7ab57d6ff83f 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.h
@@ -23,6 +23,11 @@

#include "hinic_hw_qp.h"

+#define BIT(nr) (1UL << (nr))
+#define HINIC_RX_CSUM_OFFLOAD_EN 0xFFF
+#define HINIC_RX_CSUM_HW_CHECK_NONE BIT(7)
+#define HINIC_RX_CSUM_IPSU_OTHER_ERR BIT(8)
+
struct hinic_rxq_stats {
u64 pkts;
u64 bytes;
--
2.17.1