Re: [PATCH V2 net-next 2/2] net: hns3: add vf fault detect support

From: Simon Horman
Date: Mon Oct 09 2023 - 09:42:36 EST


+ Leon

On Sat, Oct 07, 2023 at 11:12:15AM +0800, Jijie Shao wrote:
> From: Jie Wang <wangjie125@xxxxxxxxxx>
>
> Currently the hns3 driver supports the VF fault detect feature. Several RAS
> errors caused by VF resources don't need a PF function reset for recovery.
> The driver only needs to reset the specified VF.
>
> So this patch adds a new process to the RAS module. The new process gets
> detailed information about the RAS error and takes the most appropriate
> measures based on this accurate information.
>
> Signed-off-by: Jie Wang <wangjie125@xxxxxxxxxx>
> Signed-off-by: Jijie Shao <shaojijie@xxxxxxxxxx>
> ---
> changeLog:
> v1 -> v2:
> - fix the wrong use of vf recovery notify interface
> - add BUILD_BUG_ON to guarantee macros
> - optimise hclge_handle_vf_queue_err_ras for unsupported firmware
> v1: https://lore.kernel.org/netdev/20230113020829.48451-1-lanhao@xxxxxxxxxx/

Hi Leon,

I believe you reviewed v1 of this back in January and February.
Could you find some time to look at v2?

> ---
> drivers/net/ethernet/hisilicon/hns3/hnae3.h | 1 +
> .../hns3/hns3_common/hclge_comm_cmd.h | 1 +
> .../hisilicon/hns3/hns3pf/hclge_err.c | 116 +++++++++++++++++-
> .../hisilicon/hns3/hns3pf/hclge_err.h | 2 +
> .../hisilicon/hns3/hns3pf/hclge_main.c | 3 +-
> .../hisilicon/hns3/hns3pf/hclge_main.h | 2 +
> .../hisilicon/hns3/hns3pf/hclge_mbx.c | 2 +-
> 7 files changed, 120 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
> index 46062106fc6a..d7e175a9cb49 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
> @@ -275,6 +275,7 @@ enum hnae3_reset_type {
> HNAE3_GLOBAL_RESET,
> HNAE3_IMP_RESET,
> HNAE3_NONE_RESET,
> + HNAE3_VF_EXP_RESET,
> HNAE3_MAX_RESET,
> };
>
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h
> index 92e73d44f0e5..533c19d25e4f 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_cmd.h
> @@ -93,6 +93,7 @@ enum hclge_opcode_type {
> HCLGE_OPC_DFX_SSU_REG_2 = 0x004F,
>
> HCLGE_OPC_QUERY_DEV_SPECS = 0x0050,
> + HCLGE_OPC_GET_QUEUE_ERR_VF = 0x0067,
>
> /* MAC command */
> HCLGE_OPC_CONFIG_MAC_MODE = 0x0301,
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
> index 3f35227ef1fa..d63e114f93d0 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
> @@ -1301,10 +1301,12 @@ static const struct hclge_hw_type_id hclge_hw_type_id_st[] = {
> .msg = "tqp_int_ecc_error"
> }, {
> .type_id = PF_ABNORMAL_INT_ERROR,
> - .msg = "pf_abnormal_int_error"
> + .msg = "pf_abnormal_int_error",
> + .cause_by_vf = true
> }, {
> .type_id = MPF_ABNORMAL_INT_ERROR,
> - .msg = "mpf_abnormal_int_error"
> + .msg = "mpf_abnormal_int_error",
> + .cause_by_vf = true
> }, {
> .type_id = COMMON_ERROR,
> .msg = "common_error"
> @@ -2759,7 +2761,7 @@ void hclge_handle_occurred_error(struct hclge_dev *hdev)
> hclge_handle_error_info_log(ae_dev);
> }
>
> -static void
> +static bool
> hclge_handle_error_type_reg_log(struct device *dev,
> struct hclge_mod_err_info *mod_info,
> struct hclge_type_reg_err_info *type_reg_info)
> @@ -2770,6 +2772,7 @@ hclge_handle_error_type_reg_log(struct device *dev,
> u8 mod_id, total_module, type_id, total_type, i, is_ras;
> u8 index_module = MODULE_NONE;
> u8 index_type = NONE_ERROR;
> + bool cause_by_vf = false;
>
> mod_id = mod_info->mod_id;
> type_id = type_reg_info->type_id & HCLGE_ERR_TYPE_MASK;
> @@ -2788,6 +2791,7 @@ hclge_handle_error_type_reg_log(struct device *dev,
> for (i = 0; i < total_type; i++) {
> if (type_id == hclge_hw_type_id_st[i].type_id) {
> index_type = i;
> + cause_by_vf = hclge_hw_type_id_st[i].cause_by_vf;
> break;
> }
> }
> @@ -2805,6 +2809,8 @@ hclge_handle_error_type_reg_log(struct device *dev,
> dev_err(dev, "reg_value:\n");
> for (i = 0; i < type_reg_info->reg_num; i++)
> dev_err(dev, "0x%08x\n", type_reg_info->hclge_reg[i]);
> +
> + return cause_by_vf;
> }
>
> static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
> @@ -2815,6 +2821,7 @@ static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
> struct device *dev = &hdev->pdev->dev;
> struct hclge_mod_err_info *mod_info;
> struct hclge_sum_err_info *sum_info;
> + bool cause_by_vf = false;
> u8 mod_num, err_num, i;
> u32 offset = 0;
>
> @@ -2843,12 +2850,16 @@ static void hclge_handle_error_module_log(struct hnae3_ae_dev *ae_dev,
>
> type_reg_info = (struct hclge_type_reg_err_info *)
> &buf[offset++];
> - hclge_handle_error_type_reg_log(dev, mod_info,
> - type_reg_info);
> + if (hclge_handle_error_type_reg_log(dev, mod_info,
> + type_reg_info))
> + cause_by_vf = true;
>
> offset += type_reg_info->reg_num;
> }
> }
> +
> + if (hnae3_ae_dev_vf_fault_supported(hdev->ae_dev) && cause_by_vf)
> + set_bit(HNAE3_VF_EXP_RESET, &ae_dev->hw_err_reset_req);
> }
>
> static int hclge_query_all_err_bd_num(struct hclge_dev *hdev, u32 *bd_num)
> @@ -2940,3 +2951,98 @@ int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev)
> out:
> return ret;
> }
> +
> +static bool hclge_reset_vf_in_bitmap(struct hclge_dev *hdev,
> + unsigned long *bitmap)
> +{
> + struct hclge_vport *vport;
> + bool exist_set = false;
> + int func_id;
> + int ret;
> +
> + func_id = find_first_bit(bitmap, HCLGE_VPORT_NUM);
> + if (func_id == PF_VPORT_ID)
> + return false;
> +
> + while (func_id != HCLGE_VPORT_NUM) {
> + vport = hclge_get_vf_vport(hdev,
> + func_id - HCLGE_VF_VPORT_START_NUM);
> + if (!vport) {
> + dev_err(&hdev->pdev->dev, "invalid func id(%d)\n",
> + func_id);
> + return false;
> + }
> +
> + dev_info(&hdev->pdev->dev, "do function %d recovery.", func_id);
> +
> + ret = hclge_reset_tqp(&vport->nic);
> + if (ret) {
> + dev_err(&hdev->pdev->dev,
> + "failed to reset tqp, ret = %d.", ret);
> + return false;
> + }
> +
> + ret = hclge_inform_vf_reset(vport, HNAE3_VF_FUNC_RESET);
> + if (ret) {
> + dev_err(&hdev->pdev->dev,
> + "failed to reset func %d, ret = %d.",
> + func_id, ret);
> + return false;
> + }
> +
> + exist_set = true;
> + clear_bit(func_id, bitmap);
> + func_id = find_first_bit(bitmap, HCLGE_VPORT_NUM);
> + }
> +
> + return exist_set;
> +}
> +
> +static void hclge_get_vf_fault_bitmap(struct hclge_desc *desc,
> + unsigned long *bitmap)
> +{
> +#define HCLGE_FIR_FAULT_BYTES 24
> +#define HCLGE_SEC_FAULT_BYTES 8
> +
> + u8 *buff;
> +
> + BUILD_BUG_ON(HCLGE_FIR_FAULT_BYTES + HCLGE_SEC_FAULT_BYTES !=
> + BITS_TO_BYTES(HCLGE_VPORT_NUM));
> +
> + memcpy(bitmap, desc[0].data, HCLGE_FIR_FAULT_BYTES);
> + buff = (u8 *)bitmap + HCLGE_FIR_FAULT_BYTES;
> + memcpy(buff, desc[1].data, HCLGE_SEC_FAULT_BYTES);
> +}
> +
> +int hclge_handle_vf_queue_err_ras(struct hclge_dev *hdev)
> +{
> + unsigned long vf_fault_bitmap[BITS_TO_LONGS(HCLGE_VPORT_NUM)];
> + struct hclge_desc desc[2];
> + bool cause_by_vf = false;
> + int ret;
> +
> + if (!test_and_clear_bit(HNAE3_VF_EXP_RESET,
> + &hdev->ae_dev->hw_err_reset_req) ||
> + !hnae3_ae_dev_vf_fault_supported(hdev->ae_dev))
> + return 0;
> +
> + hclge_comm_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_GET_QUEUE_ERR_VF,
> + true);
> + desc[0].flag |= cpu_to_le16(HCLGE_COMM_CMD_FLAG_NEXT);
> + hclge_comm_cmd_setup_basic_desc(&desc[1], HCLGE_OPC_GET_QUEUE_ERR_VF,
> + true);
> +
> + ret = hclge_comm_cmd_send(&hdev->hw.hw, desc, 2);
> + if (ret) {
> + dev_err(&hdev->pdev->dev,
> + "failed to get vf bitmap, ret = %d.\n", ret);
> + return ret;
> + }
> + hclge_get_vf_fault_bitmap(desc, vf_fault_bitmap);
> +
> + cause_by_vf = hclge_reset_vf_in_bitmap(hdev, vf_fault_bitmap);
> + if (cause_by_vf)
> + hdev->ae_dev->hw_err_reset_req = 0;
> +
> + return 0;
> +}
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
> index 86be6fb32990..68b738affa66 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
> @@ -196,6 +196,7 @@ struct hclge_hw_module_id {
> struct hclge_hw_type_id {
> enum hclge_err_type_list type_id;
> const char *msg;
> + bool cause_by_vf; /* indicate the error may from vf exception */
> };
>
> struct hclge_sum_err_info {
> @@ -228,4 +229,5 @@ int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
> unsigned long *reset_requests);
> int hclge_handle_error_info_log(struct hnae3_ae_dev *ae_dev);
> int hclge_handle_mac_tnl(struct hclge_dev *hdev);
> +int hclge_handle_vf_queue_err_ras(struct hclge_dev *hdev);
> #endif
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> index c42574e29747..99c0576e6383 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
> @@ -3424,7 +3424,7 @@ static int hclge_get_status(struct hnae3_handle *handle)
> return hdev->hw.mac.link;
> }
>
> -static struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf)
> +struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf)
> {
> if (!pci_num_vf(hdev->pdev)) {
> dev_err(&hdev->pdev->dev,
> @@ -4468,6 +4468,7 @@ static void hclge_handle_err_recovery(struct hclge_dev *hdev)
> if (hclge_find_error_source(hdev)) {
> hclge_handle_error_info_log(ae_dev);
> hclge_handle_mac_tnl(hdev);
> + hclge_handle_vf_queue_err_ras(hdev);
> }
>
> hclge_handle_err_reset_request(hdev);
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
> index 7bc2049b723d..02c7aab3546e 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
> @@ -1146,4 +1146,6 @@ int hclge_dbg_dump_rst_info(struct hclge_dev *hdev, char *buf, int len);
> int hclge_push_vf_link_status(struct hclge_vport *vport);
> int hclge_enable_vport_vlan_filter(struct hclge_vport *vport, bool request_en);
> int hclge_mac_update_stats(struct hclge_dev *hdev);
> +struct hclge_vport *hclge_get_vf_vport(struct hclge_dev *hdev, int vf);
> +int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type);
> #endif
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
> index 04ff9bf12185..4b0d07ca2505 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
> @@ -124,7 +124,7 @@ static int hclge_send_mbx_msg(struct hclge_vport *vport, u8 *msg, u16 msg_len,
> return status;
> }
>
> -static int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type)
> +int hclge_inform_vf_reset(struct hclge_vport *vport, u16 reset_type)
> {
> __le16 msg_data;
> u8 dest_vfid;
> --
> 2.30.0
>
>