Re: [PATCH 16/17] fpga: dfl: fme: add global error reporting support

From: Alan Tull
Date: Tue Apr 09 2019 - 17:36:07 EST


On Sun, Mar 24, 2019 at 10:24 PM Wu Hao <hao.wu@xxxxxxxxx> wrote:

Hi Hao,

>
> This patch adds support for global error reporting for FPGA
> Management Engine (FME), it introduces sysfs interfaces to
> report different error detected by the hardware, and allow
> user to clear errors or inject error for testing purpose.
>
> Signed-off-by: Luwei Kang <luwei.kang@xxxxxxxxx>
> Signed-off-by: Ananda Ravuri <ananda.ravuri@xxxxxxxxx>
> Signed-off-by: Xu Yilun <yilun.xu@xxxxxxxxx>
> Signed-off-by: Wu Hao <hao.wu@xxxxxxxxx>
> ---
> Documentation/ABI/testing/sysfs-platform-dfl-fme | 58 ++++
> drivers/fpga/Makefile | 2 +-
> drivers/fpga/dfl-fme-error.c | 390 +++++++++++++++++++++++
> drivers/fpga/dfl-fme-main.c | 4 +
> drivers/fpga/dfl-fme.h | 2 +
> drivers/fpga/dfl.h | 2 +
> 6 files changed, 457 insertions(+), 1 deletion(-)
> create mode 100644 drivers/fpga/dfl-fme-error.c
>
> diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> index 4b6448f..38f9cdd 100644
> --- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
> +++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> @@ -156,3 +156,61 @@ KernelVersion: 5.2
> Contact: Wu Hao <hao.wu@xxxxxxxxx>
> Description: Read-only. Read this file to get power limit for fpga, it
> is only valid for integrated solution.
> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. Read this file to get errors detected by hardware.
> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. Read this file to get the first error detected by
> + hardware.
> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. Read this file to get the second error detected by
> + hardware.
> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Write-only. Write error code to this file to clear errors. If
> + the input error code doesn't match, it returns -EBUSY.

As with the afu errors patch, seems like -EINVAL would be better.

> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. It returns errors detected on pcie0 link.
> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. It returns errors detected on pcie1 link.
> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. It returns non-fatal errors detected.
> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-only. It returns catastrophic and fatal errors detected.
> +
> +What: /sys/bus/platform/devices/dfl-fme.0/errors/inject_error
> +Date: March 2019
> +KernelVersion: 5.2
> +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> +Description: Read-Write. Write this file to inject errors for testing
> + purpose. Read this file to check errors injected.
> diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> index f1f0af7..1a9fa3d 100644
> --- a/drivers/fpga/Makefile
> +++ b/drivers/fpga/Makefile
> @@ -38,7 +38,7 @@ obj-$(CONFIG_FPGA_DFL_FME_BRIDGE) += dfl-fme-br.o
> obj-$(CONFIG_FPGA_DFL_FME_REGION) += dfl-fme-region.o
> obj-$(CONFIG_FPGA_DFL_AFU) += dfl-afu.o
>
> -dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
> +dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
> dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
> dfl-afu-objs += dfl-afu-error.o
>
> diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c
> new file mode 100644
> index 0000000..f2bd5f8
> --- /dev/null
> +++ b/drivers/fpga/dfl-fme-error.c
> @@ -0,0 +1,390 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Driver for FPGA Management Engine Error Management
> + *
> + * Copyright 2019 Intel Corporation, Inc.
> + *
> + * Authors:
> + * Kang Luwei <luwei.kang@xxxxxxxxx>
> + * Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx>
> + * Wu Hao <hao.wu@xxxxxxxxx>
> + * Joseph Grecco <joe.grecco@xxxxxxxxx>
> + * Enno Luebbers <enno.luebbers@xxxxxxxxx>
> + * Tim Whisonant <tim.whisonant@xxxxxxxxx>
> + * Ananda Ravuri <ananda.ravuri@xxxxxxxxx>
> + * Mitchel, Henry <henry.mitchel@xxxxxxxxx>
> + */
> +
> +#include <linux/uaccess.h>
> +
> +#include "dfl.h"
> +#include "dfl-fme.h"
> +
> +#define FME_ERROR_MASK 0x8
> +#define FME_ERROR 0x10
> +#define MBP_ERROR BIT_ULL(6)
> +#define PCIE0_ERROR_MASK 0x18
> +#define PCIE0_ERROR 0x20
> +#define PCIE1_ERROR_MASK 0x28
> +#define PCIE1_ERROR 0x30
> +#define FME_FIRST_ERROR 0x38
> +#define FME_NEXT_ERROR 0x40
> +#define RAS_NONFAT_ERROR_MASK 0x48
> +#define RAS_NONFAT_ERROR 0x50
> +#define RAS_CATFAT_ERROR_MASK 0x58
> +#define RAS_CATFAT_ERROR 0x60
> +#define RAS_ERROR_INJECT 0x68
> +#define INJECT_ERROR_MASK GENMASK_ULL(2, 0)
> +
> +static ssize_t errors_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> + (unsigned long long)readq(base + FME_ERROR));
> +}
> +static DEVICE_ATTR_RO(errors);
> +
> +static ssize_t first_error_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> + (unsigned long long)readq(base + FME_FIRST_ERROR));
> +}
> +static DEVICE_ATTR_RO(first_error);
> +
> +static ssize_t next_error_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> + (unsigned long long)readq(base + FME_NEXT_ERROR));
> +}
> +static DEVICE_ATTR_RO(next_error);
> +
> +static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> + struct device *err_dev = dev->parent;
> + struct dfl_feature *feature;
> + void __iomem *base;
> + u64 v, val;
> + int ret;
> +
> + ret = kstrtou64(buf, 0, &val);
> + if (ret)
> + return ret;

from kstrtox.c:
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.

Your sysfs doc could be updated to explain these return codes.

> +
> + feature = dfl_get_feature_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> + base = feature->ioaddr;
> +
> + mutex_lock(&pdata->lock);
> + writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK);
> +
> + v = readq(base + FME_ERROR);
> + if (val != v) {
> + ret = -EINVAL;

Oh wait, that's what I thought it should be ;) so the doc just needs to change.

> + goto done;

It would be easy to avoid using 'goto' here.

> + }
> +
> + writeq(v, base + FME_ERROR);
> + v = readq(base + FME_FIRST_ERROR);
> + writeq(v, base + FME_FIRST_ERROR);
> + v = readq(base + FME_NEXT_ERROR);
> + writeq(v, base + FME_NEXT_ERROR);
> +
> +done:
> + /* Workaround: disable MBP_ERROR if feature revision is 0 */
> + writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> + base + FME_ERROR_MASK);
> + mutex_unlock(&pdata->lock);
> + return ret ? ret : count;
> +}
> +static DEVICE_ATTR_WO(clear);
> +
> +static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + return scnprintf(buf, PAGE_SIZE, "%u\n", dfl_feature_revision(base));
> +}
> +static DEVICE_ATTR_RO(revision);

The revision attr to be documented in the sysfs doc.

> +
> +static ssize_t pcie0_errors_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> + (unsigned long long)readq(base + PCIE0_ERROR));
> +}
> +
> +static ssize_t pcie0_errors_store(struct device *dev,

The sysfs doc shows this as read-only.

> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> + u64 v, val;
> + int ret;
> +
> + ret = kstrtou64(buf, 0, &val);
> + if (ret)
> + return ret;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + mutex_lock(&pdata->lock);
> + writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK);
> +
> + v = readq(base + PCIE0_ERROR);
> + if (val != v)
> + ret = -EBUSY;
> + else
> + writeq(v, base + PCIE0_ERROR);
> +
> + writeq(0ULL, base + PCIE0_ERROR_MASK);
> + mutex_unlock(&pdata->lock);
> + return ret ? ret : count;
> +}
> +static DEVICE_ATTR_RW(pcie0_errors);
> +
> +static ssize_t pcie1_errors_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> + (unsigned long long)readq(base + PCIE1_ERROR));
> +}
> +
> +static ssize_t pcie1_errors_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)

Same here, documentation needs to show as R/W and explain what happens
when written.

> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> + u64 v, val;
> + int ret;
> +
> + ret = kstrtou64(buf, 0, &val);
> + if (ret)
> + return ret;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + mutex_lock(&pdata->lock);
> + writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK);
> +
> + v = readq(base + PCIE1_ERROR);
> + if (val != v)
> + ret = -EBUSY;
> + else
> + writeq(v, base + PCIE1_ERROR);
> +
> + writeq(0ULL, base + PCIE1_ERROR_MASK);
> + mutex_unlock(&pdata->lock);
> + return ret ? ret : count;
> +}
> +static DEVICE_ATTR_RW(pcie1_errors);
> +
> +static ssize_t nonfatal_errors_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> + (unsigned long long)readq(base + RAS_NONFAT_ERROR));
> +}
> +static DEVICE_ATTR_RO(nonfatal_errors);
> +
> +static ssize_t catfatal_errors_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> + (unsigned long long)readq(base + RAS_CATFAT_ERROR));
> +}
> +static DEVICE_ATTR_RO(catfatal_errors);
> +
> +static ssize_t inject_error_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> + u64 v;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + v = readq(base + RAS_ERROR_INJECT);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> + (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v));
> +}
> +
> +static ssize_t inject_error_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> + struct device *err_dev = dev->parent;
> + void __iomem *base;
> + u8 inject_error;
> + int ret;
> + u64 v;
> +
> + ret = kstrtou8(buf, 0, &inject_error);
> + if (ret)
> + return ret;
> +
> + if (inject_error & ~INJECT_ERROR_MASK)
> + return -EINVAL;
> +
> + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> + mutex_lock(&pdata->lock);
> + v = readq(base + RAS_ERROR_INJECT);
> + v &= ~INJECT_ERROR_MASK;
> + v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error);
> + writeq(v, base + RAS_ERROR_INJECT);
> + mutex_unlock(&pdata->lock);
> +
> + return count;
> +}
> +static DEVICE_ATTR_RW(inject_error);
> +
> +static struct attribute *fme_errors_attrs[] = {
> + &dev_attr_errors.attr,
> + &dev_attr_first_error.attr,
> + &dev_attr_next_error.attr,
> + &dev_attr_clear.attr,
> + NULL,
> +};
> +
> +static struct attribute_group fme_errors_attr_group = {
> + .attrs = fme_errors_attrs,
> + .name = "fme-errors",
> +};
> +
> +static struct attribute *errors_attrs[] = {
> + &dev_attr_revision.attr,
> + &dev_attr_pcie0_errors.attr,
> + &dev_attr_pcie1_errors.attr,
> + &dev_attr_nonfatal_errors.attr,
> + &dev_attr_catfatal_errors.attr,
> + &dev_attr_inject_error.attr,
> + NULL,
> +};
> +
> +static struct attribute_group errors_attr_group = {
> + .attrs = errors_attrs,
> +};
> +
> +static const struct attribute_group *error_groups[] = {
> + &fme_errors_attr_group,
> + &errors_attr_group,
> + NULL
> +};
> +
> +static void fme_error_enable(struct dfl_feature *feature)
> +{
> + void __iomem *base = feature->ioaddr;
> +
> + /* Workaround: disable MBP_ERROR if revision is 0 */
> + writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> + base + FME_ERROR_MASK);
> + writeq(0ULL, base + PCIE0_ERROR_MASK);
> + writeq(0ULL, base + PCIE1_ERROR_MASK);
> + writeq(0ULL, base + RAS_NONFAT_ERROR_MASK);
> + writeq(0ULL, base + RAS_CATFAT_ERROR_MASK);
> +}
> +
> +static void err_dev_release(struct device *dev)
> +{
> + kfree(dev);
> +}
> +
> +static int fme_global_err_init(struct platform_device *pdev,
> + struct dfl_feature *feature)
> +{
> + struct device *dev;
> + int ret = 0;
> +
> + dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> + if (!dev)
> + return -ENOMEM;
> +
> + dev->parent = &pdev->dev;
> + dev->release = err_dev_release;
> + dev_set_name(dev, "errors");
> +
> + fme_error_enable(feature);
> +
> + ret = device_register(dev);
> + if (ret) {
> + put_device(dev);
> + return ret;
> + }
> +
> + ret = sysfs_create_groups(&dev->kobj, error_groups);
> + if (ret) {
> + device_unregister(dev);
> + return ret;
> + }
> +
> + feature->priv = dev;
> +
> + return ret;
> +}
> +
> +static void fme_global_err_uinit(struct platform_device *pdev,
> + struct dfl_feature *feature)
> +{
> + struct device *dev = feature->priv;
> +
> + sysfs_remove_groups(&dev->kobj, error_groups);
> + device_unregister(dev);
> +}
> +
> +const struct dfl_feature_id fme_global_err_id_table[] = {
> + {.id = FME_FEATURE_ID_GLOBAL_ERR,},
> + {0,}
> +};
> +
> +const struct dfl_feature_ops fme_global_err_ops = {
> + .init = fme_global_err_init,
> + .uinit = fme_global_err_uinit,
> +};
> diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
> index dafa6580..76cb112 100644
> --- a/drivers/fpga/dfl-fme-main.c
> +++ b/drivers/fpga/dfl-fme-main.c
> @@ -686,6 +686,10 @@ static struct dfl_feature_driver fme_feature_drvs[] = {
> .ops = &fme_power_mgmt_ops,
> },
> {
> + .id_table = fme_global_err_id_table,
> + .ops = &fme_global_err_ops,
> + },
> + {
> .ops = NULL,
> },
> };
> diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
> index 7a021c4..5fbe3f5 100644
> --- a/drivers/fpga/dfl-fme.h
> +++ b/drivers/fpga/dfl-fme.h
> @@ -37,5 +37,7 @@ struct dfl_fme {
>
> extern const struct dfl_feature_ops fme_pr_mgmt_ops;
> extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
> +extern const struct dfl_feature_ops fme_global_err_ops;
> +extern const struct dfl_feature_id fme_global_err_id_table[];
>
> #endif /* __DFL_FME_H */
> diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
> index fbc57f0..6c32080 100644
> --- a/drivers/fpga/dfl.h
> +++ b/drivers/fpga/dfl.h
> @@ -197,12 +197,14 @@ struct dfl_feature_driver {
> * feature dev (platform device)'s reources.
> * @ioaddr: mapped mmio resource address.
> * @ops: ops of this sub feature.
> + * @priv: priv data of this feature.
> */
> struct dfl_feature {
> u64 id;
> int resource_index;
> void __iomem *ioaddr;
> const struct dfl_feature_ops *ops;
> + void *priv;
> };
>
> #define DEV_STATUS_IN_USE 0
> --
> 2.7.4
>

Thanks,
Alan