Re: [PATCH 16/17] fpga: dfl: fme: add global error reporting support

From: Wu Hao
Date: Tue Apr 09 2019 - 21:50:25 EST


On Tue, Apr 09, 2019 at 04:35:25PM -0500, Alan Tull wrote:
> On Sun, Mar 24, 2019 at 10:24 PM Wu Hao <hao.wu@xxxxxxxxx> wrote:
>
> Hi Hao,
>
> >
> > This patch adds support for global error reporting for FPGA
> > Management Engine (FME), it introduces sysfs interfaces to
> > report different error detected by the hardware, and allow
> > user to clear errors or inject error for testing purpose.
> >
> > Signed-off-by: Luwei Kang <luwei.kang@xxxxxxxxx>
> > Signed-off-by: Ananda Ravuri <ananda.ravuri@xxxxxxxxx>
> > Signed-off-by: Xu Yilun <yilun.xu@xxxxxxxxx>
> > Signed-off-by: Wu Hao <hao.wu@xxxxxxxxx>
> > ---
> > Documentation/ABI/testing/sysfs-platform-dfl-fme | 58 ++++
> > drivers/fpga/Makefile | 2 +-
> > drivers/fpga/dfl-fme-error.c | 390 +++++++++++++++++++++++
> > drivers/fpga/dfl-fme-main.c | 4 +
> > drivers/fpga/dfl-fme.h | 2 +
> > drivers/fpga/dfl.h | 2 +
> > 6 files changed, 457 insertions(+), 1 deletion(-)
> > create mode 100644 drivers/fpga/dfl-fme-error.c
> >
> > diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > index 4b6448f..38f9cdd 100644
> > --- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > +++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > @@ -156,3 +156,61 @@ KernelVersion: 5.2
> > Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > Description: Read-only. Read this file to get power limit for fpga, it
> > is only valid for integrated solution.
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Read-only. Read this file to get errors detected by hardware.
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Read-only. Read this file to get the first error detected by
> > + hardware.
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Read-only. Read this file to get the second error detected by
> > + hardware.
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Write-only. Write error code to this file to clear errors. If
> > + the input error code doesn't match, it returns -EBUSY.
>
> As with the afu errors patch, seems like -EINVAL would be better.
>
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Read-only. It returns errors detected on pcie0 link.
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Read-only. It returns errors detected on pcie1 link.
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Read-only. It returns non-fatal errors detected.
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Read-only. It returns catastrophic and fatal errors detected.
> > +
> > +What: /sys/bus/platform/devices/dfl-fme.0/errors/inject_error
> > +Date: March 2019
> > +KernelVersion: 5.2
> > +Contact: Wu Hao <hao.wu@xxxxxxxxx>
> > +Description: Read-Write. Write this file to inject errors for testing
> > + purpose. Read this file to check errors injected.
> > diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> > index f1f0af7..1a9fa3d 100644
> > --- a/drivers/fpga/Makefile
> > +++ b/drivers/fpga/Makefile
> > @@ -38,7 +38,7 @@ obj-$(CONFIG_FPGA_DFL_FME_BRIDGE) += dfl-fme-br.o
> > obj-$(CONFIG_FPGA_DFL_FME_REGION) += dfl-fme-region.o
> > obj-$(CONFIG_FPGA_DFL_AFU) += dfl-afu.o
> >
> > -dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
> > +dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
> > dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
> > dfl-afu-objs += dfl-afu-error.o
> >
> > diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c
> > new file mode 100644
> > index 0000000..f2bd5f8
> > --- /dev/null
> > +++ b/drivers/fpga/dfl-fme-error.c
> > @@ -0,0 +1,390 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Driver for FPGA Management Engine Error Management
> > + *
> > + * Copyright 2019 Intel Corporation, Inc.
> > + *
> > + * Authors:
> > + * Kang Luwei <luwei.kang@xxxxxxxxx>
> > + * Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx>
> > + * Wu Hao <hao.wu@xxxxxxxxx>
> > + * Joseph Grecco <joe.grecco@xxxxxxxxx>
> > + * Enno Luebbers <enno.luebbers@xxxxxxxxx>
> > + * Tim Whisonant <tim.whisonant@xxxxxxxxx>
> > + * Ananda Ravuri <ananda.ravuri@xxxxxxxxx>
> > + * Mitchel, Henry <henry.mitchel@xxxxxxxxx>
> > + */
> > +
> > +#include <linux/uaccess.h>
> > +
> > +#include "dfl.h"
> > +#include "dfl-fme.h"
> > +
> > +#define FME_ERROR_MASK 0x8
> > +#define FME_ERROR 0x10
> > +#define MBP_ERROR BIT_ULL(6)
> > +#define PCIE0_ERROR_MASK 0x18
> > +#define PCIE0_ERROR 0x20
> > +#define PCIE1_ERROR_MASK 0x28
> > +#define PCIE1_ERROR 0x30
> > +#define FME_FIRST_ERROR 0x38
> > +#define FME_NEXT_ERROR 0x40
> > +#define RAS_NONFAT_ERROR_MASK 0x48
> > +#define RAS_NONFAT_ERROR 0x50
> > +#define RAS_CATFAT_ERROR_MASK 0x58
> > +#define RAS_CATFAT_ERROR 0x60
> > +#define RAS_ERROR_INJECT 0x68
> > +#define INJECT_ERROR_MASK GENMASK_ULL(2, 0)
> > +
> > +static ssize_t errors_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > + (unsigned long long)readq(base + FME_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(errors);
> > +
> > +static ssize_t first_error_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > + (unsigned long long)readq(base + FME_FIRST_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(first_error);
> > +
> > +static ssize_t next_error_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > + (unsigned long long)readq(base + FME_NEXT_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(next_error);
> > +
> > +static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
> > + const char *buf, size_t count)
> > +{
> > + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> > + struct device *err_dev = dev->parent;
> > + struct dfl_feature *feature;
> > + void __iomem *base;
> > + u64 v, val;
> > + int ret;
> > +
> > + ret = kstrtou64(buf, 0, &val);
> > + if (ret)
> > + return ret;
>
> from kstrtox.c:
> * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
>
> Your sysfs doc could be updated to explain these return codes.

Sure, will fix this.

>
> > +
> > + feature = dfl_get_feature_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > + base = feature->ioaddr;
> > +
> > + mutex_lock(&pdata->lock);
> > + writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK);
> > +
> > + v = readq(base + FME_ERROR);
> > + if (val != v) {
> > + ret = -EINVAL;
>
> Oh wait, that's what I thought it should be ;) so the doc just needs to change.

Sure, will capture the detailed return values in doc.

>
> > + goto done;
>
> It would be easy to avoid using 'goto' here.

Yes, agree.

>
> > + }
> > +
> > + writeq(v, base + FME_ERROR);
> > + v = readq(base + FME_FIRST_ERROR);
> > + writeq(v, base + FME_FIRST_ERROR);
> > + v = readq(base + FME_NEXT_ERROR);
> > + writeq(v, base + FME_NEXT_ERROR);
> > +
> > +done:
> > + /* Workaround: disable MBP_ERROR if feature revision is 0 */
> > + writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> > + base + FME_ERROR_MASK);
> > + mutex_unlock(&pdata->lock);
> > + return ret ? ret : count;
> > +}
> > +static DEVICE_ATTR_WO(clear);
> > +
> > +static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
> > + char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "%u\n", dfl_feature_revision(base));
> > +}
> > +static DEVICE_ATTR_RO(revision);
>
> The revision attr to be documented in the sysfs doc.

Will fix this.

>
> > +
> > +static ssize_t pcie0_errors_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > + (unsigned long long)readq(base + PCIE0_ERROR));
> > +}
> > +
> > +static ssize_t pcie0_errors_store(struct device *dev,
>
> The sysfs doc shows this as read-only.
>
> > + struct device_attribute *attr,
> > + const char *buf, size_t count)
> > +{
> > + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > + u64 v, val;
> > + int ret;
> > +
> > + ret = kstrtou64(buf, 0, &val);
> > + if (ret)
> > + return ret;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + mutex_lock(&pdata->lock);
> > + writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK);
> > +
> > + v = readq(base + PCIE0_ERROR);
> > + if (val != v)
> > + ret = -EBUSY;
> > + else
> > + writeq(v, base + PCIE0_ERROR);
> > +
> > + writeq(0ULL, base + PCIE0_ERROR_MASK);
> > + mutex_unlock(&pdata->lock);
> > + return ret ? ret : count;
> > +}
> > +static DEVICE_ATTR_RW(pcie0_errors);
> > +
> > +static ssize_t pcie1_errors_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > + (unsigned long long)readq(base + PCIE1_ERROR));
> > +}
> > +
> > +static ssize_t pcie1_errors_store(struct device *dev,
> > + struct device_attribute *attr,
> > + const char *buf, size_t count)
>
> Same here, documentation needs to show as R/W and explain what happens
> when written.

Sorry, will fix them in doc.

>
> > +{
> > + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > + u64 v, val;
> > + int ret;
> > +
> > + ret = kstrtou64(buf, 0, &val);
> > + if (ret)
> > + return ret;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + mutex_lock(&pdata->lock);
> > + writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK);
> > +
> > + v = readq(base + PCIE1_ERROR);
> > + if (val != v)
> > + ret = -EBUSY;
> > + else
> > + writeq(v, base + PCIE1_ERROR);
> > +
> > + writeq(0ULL, base + PCIE1_ERROR_MASK);
> > + mutex_unlock(&pdata->lock);
> > + return ret ? ret : count;
> > +}
> > +static DEVICE_ATTR_RW(pcie1_errors);
> > +
> > +static ssize_t nonfatal_errors_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > + (unsigned long long)readq(base + RAS_NONFAT_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(nonfatal_errors);
> > +
> > +static ssize_t catfatal_errors_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > + (unsigned long long)readq(base + RAS_CATFAT_ERROR));
> > +}
> > +static DEVICE_ATTR_RO(catfatal_errors);
> > +
> > +static ssize_t inject_error_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > + u64 v;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + v = readq(base + RAS_ERROR_INJECT);
> > +
> > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> > + (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v));
> > +}
> > +
> > +static ssize_t inject_error_store(struct device *dev,
> > + struct device_attribute *attr,
> > + const char *buf, size_t count)
> > +{
> > + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> > + struct device *err_dev = dev->parent;
> > + void __iomem *base;
> > + u8 inject_error;
> > + int ret;
> > + u64 v;
> > +
> > + ret = kstrtou8(buf, 0, &inject_error);
> > + if (ret)
> > + return ret;
> > +
> > + if (inject_error & ~INJECT_ERROR_MASK)
> > + return -EINVAL;
> > +
> > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> > +
> > + mutex_lock(&pdata->lock);
> > + v = readq(base + RAS_ERROR_INJECT);
> > + v &= ~INJECT_ERROR_MASK;
> > + v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error);
> > + writeq(v, base + RAS_ERROR_INJECT);
> > + mutex_unlock(&pdata->lock);
> > +
> > + return count;
> > +}
> > +static DEVICE_ATTR_RW(inject_error);
> > +
> > +static struct attribute *fme_errors_attrs[] = {
> > + &dev_attr_errors.attr,
> > + &dev_attr_first_error.attr,
> > + &dev_attr_next_error.attr,
> > + &dev_attr_clear.attr,
> > + NULL,
> > +};
> > +
> > +static struct attribute_group fme_errors_attr_group = {
> > + .attrs = fme_errors_attrs,
> > + .name = "fme-errors",
> > +};
> > +
> > +static struct attribute *errors_attrs[] = {
> > + &dev_attr_revision.attr,
> > + &dev_attr_pcie0_errors.attr,
> > + &dev_attr_pcie1_errors.attr,
> > + &dev_attr_nonfatal_errors.attr,
> > + &dev_attr_catfatal_errors.attr,
> > + &dev_attr_inject_error.attr,
> > + NULL,
> > +};
> > +
> > +static struct attribute_group errors_attr_group = {
> > + .attrs = errors_attrs,
> > +};
> > +
> > +static const struct attribute_group *error_groups[] = {
> > + &fme_errors_attr_group,
> > + &errors_attr_group,
> > + NULL
> > +};
> > +
> > +static void fme_error_enable(struct dfl_feature *feature)
> > +{
> > + void __iomem *base = feature->ioaddr;
> > +
> > + /* Workaround: disable MBP_ERROR if revision is 0 */
> > + writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> > + base + FME_ERROR_MASK);
> > + writeq(0ULL, base + PCIE0_ERROR_MASK);
> > + writeq(0ULL, base + PCIE1_ERROR_MASK);
> > + writeq(0ULL, base + RAS_NONFAT_ERROR_MASK);
> > + writeq(0ULL, base + RAS_CATFAT_ERROR_MASK);
> > +}
> > +
> > +static void err_dev_release(struct device *dev)
> > +{
> > + kfree(dev);
> > +}
> > +
> > +static int fme_global_err_init(struct platform_device *pdev,
> > + struct dfl_feature *feature)
> > +{
> > + struct device *dev;
> > + int ret = 0;
> > +
> > + dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> > + if (!dev)
> > + return -ENOMEM;
> > +
> > + dev->parent = &pdev->dev;
> > + dev->release = err_dev_release;
> > + dev_set_name(dev, "errors");
> > +
> > + fme_error_enable(feature);
> > +
> > + ret = device_register(dev);
> > + if (ret) {
> > + put_device(dev);
> > + return ret;
> > + }
> > +
> > + ret = sysfs_create_groups(&dev->kobj, error_groups);
> > + if (ret) {
> > + device_unregister(dev);
> > + return ret;
> > + }
> > +
> > + feature->priv = dev;
> > +
> > + return ret;
> > +}
> > +
> > +static void fme_global_err_uinit(struct platform_device *pdev,
> > + struct dfl_feature *feature)
> > +{
> > + struct device *dev = feature->priv;
> > +
> > + sysfs_remove_groups(&dev->kobj, error_groups);
> > + device_unregister(dev);
> > +}
> > +
> > +const struct dfl_feature_id fme_global_err_id_table[] = {
> > + {.id = FME_FEATURE_ID_GLOBAL_ERR,},
> > + {0,}
> > +};
> > +
> > +const struct dfl_feature_ops fme_global_err_ops = {
> > + .init = fme_global_err_init,
> > + .uinit = fme_global_err_uinit,
> > +};
> > diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
> > index dafa6580..76cb112 100644
> > --- a/drivers/fpga/dfl-fme-main.c
> > +++ b/drivers/fpga/dfl-fme-main.c
> > @@ -686,6 +686,10 @@ static struct dfl_feature_driver fme_feature_drvs[] = {
> > .ops = &fme_power_mgmt_ops,
> > },
> > {
> > + .id_table = fme_global_err_id_table,
> > + .ops = &fme_global_err_ops,
> > + },
> > + {
> > .ops = NULL,
> > },
> > };
> > diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
> > index 7a021c4..5fbe3f5 100644
> > --- a/drivers/fpga/dfl-fme.h
> > +++ b/drivers/fpga/dfl-fme.h
> > @@ -37,5 +37,7 @@ struct dfl_fme {
> >
> > extern const struct dfl_feature_ops fme_pr_mgmt_ops;
> > extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
> > +extern const struct dfl_feature_ops fme_global_err_ops;
> > +extern const struct dfl_feature_id fme_global_err_id_table[];
> >
> > #endif /* __DFL_FME_H */
> > diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
> > index fbc57f0..6c32080 100644
> > --- a/drivers/fpga/dfl.h
> > +++ b/drivers/fpga/dfl.h
> > @@ -197,12 +197,14 @@ struct dfl_feature_driver {
> > * feature dev (platform device)'s reources.
> > * @ioaddr: mapped mmio resource address.
> > * @ops: ops of this sub feature.
> > + * @priv: priv data of this feature.
> > */
> > struct dfl_feature {
> > u64 id;
> > int resource_index;
> > void __iomem *ioaddr;
> > const struct dfl_feature_ops *ops;
> > + void *priv;
> > };
> >
> > #define DEV_STATUS_IN_USE 0
> > --
> > 2.7.4
> >

Thanks for the review, will fix these things in the next version.

Hao

>
> Thanks,
> Alan