Re: [PATCH v5 3/3] crypto: hisilicon/qm - defining the device isolation strategy

From: yekai(A)
Date: Thu Jul 21 2022 - 04:15:00 EST




On 2022/7/8 15:35, Greg KH wrote:
On Fri, Jul 08, 2022 at 03:08:20PM +0800, Kai Ye wrote:
Define the device isolation strategy by the device driver. The
user configures a frequency value through the uacce interface. If the
slot reset frequency exceeds the configured value within a certain
period of time, the device will not be available in user space.
The time window is one hour. The VF device uses the PF device
isolation strategy. All the hardware errors are processed by the PF
driver. This solution can be used for other drivers.

Signed-off-by: Kai Ye <yekai13@xxxxxxxxxx>
---
drivers/crypto/hisilicon/qm.c | 163 +++++++++++++++++++++++++++++++---
include/linux/hisi_acc_qm.h | 9 ++
2 files changed, 160 insertions(+), 12 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index ad83c194d664..8eb3b790a655 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -417,6 +417,16 @@ struct hisi_qm_resource {
struct list_head list;
};

+/**
+ * struct qm_hw_err - Structure describing the device errors
+ * @list: hardware error list
+ * @timestamp: timestamp when the error occurred
+ */
+struct qm_hw_err {
+ struct list_head list;
+ unsigned long long timestamp;
+};
+
struct hisi_qm_hw_ops {
int (*get_vft)(struct hisi_qm *qm, u32 *base, u32 *number);
void (*qm_db)(struct hisi_qm *qm, u16 qn,
@@ -3410,6 +3420,111 @@ static long hisi_qm_uacce_ioctl(struct uacce_queue *q, unsigned int cmd,
return 0;
}

+/**
+ * qm_hw_err_isolate() - Try to isolate the uacce device with its VFs
+ * according to user's configuration of isolation strategy. Warning: this
+ * API should be called while there the users on this device are suspended
+ * by slot resetting preparation of PCI AER.
+ * @qm: the uacce device
+ */
+static int qm_hw_err_isolate(struct hisi_qm *qm)
+{
+ struct qm_hw_err *err, *tmp, *hw_err;
+ struct qm_err_isolate *isolate;
+ u32 count = 0;
+
+ isolate = &qm->isolate_data;
+
+#define SECONDS_PER_HOUR 3600
+
+ /* All the hw errs are processed by PF driver */
+ if (qm->uacce->is_vf || isolate->is_isolate ||
+ !isolate->hw_err_isolate_hz)
+ return 0;
+
+ hw_err = kzalloc(sizeof(*hw_err), GFP_ATOMIC);

Why atomic? What lock is held here?

An atomic allocation is not required here, so I will use GFP_KERNEL instead.

+ if (!hw_err)
+ return -ENOMEM;
+
+ mutex_lock(&isolate->isolate_lock);
+ hw_err->timestamp = jiffies;
+ list_for_each_entry_safe(err, tmp, &isolate->uacce_hw_errs, list) {
+ if ((hw_err->timestamp - err->timestamp) / HZ >
+ SECONDS_PER_HOUR) {

No possibility of wrapping the timestamp?
I do not understand this suggestion. Could you explain it in more detail?


+ list_del(&err->list);
+ kfree(err);
+ } else {
+ count++;
+ }
+ }
+ list_add(&hw_err->list, &isolate->uacce_hw_errs);
+ mutex_unlock(&isolate->isolate_lock);
+
+ if (count >= isolate->hw_err_isolate_hz)
+ isolate->is_isolate = true;
+
+ return 0;
+}
+
+static void qm_hw_err_destroy(struct hisi_qm *qm)
+{
+ struct qm_hw_err *err, *tmp;
+
+ mutex_lock(&qm->isolate_data.isolate_lock);
+ list_for_each_entry_safe(err, tmp, &qm->isolate_data.uacce_hw_errs, list) {
+ list_del(&err->list);
+ kfree(err);
+ }
+ mutex_unlock(&qm->isolate_data.isolate_lock);
+}
+
+static enum uacce_dev_state hisi_qm_get_isolate_state(struct uacce_device *uacce)
+{
+ struct hisi_qm *qm = uacce->priv;
+ struct hisi_qm *pf_qm;
+
+ if (uacce->is_vf)
+ pf_qm = pci_get_drvdata(pci_physfn(qm->pdev));
+ else
+ pf_qm = qm;
+
+ return pf_qm->isolate_data.is_isolate ?
+ UACCE_DEV_ISOLATE : UACCE_DEV_NORMAL;
+}
+
+static int hisi_qm_isolate_strategy_write(struct uacce_device *uacce,
+ u32 freq)
+{
+ struct hisi_qm *qm = uacce->priv;
+
+ /* Must be set by PF */
+ if (uacce->is_vf)
+ return -EINVAL;

But the value passed to you is not invalid, something else went wrong.
Are you sure this is the correct error?
I will use EPERM instead of EINVAL.

+
+ if (qm->isolate_data.is_isolate)
+ return -EINVAL;

Same here, why is this correct?
I will use EPERM instead of EINVAL here as well.

+
+ qm->isolate_data.hw_err_isolate_hz = freq;

No validation of the value passed to you? It can be anything?

+
+ /* After the policy is updated, need to reset the hardware err list */
+ qm_hw_err_destroy(qm);

No error checking?
This step only clears the list, so no error checking is required.

thanks,

greg k-h
.


Thanks

Kai