[PATCH 07/10] PCI/LINK: Re-add BW notification portdrv as PCIe BW controller
From: Ilpo Järvinen
Date: Thu Aug 17 2023 - 08:18:39 EST
This mostly reverts b4c7d2076b4e ("PCI/LINK: Remove bandwidth
notification"), however, there are small tweaks:
1) Call it PCIe bwctrl (bandwidth controller) instead of just
bandwidth notifications.
2) Don't print the notifications into kernel log, just keep the current
link speed updated.
3) Use concurrency safe LNKCTL RMW operations.
4) Read link speed after enabling the notification to ensure the
current link speed is correct from the start.
5) Add local variable in probe for srv->port.
6) Handle link speed read and LBMS write race in
pcie_bw_notification_irq().
The reason for 1) is to indicate the increased scope of the driver. A
subsequent commit extends the driver to allow controlling PCIe
bandwidths from user space upon crossing thermal thresholds.
While 2) is somewhat unfortunate, the log spam was the source of
complaints that eventually lead to the removal of the bandwidth
notifications driver (see the links below for further information).
After re-adding this driver back the userspace can, if it wishes to,
observe the link speed changes using the current bus speed files under
sysfs.
Link: https://lore.kernel.org/all/20190429185611.121751-1-helgaas@xxxxxxxxxx/
Link: https://lore.kernel.org/linux-pci/20190501142942.26972-1-keith.busch@xxxxxxxxx/
Link: https://lore.kernel.org/linux-pci/20200115221008.GA191037@xxxxxxxxxx/
Suggested-by: Lukas Wunner <lukas@xxxxxxxxx>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@xxxxxxxxxxxxxxx>
---
drivers/pci/pcie/Kconfig | 8 +++
drivers/pci/pcie/Makefile | 1 +
drivers/pci/pcie/bwctrl.c | 131 +++++++++++++++++++++++++++++++++++++
drivers/pci/pcie/portdrv.c | 9 +--
drivers/pci/pcie/portdrv.h | 10 ++-
5 files changed, 153 insertions(+), 6 deletions(-)
create mode 100644 drivers/pci/pcie/bwctrl.c
diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
index 228652a59f27..426dd81b48e4 100644
--- a/drivers/pci/pcie/Kconfig
+++ b/drivers/pci/pcie/Kconfig
@@ -137,6 +137,14 @@ config PCIE_PTM
This is only useful if you have devices that support PTM, but it
is safe to enable even if you don't.
+config PCIE_BW
+ bool "PCI Express Bandwidth Change Notification"
+ depends on PCIEPORTBUS
+ help
+ This enables PCI Express Bandwidth Change Notification. If
+ you know link width or rate changes occur only to correct
+ unreliable links, you may answer Y.
+
config PCIE_EDR
bool "PCI Express Error Disconnect Recover support"
depends on PCIE_DPC && ACPI
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 8de4ed5f98f1..175065a495cf 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -12,4 +12,5 @@ obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o
obj-$(CONFIG_PCIE_PME) += pme.o
obj-$(CONFIG_PCIE_DPC) += dpc.o
obj-$(CONFIG_PCIE_PTM) += ptm.o
+obj-$(CONFIG_PCIE_BW) += bwctrl.o
obj-$(CONFIG_PCIE_EDR) += edr.o
diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
new file mode 100644
index 000000000000..4fc6718fc0e5
--- /dev/null
+++ b/drivers/pci/pcie/bwctrl.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCI Express Link Bandwidth Notification services driver
+ * Author: Alexandru Gagniuc <mr.nuke.me@xxxxxxxxx>
+ *
+ * Copyright (C) 2019, Dell Inc
+ *
+ * The PCIe Link Bandwidth Notification provides a way to notify the
+ * operating system when the link width or data rate changes. This
+ * capability is required for all root ports and downstream ports
+ * supporting links wider than x1 and/or multiple link speeds.
+ *
+ * This service port driver hooks into the bandwidth notification interrupt
+ * watching for link speed changes or links becoming degraded in operation
+ * and updates the cached link speed exposed to user space.
+ */
+
+#define dev_fmt(fmt) "bwctrl: " fmt
+
+#include "../pci.h"
+#include "portdrv.h"
+
+static bool pcie_link_bandwidth_notification_supported(struct pci_dev *dev)
+{
+ int ret;
+ u32 lnk_cap;
+
+ ret = pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnk_cap);
+ return (ret == PCIBIOS_SUCCESSFUL) && (lnk_cap & PCI_EXP_LNKCAP_LBNC);
+}
+
+static void pcie_enable_link_bandwidth_notification(struct pci_dev *dev)
+{
+ u16 link_status;
+ int ret;
+
+ pcie_capability_write_word(dev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS);
+ pcie_capability_set_word(dev, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_LBMIE);
+
+ /* Read after enabling notifications to ensure link speed is up to date */
+ ret = pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &link_status);
+ if (ret == PCIBIOS_SUCCESSFUL)
+ pcie_update_link_speed(dev->subordinate, link_status);
+}
+
+static void pcie_disable_link_bandwidth_notification(struct pci_dev *dev)
+{
+ pcie_capability_clear_word(dev, PCI_EXP_LNKCTL, PCI_EXP_LNKCTL_LBMIE);
+}
+
+static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
+{
+ struct pcie_device *srv = context;
+ struct pci_dev *port = srv->port;
+ u16 link_status, events;
+ int ret;
+
+ ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+ events = link_status & PCI_EXP_LNKSTA_LBMS;
+
+ if (ret != PCIBIOS_SUCCESSFUL || !events)
+ return IRQ_NONE;
+
+ pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
+
+ /*
+ * The write to clear LBMS prevents getting interrupt from the
+ * latest link speed when the link speed changes between the above
+ * LNKSTA read and write. Therefore, re-read the speed before
+ * updating it.
+ */
+ ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return IRQ_HANDLED;
+ pcie_update_link_speed(port->subordinate, link_status);
+
+ return IRQ_HANDLED;
+}
+
+static int pcie_bandwidth_notification_probe(struct pcie_device *srv)
+{
+ struct pci_dev *port = srv->port;
+ int ret;
+
+ /* Single-width or single-speed ports do not have to support this. */
+ if (!pcie_link_bandwidth_notification_supported(port))
+ return -ENODEV;
+
+ ret = request_irq(srv->irq, pcie_bw_notification_irq,
+ IRQF_SHARED, "PCIe BW ctrl", srv);
+ if (ret)
+ return ret;
+
+ pcie_enable_link_bandwidth_notification(port);
+ pci_info(port, "enabled with IRQ %d\n", srv->irq);
+
+ return 0;
+}
+
+static void pcie_bandwidth_notification_remove(struct pcie_device *srv)
+{
+ pcie_disable_link_bandwidth_notification(srv->port);
+ free_irq(srv->irq, srv);
+}
+
+static int pcie_bandwidth_notification_suspend(struct pcie_device *srv)
+{
+ pcie_disable_link_bandwidth_notification(srv->port);
+ return 0;
+}
+
+static int pcie_bandwidth_notification_resume(struct pcie_device *srv)
+{
+ pcie_enable_link_bandwidth_notification(srv->port);
+ return 0;
+}
+
+static struct pcie_port_service_driver pcie_bandwidth_notification_driver = {
+ .name = "pcie_bwctrl",
+ .port_type = PCIE_ANY_PORT,
+ .service = PCIE_PORT_SERVICE_BWCTRL,
+ .probe = pcie_bandwidth_notification_probe,
+ .suspend = pcie_bandwidth_notification_suspend,
+ .resume = pcie_bandwidth_notification_resume,
+ .remove = pcie_bandwidth_notification_remove,
+};
+
+int __init pcie_bwctrl_init(void)
+{
+ return pcie_port_service_register(&pcie_bandwidth_notification_driver);
+}
diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c
index 46fad0d813b2..ed33049bffd6 100644
--- a/drivers/pci/pcie/portdrv.c
+++ b/drivers/pci/pcie/portdrv.c
@@ -67,7 +67,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
*/
if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
- PCIE_PORT_SERVICE_BWNOTIF)) {
+ PCIE_PORT_SERVICE_BWCTRL)) {
pcie_capability_read_word(dev, PCI_EXP_FLAGS, ®16);
*pme = (reg16 & PCI_EXP_FLAGS_IRQ) >> 9;
nvec = *pme + 1;
@@ -149,11 +149,11 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
- PCIE_PORT_SERVICE_BWNOTIF)) {
+ PCIE_PORT_SERVICE_BWCTRL)) {
pcie_irq = pci_irq_vector(dev, pme);
irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
- irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
+ irqs[PCIE_PORT_SERVICE_BWCTRL_SHIFT] = pcie_irq;
}
if (mask & PCIE_PORT_SERVICE_AER)
@@ -270,7 +270,7 @@ static int get_port_device_capability(struct pci_dev *dev)
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap);
if (linkcap & PCI_EXP_LNKCAP_LBNC)
- services |= PCIE_PORT_SERVICE_BWNOTIF;
+ services |= PCIE_PORT_SERVICE_BWCTRL;
}
return services;
@@ -828,6 +828,7 @@ static void __init pcie_init_services(void)
pcie_pme_init();
pcie_dpc_init();
pcie_hp_init();
+ pcie_bwctrl_init();
}
static int __init pcie_portdrv_init(void)
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 58a2b1a1cae4..f622c8a02a5b 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -20,8 +20,8 @@
#define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT)
#define PCIE_PORT_SERVICE_DPC_SHIFT 3 /* Downstream Port Containment */
#define PCIE_PORT_SERVICE_DPC (1 << PCIE_PORT_SERVICE_DPC_SHIFT)
-#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT 4 /* Bandwidth notification */
-#define PCIE_PORT_SERVICE_BWNOTIF (1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
+#define PCIE_PORT_SERVICE_BWCTRL_SHIFT 4 /* Bandwidth Controller (notifications) */
+#define PCIE_PORT_SERVICE_BWCTRL (1 << PCIE_PORT_SERVICE_BWCTRL_SHIFT)
#define PCIE_PORT_DEVICE_MAXSERVICES 5
@@ -53,6 +53,12 @@ int pcie_dpc_init(void);
static inline int pcie_dpc_init(void) { return 0; }
#endif
+#ifdef CONFIG_PCIE_BW
+int pcie_bwctrl_init(void);
+#else
+static inline int pcie_bwctrl_init(void) { return 0; }
+#endif
+
/* Port Type */
#define PCIE_ANY_PORT (~0)
--
2.30.2