[PATCH] acpi_pm: Reduce PMTMR counter read contention

From: Zhenzhong Duan
Date: Tue Jan 22 2019 - 20:55:53 EST


On a large system with many CPUs, using PMTMR as the clock source can
have a significant impact on overall system performance for the
following reasons:
1) There is a single PMTMR counter shared by all the CPUs.
2) PMTMR counter reading is a very slow operation.

Using PMTMR as the default clock source may happen when, for example,
the TSC clock calibration exceeds the allowable tolerance and HPET is
disabled by nohpet on the kernel command line. Sometimes the performance
slowdown can be so severe that the system may crash because of an NMI
watchdog soft lockup, as the following log shows:

[ 20.181521] clocksource: acpi_pm: mask: 0xffffff max_cycles: 0xffffff,
max_idle_ns: 2085701024 ns
[ 44.273786] BUG: soft lockup - CPU#48 stuck for 23s! [swapper/48:0]
[ 44.279992] BUG: soft lockup - CPU#49 stuck for 23s! [migration/49:307]
[ 44.285169] BUG: soft lockup - CPU#50 stuck for 23s! [migration/50:313]

Commit f99fd22e4d4b ("x86/hpet: Reduce HPET counter read contention")
fixed a similar issue for HPET; this patch adapts that design to PMTMR.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@xxxxxxxxxx>
Tested-by: Kin Cho <kin.cho@xxxxxxxxxx>
Cc: Daniel Lezcano <daniel.lezcano@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Waiman Long <longman@xxxxxxxxxx>
Cc: Srinivas Eeda <srinivas.eeda@xxxxxxxxxx>
---
drivers/clocksource/acpi_pm.c | 101 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 100 insertions(+), 1 deletion(-)

diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index 1961e35..8b522eb 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -32,12 +32,111 @@
*/
u32 pmtmr_ioport __read_mostly;

-static inline u32 read_pmtmr(void)
+static inline u32 pmtmr_readl(void)
{
/* mask the output to 24 bits */
return inl(pmtmr_ioport) & ACPI_PM_MASK;
}

+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the PMTMR counter is a very slow operation. If a large number of
+ * CPUs are trying to access the PMTMR counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when PMTMR is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of a NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the PMTMR counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on 64-bit SMP kernels. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead, and the scheme also depends on an
+ * atomic 64-bit read.
+ *
+ * The lock and the pmtmr value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union pmtmr_lock {
+ struct {
+ arch_spinlock_t lock; /* taken by the CPU that reads the I/O port */
+ u32 value; /* PMTMR value last stored by a lock holder */
+ };
+ u64 lockval; /* lock and value overlaid as one 64-bit word */
+};
+
+static union pmtmr_lock pmtmr __cacheline_aligned = {
+ { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, /* starts unlocked; .value is not set explicitly */
+};
+
+/*
+ * Return the masked PMTMR counter value, sharing one port read among CPUs:
+ * the CPU that wins the trylock reads the I/O port and publishes the result
+ * in pmtmr.value; CPUs that find the lock held return that published value
+ * instead of reading the port themselves.
+ */
+static u32 read_pmtmr(void)
+{
+	unsigned long flags;
+	union pmtmr_lock old, new;
+
+	BUILD_BUG_ON(sizeof(union pmtmr_lock) != 8);
+
+	/* Read PMTMR directly if in NMI. */
+	if (in_nmi())
+		return pmtmr_readl();
+
+	/* Read the current state of the lock and PMTMR value atomically. */
+	old.lockval = READ_ONCE(pmtmr.lockval);
+
+	if (arch_spin_is_locked(&old.lock))
+		goto contended;
+
+	local_irq_save(flags);
+	if (arch_spin_trylock(&pmtmr.lock)) {
+		new.value = pmtmr_readl();
+		/* Use WRITE_ONCE() to prevent store tearing. */
+		WRITE_ONCE(pmtmr.value, new.value);
+		arch_spin_unlock(&pmtmr.lock);
+		local_irq_restore(flags);
+		return new.value;
+	}
+	local_irq_restore(flags);
+
+contended:
+	/*
+	 * Contended case
+	 * --------------
+	 * Wait until the PMTMR value changes or the lock is free to indicate
+	 * its value is up-to-date.
+	 *
+	 * It is possible that old.value already contains the latest
+	 * PMTMR value while the lock holder was in the process of releasing
+	 * the lock. Checking for lock state change will enable us to return
+	 * the value immediately instead of waiting for the next PMTMR reader
+	 * to come along.
+	 */
+	do {
+		cpu_relax();
+		new.lockval = READ_ONCE(pmtmr.lockval);
+	} while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+	return new.value;
+}
+#else
+/*
+ * For UP or 32-bit: no shared lock/value word, read the port on every call.
+ */
+static inline u32 read_pmtmr(void)
+{
+ return pmtmr_readl();
+}
+#endif
+
u32 acpi_pm_read_verified(void)
{
u32 v1 = 0, v2 = 0, v3 = 0;
--
1.8.3.1