Re: [PATCH v8 01/11] cpuidle/poll_state: poll via smp_cond_load_relaxed()

From: Christoph Lameter (Ampere)
Date: Tue Oct 15 2024 - 13:26:36 EST


On Tue, 15 Oct 2024, Catalin Marinas wrote:

> > Setting need_resched() from another processor involves sending an IPI
> > after it was set. I don't think we need smp_cond_load_relaxed() since
> > the IPI will cause an event. For ARM a WFE would be sufficient.
>
> I'm not worried about the need_resched() case; even without an IPI it
> would still work.
>
> The loop_count++ side of the condition is supposed to time out in the
> absence of a need_resched() event. You can't do an smp_cond_load_*() on
> a variable that's only updated by the waiting CPU. Nothing guarantees to
> wake it up so it can update the variable (the event stream does on
> arm64, yes, but this is generic code).
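
(For reference, the construct under discussion is poll_idle()'s inner
loop from this series:

	smp_cond_load_relaxed(&current_thread_info()->flags,
			      VAL & _TIF_NEED_RESCHED ||
			      loop_count++ >= POLL_IDLE_RELAX_COUNT);

On arm64 smp_cond_load_relaxed() compiles down to an LDXR/WFE loop, and
since loop_count only advances when the CPU actually wakes up, the
timeout half of the condition relies on periodic wakeup events, in
practice the event stream.)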

Hmm... I have a WFET implementation here without smp_cond_load_relaxed(),
modelled after the arm64 delay() implementation (but it is not generic,
and an additional patch is required to make this work; that intermediate
patch is attached as well).
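
For context, the arm64 __delay() shape being borrowed looks roughly like
this (abridged; the trailing cpu_relax() loop busy-polls whatever
remainder is too short for the coarser wait primitives):

	void __delay(unsigned long cycles)
	{
		cycles_t start = get_cycles();

		if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) {
			u64 end = start + cycles;

			/* Sleep until an event or until the counter passes end */
			while ((get_cycles() - start) < cycles)
				wfet(end);
		} else if (arch_timer_evtstrm_available()) {
			const cycles_t timer_evt_period =
				USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);

			/* The event stream wakes WFE up once per period */
			while ((get_cycles() - start + timer_evt_period) < cycles)
				wfe();
		}

		while ((get_cycles() - start) < cycles)
			cpu_relax();
	}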


From: Christoph Lameter (Ampere) <cl@xxxxxxxxxx>
Subject: [PATCH] haltpoll: Implement waiting using WFET

Use WFET, if the hardware supports it, to wait until an event
wakes up the CPU or the timeout expires.

If WFET is not available, use the event stream as a wakeup
source to wake up periodically until an event happens or the
timeout expires.

smp_cond_load_relaxed() is not necessary because the scheduler
will create an event on the targeted CPU by sending an IPI.

Without it we can simply take the basic approach from the
delay() function and customize it a bit.

Signed-off-by: Christoph Lameter <cl@xxxxxxxxx>

---
drivers/cpuidle/poll_state.c | 43 +++++++++++++++++-------------------
1 file changed, 20 insertions(+), 23 deletions(-)

Index: linux/drivers/cpuidle/poll_state.c
===================================================================
--- linux.orig/drivers/cpuidle/poll_state.c
+++ linux/drivers/cpuidle/poll_state.c
@@ -5,48 +5,41 @@

#include <linux/cpuidle.h>
#include <linux/sched.h>
-#include <linux/sched/clock.h>
#include <linux/sched/idle.h>
-
-#ifdef CONFIG_ARM64
-/*
- * POLL_IDLE_RELAX_COUNT determines how often we check for timeout
- * while polling for TIF_NEED_RESCHED in thread_info->flags.
- *
- * Set this to a low value since arm64, instead of polling, uses a
- * event based mechanism.
- */
-#define POLL_IDLE_RELAX_COUNT 1
-#else
-#define POLL_IDLE_RELAX_COUNT 200
-#endif
+#include <clocksource/arm_arch_timer.h>

static int __cpuidle poll_idle(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
{
- u64 time_start;
-
- time_start = local_clock_noinstr();
+ const cycles_t start = get_cycles();

dev->poll_time_limit = false;

raw_local_irq_enable();
if (!current_set_polling_and_test()) {
- u64 limit;

- limit = cpuidle_poll_time(drv, dev);
+ const cycles_t end = start + ARCH_TIMER_NSECS_TO_CYCLES(cpuidle_poll_time(drv, dev));

while (!need_resched()) {
- unsigned int loop_count = 0;
- if (local_clock_noinstr() - time_start > limit) {
- dev->poll_time_limit = true;
- break;
- }

- smp_cond_load_relaxed(&current_thread_info()->flags,
- VAL & _TIF_NEED_RESCHED ||
- loop_count++ >= POLL_IDLE_RELAX_COUNT);
+ if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) {
+
+			/* Power down until an event arrives or the timeout deadline passes */
+ while (!need_resched() && get_cycles() < end)
+ wfet(end);
+
+ } else if (arch_timer_evtstrm_available()) {
+ const cycles_t timer_period = ARCH_TIMER_USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);
+
+			/* Wake up periodically via the timer event stream */
+ while (!need_resched() && get_cycles() + timer_period <= end)
+ wfe();
+		}
+
+		/* In case time is not up yet due to coarse wait intervals above */
+		while (!need_resched() && get_cycles() < end)
+			cpu_relax();
+
+		/* Report and stop once the poll time limit has been reached */
+		if (get_cycles() >= end) {
+			dev->poll_time_limit = true;
+			break;
+		}
	}
}
raw_local_irq_disable();
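
For readers unfamiliar with the wait primitives used above, the
semantics are roughly (simplified, not the real definitions):

	wfet(deadline);	/* sleep until an event or interrupt arrives, or
			   until the counter passes the absolute deadline;
			   requires FEAT_WFxT */
	wfe();		/* sleep until an event or interrupt arrives; the
			   timer event stream injects an event every
			   ARCH_TIMER_EVT_STREAM_PERIOD_US (100us), which
			   bounds the sleep */
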
From: Christoph Lameter (Ampere) <cl@xxxxxxxxx>

Move the conversion from time to cycles for the arch timer from the
arm64 delay() code into arm_arch_timer.h. Add a nsec conversion since
we will need it soon.

Signed-off-by: Christoph Lameter <cl@xxxxxxxxx>
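
As a sanity check on the constants moved and added below (assuming, as
on arm64, that loops_per_jiffy * HZ works out to the timer frequency):

	cycles = xloops * loops_per_jiffy * HZ >> 32
	       = usecs * 0x10C7 * rate >> 32	/* 0x10C7 = ceil(2^32 / 10^6) */
	       = nsecs * 0x5 * rate >> 32	/* 0x5 = ceil(2^32 / 10^9) */

Since 2^32 / 10^9 is about 4.3, rounding the nsec multiplier up to 5
overestimates by roughly 16%: with a 1GHz counter,
ARCH_TIMER_NSECS_TO_CYCLES(50000) yields about 58200 cycles (~58us for
a 50us request), so waits err on the long side, never the short side.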

Index: linux/arch/arm64/lib/delay.c
===================================================================
--- linux.orig/arch/arm64/lib/delay.c
+++ linux/arch/arm64/lib/delay.c
@@ -15,14 +15,6 @@

#include <clocksource/arm_arch_timer.h>

-#define USECS_TO_CYCLES(time_usecs) \
- xloops_to_cycles((time_usecs) * 0x10C7UL)
-
-static inline unsigned long xloops_to_cycles(unsigned long xloops)
-{
- return (xloops * loops_per_jiffy * HZ) >> 32;
-}
-
void __delay(unsigned long cycles)
{
cycles_t start = get_cycles();
@@ -39,7 +31,7 @@ void __delay(unsigned long cycles)
wfet(end);
} else if (arch_timer_evtstrm_available()) {
const cycles_t timer_evt_period =
- USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);
+ ARCH_TIMER_USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);

while ((get_cycles() - start + timer_evt_period) < cycles)
wfe();
@@ -52,7 +44,7 @@ EXPORT_SYMBOL(__delay);

inline void __const_udelay(unsigned long xloops)
{
- __delay(xloops_to_cycles(xloops));
+ __delay(arch_timer_xloops_to_cycles(xloops));
}
EXPORT_SYMBOL(__const_udelay);

Index: linux/include/clocksource/arm_arch_timer.h
===================================================================
--- linux.orig/include/clocksource/arm_arch_timer.h
+++ linux/include/clocksource/arm_arch_timer.h
extern u64 (*arch_timer_read_counter)(void);
extern struct arch_timer_kvm_info *arch_timer_get_kvm_info(void);
extern bool arch_timer_evtstrm_available(void);

+#include <linux/delay.h>
+
+static inline unsigned long arch_timer_xloops_to_cycles(unsigned long xloops)
+{
+ return (xloops * loops_per_jiffy * HZ) >> 32;
+}
+
+#define ARCH_TIMER_USECS_TO_CYCLES(time_usecs) \
+ arch_timer_xloops_to_cycles((time_usecs) * 0x10C7UL)
+
+#define ARCH_TIMER_NSECS_TO_CYCLES(time_nsecs) \
+ arch_timer_xloops_to_cycles((time_nsecs) * 0x5UL)
+
#else

static inline u32 arch_timer_get_rate(void)