[RFC PATCH 2/4] x86, mwaitt: introduce mwaitx idle with a configurable timer

From: Huang Rui
Date: Tue May 19 2015 - 04:35:51 EST


MWAITX/MWAIT do not let the CPU core enter the C1 state on AMD processors.
The core still consumes less power while waiting, and it exits the wait
faster than "Halt" does. This patch adds an interface, via the kernel
parameter "idle=", to select mwaitx idle and configure its timer value.

If "idle=mwaitx", the timeout will be set to the maximum value
((2^32 - 1) TSC cycles, i.e. MWAITX_EBX_WAIT_TIMEOUT = 0xffffffff).
If "idle=mwaitx,100", the timeout will be set to 100 ns.
If the processor doesn't support MWAITX, then halt is used.

Signed-off-by: Huang Rui <ray.huang@xxxxxxx>
---
arch/x86/include/asm/mwait.h | 2 +
arch/x86/include/asm/processor.h | 2 +-
arch/x86/kernel/process.c | 79 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index b91136f..c4e51e7 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -14,6 +14,8 @@
#define CPUID5_ECX_INTERRUPT_BREAK 0x2

#define MWAIT_ECX_INTERRUPT_BREAK 0x1
+#define MWAITX_ECX_TIMER_ENABLE 0x2
+#define MWAITX_EBX_WAIT_TIMEOUT 0xffffffff

static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 23ba676..0f60e94 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -733,7 +733,7 @@ extern unsigned long boot_option_idle_override;
extern bool amd_e400_c1e_detected;

enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
- IDLE_POLL};
+ IDLE_POLL, IDLE_MWAITX};

extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6e338e3..9d68193 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,6 +30,7 @@
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
+#include <asm/x86_init.h>

/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -276,6 +277,7 @@ unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);
+static unsigned long idle_param;

#ifndef CONFIG_SMP
static inline void play_dead(void)
@@ -444,6 +446,17 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
return 1;
}

+static int not_support_mwaitx(const struct cpuinfo_x86 *c)
+{
+	/*
+	 * MWAITX idle is usable only on an AMD CPU that also reports
+	 * X86_FEATURE_MWAITT; return 0 then, 1 for everything else.
+	 */
+	if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_MWAITT))
+		return 0;
+	return 1;
+}
+
/*
* MONITOR/MWAIT with no hints, used for default default C1 state.
* This invokes MWAIT with interrutps enabled and no flags,
@@ -470,12 +483,45 @@ static void mwait_idle(void)
__current_clr_polling();
}

+/*
+ * AMD Excavator processors support the new MONITORX/MWAITX instructions.
+ * MWAITX behaves like MWAIT but adds a timeout timer. On AMD platforms
+ * MWAITX does not let the core enter the C1 state, so the core leaves
+ * the wait faster. The user can configure the idle method and the
+ * timer value via the "idle=" kernel parameter.
+ */
+static void mwaitx_idle(void)
+{
+	/* idle_param goes in as the EBX input, with the timer bit set in ECX. */
+	unsigned long timer_cycles = idle_param;
+	unsigned long wait_flags = MWAITX_ECX_TIMER_ENABLE;
+	bool waited = false;
+
+	if (!current_set_polling_and_test()) {
+		__monitorx((void *)&current_thread_info()->flags, 0, 0);
+		if (!need_resched()) {
+			__sti_mwaitx(0, timer_cycles, wait_flags);
+			waited = true;
+		}
+	}
+	if (!waited)
+		local_irq_enable();	/* no MWAITX issued on this path */
+	__current_clr_polling();
+}
+
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
+
+ if (boot_option_idle_override == IDLE_MWAITX &&
+ not_support_mwaitx(c)) {
+ pr_warn_once("WARNING: mwaitx not supported, using default idle support\n");
+ x86_idle = default_idle;
+ }
+
if (x86_idle || boot_option_idle_override == IDLE_POLL)
return;

@@ -499,6 +545,8 @@ void __init init_amd_e400_c1e_mask(void)

static int __init idle_setup(char *str)
{
+ unsigned long timeout, tsc_freq;
+
if (!str)
return -EINVAL;

@@ -524,6 +572,37 @@ static int __init idle_setup(char *str)
* of boot_option_idle_override.
*/
boot_option_idle_override = IDLE_NOMWAIT;
+ } else if (!strncmp(str, "mwaitx", 6)) {
+ /*
+ * If the boot option of "idle=mwaitx" is added, it means
+ * that mwaitx will be enabled if current processor
+ * supports it. If not supported, use default_idle.
+ */
+ x86_idle = mwaitx_idle;
+ boot_option_idle_override = IDLE_MWAITX;
+ str += 6;
+ if (str && (str[0] == ',')) {
+ if (kstrtoul(str + 1, 0, &timeout)) {
+ pr_warn_once("WARNING: timer value should be numerical\n");
+ return -1;
+ }
+
+ tsc_freq = x86_platform.calibrate_tsc();
+ if (!tsc_freq) {
+ pr_warn_once("WARNING: can not calculate TSC khz\n");
+ return -1;
+ }
+
+ /*
+ * TSC loops (EBX input) = Timer(nsec) *
+ * TSC freq(khz) / 1000000
+ */
+ timeout = timeout * tsc_freq;
+ do_div(timeout, 1000000);
+
+ idle_param = timeout;
+ } else
+ idle_param = MWAITX_EBX_WAIT_TIMEOUT;
} else
return -1;

--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/