[GIT pull] x86 timer updates for 4.15

From: Thomas Gleixner
Date: Mon Nov 13 2017 - 07:19:10 EST


Linus,

please pull the latest x86-timers-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-timers-for-linus

These updates are related to TSC handling:

- Support platforms which have synchronized TSCs but the boot CPU has a
non zero TSC_ADJUST value, which is considered a firmware bug on normal
systems.

This applies to HPE/SGI UV platforms where the platform firmware uses
TSC_ADJUST to ensure TSC synchronization across a huge number of
sockets, but due to power on timings the boot CPU cannot be guaranteed
to have a zero TSC_ADJUST register value.

- Fix the ordering of udelay calibration and kvmclock_init()

- Cleanup the udelay and calibration code

Thanks,

tglx

------------------>
Boris Ostrovsky (1):
x86/timers: Move simple_udelay_calibration() past kvmclock_init()

Dou Liyang (3):
x86/timers: Move the simple udelay calibration to tsc.h
x86/timers: Make recalibrate_cpu_khz() void
x86/tsc: Mark cyc2ns_init() and detect_art() __init

Thomas Gleixner (1):
x86/tsc: Make CONFIG_X86_TSC=n build work again

mike.travis@xxxxxxx (6):
x86/tsc: Add option that TSC on Socket 0 being non-zero is valid
x86/tsc: Skip TSC test and error messages if already unstable
x86/tsc: Drastically reduce the number of firmware bug warnings
x86/tsc: Provide a means to disable TSC ART
x86/platform/UV: Add check of TSC state set by UV BIOS
x86/platform/UV: Mark tsc_check_sync as an init function


arch/x86/include/asm/timer.h | 2 +-
arch/x86/include/asm/tsc.h | 7 +++++
arch/x86/include/asm/uv/uv_hub.h | 23 ++++++++++++----
arch/x86/kernel/apic/x2apic_uv_x.c | 43 +++++++++++++++++++++++++++++
arch/x86/kernel/setup.c | 29 +++-----------------
arch/x86/kernel/tsc.c | 39 +++++++++++++++++++-------
arch/x86/kernel/tsc_sync.c | 56 +++++++++++++++++++++++++++++++-------
7 files changed, 148 insertions(+), 51 deletions(-)

diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 2016962103df..e90edbee313a 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -8,7 +8,7 @@
#define TICK_SIZE (tick_nsec / 1000)

unsigned long long native_sched_clock(void);
-extern int recalibrate_cpu_khz(void);
+extern void recalibrate_cpu_khz(void);

extern int no_timer_check;

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index f5e6f1c417df..c016167720b1 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -31,15 +31,22 @@ static inline cycles_t get_cycles(void)

extern struct system_counterval_t convert_art_to_tsc(u64 art);

+extern void tsc_early_delay_calibrate(void);
extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
+extern void mark_tsc_async_resets(char *reason);
extern unsigned long native_calibrate_cpu(void);
extern unsigned long native_calibrate_tsc(void);
extern unsigned long long native_sched_clock_from_tsc(u64 tsc);

extern int tsc_clocksource_reliable;
+#ifdef CONFIG_X86_TSC
+extern bool tsc_async_resets;
+#else
+# define tsc_async_resets false
+#endif

/*
* Boot-time check whether the TSCs are synchronized across
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 9cffb44a3cf5..036e26d63d9a 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -776,23 +776,36 @@ static inline int uv_num_possible_blades(void)
extern void uv_nmi_setup(void);
extern void uv_nmi_setup_hubless(void);

+/* BIOS/Kernel flags exchange MMR */
+#define UVH_BIOS_KERNEL_MMR UVH_SCRATCH5
+#define UVH_BIOS_KERNEL_MMR_ALIAS UVH_SCRATCH5_ALIAS
+#define UVH_BIOS_KERNEL_MMR_ALIAS_2 UVH_SCRATCH5_ALIAS_2
+
+/* TSC sync valid, set by BIOS */
+#define UVH_TSC_SYNC_MMR UVH_BIOS_KERNEL_MMR
+#define UVH_TSC_SYNC_SHIFT 10
+#define UVH_TSC_SYNC_SHIFT_UV2K 16 /* UV2/3k have different bits */
+#define UVH_TSC_SYNC_MASK 3 /* 0011 */
+#define UVH_TSC_SYNC_VALID 3 /* 0011 */
+#define UVH_TSC_SYNC_INVALID 2 /* 0010 */
+
/* BMC sets a bit this MMR non-zero before sending an NMI */
-#define UVH_NMI_MMR UVH_SCRATCH5
-#define UVH_NMI_MMR_CLEAR UVH_SCRATCH5_ALIAS
+#define UVH_NMI_MMR UVH_BIOS_KERNEL_MMR
+#define UVH_NMI_MMR_CLEAR UVH_BIOS_KERNEL_MMR_ALIAS
#define UVH_NMI_MMR_SHIFT 63
-#define UVH_NMI_MMR_TYPE "SCRATCH5"
+#define UVH_NMI_MMR_TYPE "SCRATCH5"

/* Newer SMM NMI handler, not present in all systems */
#define UVH_NMI_MMRX UVH_EVENT_OCCURRED0
#define UVH_NMI_MMRX_CLEAR UVH_EVENT_OCCURRED0_ALIAS
#define UVH_NMI_MMRX_SHIFT UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT
-#define UVH_NMI_MMRX_TYPE "EXTIO_INT0"
+#define UVH_NMI_MMRX_TYPE "EXTIO_INT0"

/* Non-zero indicates newer SMM NMI handler present */
#define UVH_NMI_MMRX_SUPPORTED UVH_EXTIO_INT0_BROADCAST

/* Indicates to BIOS that we want to use the newer SMM NMI handler */
-#define UVH_NMI_MMRX_REQ UVH_SCRATCH5_ALIAS_2
+#define UVH_NMI_MMRX_REQ UVH_BIOS_KERNEL_MMR_ALIAS_2
#define UVH_NMI_MMRX_REQ_SHIFT 62

struct uv_hub_nmi_s {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 0d57bb9079c9..5eda48a72617 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -154,6 +154,48 @@ static int __init early_get_pnodeid(void)
return pnode;
}

+static void __init uv_tsc_check_sync(void)
+{
+ u64 mmr;
+ int sync_state;
+ int mmr_shift;
+ char *state;
+ bool valid;
+
+ /* Accommodate different UV arch BIOSes */
+ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
+ mmr_shift =
+ is_uv1_hub() ? 0 :
+ is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
+ if (mmr_shift)
+ sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
+ else
+ sync_state = 0;
+
+ switch (sync_state) {
+ case UVH_TSC_SYNC_VALID:
+ state = "in sync";
+ valid = true;
+ break;
+
+ case UVH_TSC_SYNC_INVALID:
+ state = "unstable";
+ valid = false;
+ break;
+ default:
+ state = "unknown: assuming valid";
+ valid = true;
+ break;
+ }
+ pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state);
+
+ /* Mark flag that says TSC != 0 is valid for socket 0 */
+ if (valid)
+ mark_tsc_async_resets("UV BIOS");
+ else
+ mark_tsc_unstable("UV BIOS");
+}
+
/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */

#define SMT_LEVEL 0 /* Leaf 0xb SMT level */
@@ -288,6 +330,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
}

pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic);
+ uv_tsc_check_sync();

return uv_apic;

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0957dd73d127..b40311027c15 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -822,26 +822,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}

-static void __init simple_udelay_calibration(void)
-{
- unsigned int tsc_khz, cpu_khz;
- unsigned long lpj;
-
- if (!boot_cpu_has(X86_FEATURE_TSC))
- return;
-
- cpu_khz = x86_platform.calibrate_cpu();
- tsc_khz = x86_platform.calibrate_tsc();
-
- tsc_khz = tsc_khz ? : cpu_khz;
- if (!tsc_khz)
- return;
-
- lpj = tsc_khz * 1000;
- do_div(lpj, HZ);
- loops_per_jiffy = lpj;
-}
-
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -1049,8 +1029,6 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();

- simple_udelay_calibration();
-
x86_init.resources.probe_roms();

/* after parse_early_param, so could debug it */
@@ -1135,9 +1113,6 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();

- if (!early_xdbc_setup_hardware())
- early_xdbc_register_console();
-
reserve_bios_regions();

if (efi_enabled(EFI_MEMMAP)) {
@@ -1243,6 +1218,10 @@ void __init setup_arch(char **cmdline_p)
kvmclock_init();
#endif

+ tsc_early_delay_calibrate();
+ if (!early_xdbc_setup_hardware())
+ early_xdbc_register_console();
+
x86_init.paging.pagetable_init();

kasan_init();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 796d96bb0821..416de29fe862 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -112,7 +112,7 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_offset = 0;
}

-static void cyc2ns_init(int cpu)
+static void __init cyc2ns_init(int cpu)
{
struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);

@@ -812,13 +812,13 @@ unsigned long native_calibrate_cpu(void)
return tsc_pit_min;
}

-int recalibrate_cpu_khz(void)
+void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
unsigned long cpu_khz_old = cpu_khz;

if (!boot_cpu_has(X86_FEATURE_TSC))
- return -ENODEV;
+ return;

cpu_khz = x86_platform.calibrate_cpu();
tsc_khz = x86_platform.calibrate_tsc();
@@ -828,10 +828,6 @@ int recalibrate_cpu_khz(void)
cpu_khz = tsc_khz;
cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
cpu_khz_old, cpu_khz);
-
- return 0;
-#else
- return -ENODEV;
#endif
}

@@ -959,17 +955,21 @@ core_initcall(cpufreq_register_tsc_scaling);
/*
* If ART is present detect the numerator:denominator to convert to TSC
*/
-static void detect_art(void)
+static void __init detect_art(void)
{
unsigned int unused[2];

if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
return;

- /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */
+ /*
+ * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required,
+ * and the TSC counter resets must not occur asynchronously.
+ */
if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
- !boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ !boot_cpu_has(X86_FEATURE_TSC_ADJUST) ||
+ tsc_async_resets)
return;

cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
@@ -1263,6 +1263,25 @@ static int __init init_tsc_clocksource(void)
*/
device_initcall(init_tsc_clocksource);

+void __init tsc_early_delay_calibrate(void)
+{
+ unsigned long lpj;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+
+ tsc_khz = tsc_khz ? : cpu_khz;
+ if (!tsc_khz)
+ return;
+
+ lpj = tsc_khz * 1000;
+ do_div(lpj, HZ);
+ loops_per_jiffy = lpj;
+}
+
void __init tsc_init(void)
{
u64 lpj, cyc;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 7842371bc9e4..26ba0530a8a7 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,20 @@ struct tsc_adjust {

static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);

+/*
+ * TSC's on different sockets may be reset asynchronously.
+ * This may cause the TSC ADJUST value on socket 0 to be NOT 0.
+ */
+bool __read_mostly tsc_async_resets;
+
+void mark_tsc_async_resets(char *reason)
+{
+ if (tsc_async_resets)
+ return;
+ tsc_async_resets = true;
+ pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
+}
+
void tsc_verify_tsc_adjust(bool resume)
{
struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
@@ -38,6 +52,10 @@ void tsc_verify_tsc_adjust(bool resume)
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return;

+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return;
+
/* Rate limit the MSR check */
if (!resume && time_before(jiffies, adj->nextcheck))
return;
@@ -71,12 +89,22 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
* non zero. We don't do that on non boot cpus because physical
* hotplug should have set the ADJUST register to a value > 0 so
* the TSC is in sync with the already running cpus.
+ *
+ * Also don't force the ADJUST value to zero if that is a valid value
+ * for socket 0 as determined by the system arch. This is required
+ * when multiple sockets are reset asynchronously with each other
+ * and socket 0 may not have an TSC ADJUST value of 0.
*/
if (bootcpu && bootval != 0) {
- pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu,
- bootval);
- wrmsrl(MSR_IA32_TSC_ADJUST, 0);
- bootval = 0;
+ if (likely(!tsc_async_resets)) {
+ pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
+ cpu, bootval);
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ bootval = 0;
+ } else {
+ pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
+ cpu, bootval);
+ }
}
cur->adjusted = bootval;
}
@@ -90,6 +118,10 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return false;

+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return false;
+
rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
cur->bootval = bootval;
cur->nextcheck = jiffies + HZ;
@@ -118,6 +150,13 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
cur->warned = false;

/*
+ * If a non-zero TSC value for socket 0 may be valid then the default
+ * adjusted value cannot assumed to be zero either.
+ */
+ if (tsc_async_resets)
+ cur->adjusted = bootval;
+
+ /*
* Check whether this CPU is the first in a package to come up. In
* this case do not check the boot value against another package
* because the new package might have been physically hotplugged,
@@ -138,10 +177,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
* Compare the boot value and complain if it differs in the
* package.
*/
- if (bootval != ref->bootval) {
- pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n",
- refcpu, ref->bootval, cpu, bootval);
- }
+ if (bootval != ref->bootval)
+ printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");
+
/*
* The TSC_ADJUST values in a package must be the same. If the boot
* value on this newly upcoming CPU differs from the adjustment
@@ -149,8 +187,6 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
* adjusted value.
*/
if (bootval != ref->adjusted) {
- pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n",
- refcpu, ref->adjusted, cpu, bootval);
cur->adjusted = ref->adjusted;
wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
}