Re: [PATCH v2] x86: Reduce clock calibration time during slave cpu startup

From: Yinghai Lu
Date: Fri Aug 05 2011 - 20:21:46 EST


On Fri, Aug 5, 2011 at 6:16 AM, Jack Steiner <steiner@xxxxxxx> wrote:
> Aside from the bogus comment (I'll send a V3 with a fixed comment), does the patch
> look ok.

Several months ago, Robin said that he will test updated version

[PATCH] x86: Make calibrate_delay run in parallel.

so any reason that you sgi guyes stop that path?

Please check attached patch for updated version to current tip.

Thanks

Yinghai Lu
[PATCH -v4] x86: Make calibrate_delay run in parallel.

On a 4096 cpu machine, we noticed that 318 seconds were taken for bringing
up the cpus. By specifying lpj=<value>, we reduced that to 75 seconds.
Andi Kleen suggested we rework the calibrate_delay calls to run in
parallel.

-v2: from Yinghai
two path: one for initial boot cpus. and one for hotplug cpus
initial path:
after all cpu boot up, enter idle, use smp_call_function_many
let every ap call __calibrate_delay.
We can not put that calibrate_delay after local_irq_enable
in start_secondary(), at that time that cpu could be involed
with perf_event with nmi_watchdog enabling. that will cause
strange calibrating result.
hotplug path:
need to add cpu notify block.
add __calibrate_delay instead of changing calibrate_delay all over.
use cpu_calibrated_delay_mask instead.
use print_lpj to make print line complete.

Signed-off-by: Robin Holt <holt@xxxxxxx>
To: Andi Kleen <andi@xxxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Yinghai Lu <yinghai@xxxxxxxxxx>

---
arch/x86/include/asm/cpumask.h | 1
arch/x86/kernel/cpu/common.c | 3 ++
arch/x86/kernel/smpboot.c | 58 ++++++++++++++++++++++++++++++++++-------
include/linux/delay.h | 1
init/calibrate.c | 54 +++++++++++++++++++-------------------
5 files changed, 81 insertions(+), 36 deletions(-)


--
Index: linux-2.6/arch/x86/include/asm/cpumask.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/cpumask.h
+++ linux-2.6/arch/x86/include/asm/cpumask.h
@@ -6,6 +6,7 @@
extern cpumask_var_t cpu_callin_mask;
extern cpumask_var_t cpu_callout_mask;
extern cpumask_var_t cpu_initialized_mask;
+extern cpumask_var_t cpu_calibrated_delay_mask;
extern cpumask_var_t cpu_sibling_setup_mask;

extern void setup_cpu_local_masks(void);
Index: linux-2.6/arch/x86/kernel/cpu/common.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/common.c
+++ linux-2.6/arch/x86/kernel/cpu/common.c
@@ -45,6 +45,7 @@
cpumask_var_t cpu_initialized_mask;
cpumask_var_t cpu_callout_mask;
cpumask_var_t cpu_callin_mask;
+cpumask_var_t cpu_calibrated_delay_mask;

/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
@@ -58,6 +59,8 @@ void __init setup_cpu_local_masks(void)
alloc_bootmem_cpumask_var(&cpu_callin_mask);
set_bootmem_name("cpu_callout_mask");
alloc_bootmem_cpumask_var(&cpu_callout_mask);
+ set_bootmem_name("cpu_calibrated_delay_mask");
+ alloc_bootmem_cpumask_var(&cpu_calibrated_delay_mask);
set_bootmem_name("cpu_sibling_setup_mask");
alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
}
Index: linux-2.6/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
#include <linux/gfp.h>

#include <asm/acpi.h>
+#include <asm/cpumask.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
@@ -210,15 +211,7 @@ static void __cpuinit smp_callin(void)
* Need to setup vector mappings before we enable interrupts.
*/
setup_vector_irq(smp_processor_id());
- /*
- * Get our bogomips.
- *
- * Need to enable IRQs because it can take longer and then
- * the NMI watchdog might kill us.
- */
- local_irq_enable();
- calibrate_delay();
- local_irq_disable();
+
pr_debug("Stack at about %p\n", &cpuid);

/*
@@ -1038,6 +1031,8 @@ void __init native_smp_prepare_cpus(unsi
}
set_cpu_sibling_map(0);

+ /* already called earlier for boot cpu */
+ cpumask_set_cpu(0, cpu_calibrated_delay_mask);

if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1126,8 +1121,53 @@ void __init native_smp_prepare_boot_cpu(
per_cpu(cpu_state, me) = CPU_ONLINE;
}

+static void __cpuinit calibrate_delay_fn(void *info)
+{
+ int cpu = smp_processor_id();
+
+ cpu_data(cpu).loops_per_jiffy = __calibrate_delay(cpu);
+ cpumask_set_cpu(cpu, cpu_calibrated_delay_mask);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int __cpuinit
+cal_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ smp_call_function_single(cpu, calibrate_delay_fn, NULL, 1);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static __cpuinitdata struct notifier_block __cpuinitdata cal_cpu_nfb = {
+ .notifier_call = cal_cpu_callback
+};
+
+static void __init register_cal_cpu_nfb(void)
+{
+ register_cpu_notifier(&cal_cpu_nfb);
+}
+#else
+static void __init register_cal_cpu_nfb(void)
+{
+}
+#endif
+
void __init native_smp_cpus_done(unsigned int max_cpus)
{
+ smp_call_function_many(cpu_online_mask, calibrate_delay_fn, NULL, 0);
+ while (cpumask_weight(cpu_calibrated_delay_mask) != num_online_cpus()) {
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
+ register_cal_cpu_nfb();
+
pr_debug("Boot done.\n");

impress_friends();
Index: linux-2.6/include/linux/delay.h
===================================================================
--- linux-2.6.orig/include/linux/delay.h
+++ linux-2.6/include/linux/delay.h
@@ -43,6 +43,7 @@ static inline void ndelay(unsigned long

extern unsigned long lpj_fine;
void calibrate_delay(void);
+unsigned long __calibrate_delay(int cpu);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);
void usleep_range(unsigned long min, unsigned long max);
Index: linux-2.6/init/calibrate.c
===================================================================
--- linux-2.6.orig/init/calibrate.c
+++ linux-2.6/init/calibrate.c
@@ -31,7 +31,7 @@ __setup("lpj=", lpj_setup);
#define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100))
#define MAX_DIRECT_CALIBRATION_RETRIES 5

-static unsigned long __cpuinit calibrate_delay_direct(void)
+static unsigned long __cpuinit calibrate_delay_direct(int cpu)
{
unsigned long pre_start, start, post_start;
unsigned long pre_end, end, post_end;
@@ -134,15 +134,9 @@ static unsigned long __cpuinit calibrate
good_timer_count = 0;
if ((measured_times[max] - estimate) <
(estimate - measured_times[min])) {
- printk(KERN_NOTICE "calibrate_delay_direct() dropping "
- "min bogoMips estimate %d = %lu\n",
- min, measured_times[min]);
measured_times[min] = 0;
min = max;
} else {
- printk(KERN_NOTICE "calibrate_delay_direct() dropping "
- "max bogoMips estimate %d = %lu\n",
- max, measured_times[max]);
measured_times[max] = 0;
max = min;
}
@@ -160,9 +154,9 @@ static unsigned long __cpuinit calibrate

}

- printk(KERN_NOTICE "calibrate_delay_direct() failed to get a good "
+ printk(KERN_NOTICE "CPU%d: calibrate_delay_direct() failed to get a good "
"estimate for loops_per_jiffy.\nProbably due to long platform "
- "interrupts. Consider using \"lpj=\" boot option.\n");
+ "interrupts. Consider using \"lpj=\" boot option.\n", cpu);
return 0;
}
#else
@@ -246,32 +240,38 @@ recalibrate:

static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 };

-void __cpuinit calibrate_delay(void)
+static void __cpuinit print_lpj(int cpu, char *str, unsigned long lpj)
+{
+ pr_info("CPU%d: Calibrating delay%s"
+ "%lu.%02lu BogoMIPS (lpj=%lu)\n", cpu, str,
+ lpj/(500000/HZ), (lpj/(5000/HZ)) % 100, lpj);
+}
+
+unsigned long __cpuinit __calibrate_delay(int cpu)
{
unsigned long lpj;
- int this_cpu = smp_processor_id();

- if (per_cpu(cpu_loops_per_jiffy, this_cpu)) {
- lpj = per_cpu(cpu_loops_per_jiffy, this_cpu);
- pr_info("Calibrating delay loop (skipped) "
- "already calibrated this CPU");
+ if (per_cpu(cpu_loops_per_jiffy, cpu)) {
+ lpj = per_cpu(cpu_loops_per_jiffy, cpu);
+ print_lpj(cpu, " (skipped) already calibrated this CPU ", lpj);
} else if (preset_lpj) {
lpj = preset_lpj;
- pr_info("Calibrating delay loop (skipped) preset value.. ");
- } else if ((this_cpu == 0) && lpj_fine) {
+ print_lpj(cpu, " (skipped) preset value.. ", lpj);
+ } else if ((cpu == 0) && lpj_fine) {
lpj = lpj_fine;
- pr_info("Calibrating delay loop (skipped), "
- "value calculated using timer frequency.. ");
- } else if ((lpj = calibrate_delay_direct()) != 0) {
- pr_info("Calibrating delay using timer specific routine.. ");
+ print_lpj(cpu, " loop (skipped), value calculated using timer frequency.. ", lpj);
+ } else if ((lpj = calibrate_delay_direct(cpu)) != 0) {
+ print_lpj(cpu, " using timer specific routine.. ", lpj);
} else {
- pr_info("Calibrating delay loop... ");
lpj = calibrate_delay_converge();
+ print_lpj(cpu, " delay loop... ", lpj);
}
- per_cpu(cpu_loops_per_jiffy, this_cpu) = lpj;
- pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
- lpj/(500000/HZ),
- (lpj/(5000/HZ)) % 100, lpj);
+ per_cpu(cpu_loops_per_jiffy, cpu) = lpj;

- loops_per_jiffy = lpj;
+ return lpj;
+}
+
+void __cpuinit calibrate_delay(void)
+{
+ loops_per_jiffy = __calibrate_delay(smp_processor_id());
}