Re: [RFC 2/2] Make x86 calibrate_delay run in parallel.

From: Yinghai Lu
Date: Thu Mar 31 2011 - 00:46:52 EST


On Tue, Dec 14, 2010 at 5:58 PM, <Robin@xxxxxxx> wrote:
>
> On a 4096 cpu machine, we noticed that 318 seconds were taken for bringing
> up the cpus.  By specifying lpj=<value>, we reduced that to 75 seconds.
> Andi Kleen suggested we rework the calibrate_delay calls to run in
> parallel.  With that code in place, a test boot of the same machine took
> 61 seconds to bring the cpus up.  I am not sure how we beat the lpj=
> case, but it did outperform.
>
> One thing to note is the total BogoMIPS value is also consistently higher.
> I am wondering if this is an effect with the cores being in performance
> mode.  I did notice that the parallel calibrate_delay calls did cause the
> fans on the machine to ramp up to full speed where the normal sequential
> calls did not cause them to budge at all.

Please check the attached patch, which should calibrate correctly.

Thanks

Yinghai
[PATCH -v2] x86: Make calibrate_delay run in parallel.

On a 4096 cpu machine, we noticed that 318 seconds were taken for bringing
up the cpus. By specifying lpj=<value>, we reduced that to 75 seconds.
Andi Kleen suggested we rework the calibrate_delay calls to run in
parallel.

-v2: from Yinghai
Two paths: one for the initial boot cpus, and one for hotplug cpus.
Initial path:
after all cpus have booted up and entered idle, use smp_call_function_many
to let every AP call __calibrate_delay.
We cannot put calibrate_delay after local_irq_enable
in start_secondary(): at that time the cpu could be involved
with perf_event while nmi_watchdog is being enabled, and that will cause
strange calibration results.
Hotplug path:
a cpu notifier calls __calibrate_delay on the newly onlined cpu
via smp_call_function_single.
Add __calibrate_delay instead of changing calibrate_delay all over.
Use cpu_calibrated_delay_mask instead...
Use print_lpj to make the printed line complete.

Signed-off-by: Robin Holt <holt@xxxxxxx>
To: Andi Kleen <andi@xxxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Yinghai Lu <yinghai@xxxxxxxxxx>

---
arch/x86/include/asm/cpumask.h | 1
arch/x86/kernel/cpu/common.c | 2 +
arch/x86/kernel/smpboot.c | 58 ++++++++++++++++++++++++++++++++++-------
include/linux/delay.h | 1
init/calibrate.c | 44 +++++++++++++++----------------
5 files changed, 75 insertions(+), 31 deletions(-)


--
Index: linux-2.6/arch/x86/include/asm/cpumask.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/cpumask.h
+++ linux-2.6/arch/x86/include/asm/cpumask.h
@@ -6,6 +6,7 @@
extern cpumask_var_t cpu_callin_mask;
extern cpumask_var_t cpu_callout_mask;
extern cpumask_var_t cpu_initialized_mask;
+extern cpumask_var_t cpu_calibrated_delay_mask;
extern cpumask_var_t cpu_sibling_setup_mask;

extern void setup_cpu_local_masks(void);
Index: linux-2.6/arch/x86/kernel/cpu/common.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/common.c
+++ linux-2.6/arch/x86/kernel/cpu/common.c
@@ -45,6 +45,7 @@
cpumask_var_t cpu_initialized_mask;
cpumask_var_t cpu_callout_mask;
cpumask_var_t cpu_callin_mask;
+cpumask_var_t cpu_calibrated_delay_mask;

/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
@@ -55,6 +56,7 @@ void __init setup_cpu_local_masks(void)
alloc_bootmem_cpumask_var(&cpu_initialized_mask);
alloc_bootmem_cpumask_var(&cpu_callin_mask);
alloc_bootmem_cpumask_var(&cpu_callout_mask);
+ alloc_bootmem_cpumask_var(&cpu_calibrated_delay_mask);
alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
}

Index: linux-2.6/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
#include <linux/gfp.h>

#include <asm/acpi.h>
+#include <asm/cpumask.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
@@ -207,15 +208,7 @@ static void __cpuinit smp_callin(void)
* Need to setup vector mappings before we enable interrupts.
*/
setup_vector_irq(smp_processor_id());
- /*
- * Get our bogomips.
- *
- * Need to enable IRQs because it can take longer and then
- * the NMI watchdog might kill us.
- */
- local_irq_enable();
- calibrate_delay();
- local_irq_disable();
+
pr_debug("Stack at about %p\n", &cpuid);

/*
@@ -1037,6 +1030,8 @@ void __init native_smp_prepare_cpus(unsi
}
set_cpu_sibling_map(0);

+ /* already called earlier for boot cpu */
+ cpumask_set_cpu(0, cpu_calibrated_delay_mask);

if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
@@ -1125,8 +1120,53 @@ void __init native_smp_prepare_boot_cpu(
per_cpu(cpu_state, me) = CPU_ONLINE;
}

+static void __cpuinit calibrate_delay_fn(void *info)
+{
+ int cpu = smp_processor_id();
+
+ cpu_data(cpu).loops_per_jiffy = __calibrate_delay(cpu, loops_per_jiffy);
+ cpumask_set_cpu(cpu, cpu_calibrated_delay_mask);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int __cpuinit
+cal_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ smp_call_function_single(cpu, calibrate_delay_fn, NULL, 1);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static __cpuinitdata struct notifier_block __cpuinitdata cal_cpu_nfb = {
+ .notifier_call = cal_cpu_callback
+};
+
+static void __init register_cal_cpu_nfb(void)
+{
+ register_cpu_notifier(&cal_cpu_nfb);
+}
+#else
+static void __init register_cal_cpu_nfb(void)
+{
+}
+#endif
+
void __init native_smp_cpus_done(unsigned int max_cpus)
{
+ smp_call_function_many(cpu_online_mask, calibrate_delay_fn, NULL, 0);
+ while (cpumask_weight(cpu_calibrated_delay_mask) != num_online_cpus()) {
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
+ register_cal_cpu_nfb();
+
pr_debug("Boot done.\n");

impress_friends();
Index: linux-2.6/include/linux/delay.h
===================================================================
--- linux-2.6.orig/include/linux/delay.h
+++ linux-2.6/include/linux/delay.h
@@ -43,6 +43,7 @@ static inline void ndelay(unsigned long

extern unsigned long lpj_fine;
void calibrate_delay(void);
+unsigned long __calibrate_delay(int cpu, unsigned long lpj);
void msleep(unsigned int msecs);
unsigned long msleep_interruptible(unsigned int msecs);
void usleep_range(unsigned long min, unsigned long max);
Index: linux-2.6/init/calibrate.c
===================================================================
--- linux-2.6.orig/init/calibrate.c
+++ linux-2.6/init/calibrate.c
@@ -183,32 +183,32 @@ recalibrate:
return lpj;
}

-void __cpuinit calibrate_delay(void)
+static void __cpuinit print_lpj(int cpu, char *str, unsigned long lpj)
{
- static bool printed;
+ pr_info("CPU%d: Calibrating delay%s"
+ "%lu.%02lu BogoMIPS (lpj=%lu)\n", cpu, str,
+ lpj/(500000/HZ), (lpj/(5000/HZ)) % 100, lpj);
+}

+unsigned long __cpuinit __calibrate_delay(int cpu, unsigned long lpj)
+{
if (preset_lpj) {
- loops_per_jiffy = preset_lpj;
- if (!printed)
- pr_info("Calibrating delay loop (skipped) "
- "preset value.. ");
- } else if ((!printed) && lpj_fine) {
- loops_per_jiffy = lpj_fine;
- pr_info("Calibrating delay loop (skipped), "
- "value calculated using timer frequency.. ");
- } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) {
- if (!printed)
- pr_info("Calibrating delay using timer "
- "specific routine.. ");
+ lpj = preset_lpj;
+ print_lpj(cpu, " loop (skipped) preset value..", lpj);
+ } else if ((cpu == 0) && lpj_fine) {
+ lpj = lpj_fine;
+ print_lpj(cpu, " loop (skipped), value calculated using timer frequency.. ", lpj);
+ } else if ((lpj = calibrate_delay_direct()) != 0) {
+ print_lpj(cpu, " using timer specific routine.. ", lpj);
} else {
- if (!printed)
- pr_info("Calibrating delay loop... ");
- loops_per_jiffy = calibrate_delay_converge();
+ lpj = calibrate_delay_converge();
+ print_lpj(cpu, " loop ... ", lpj);
}
- if (!printed)
- pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
- loops_per_jiffy/(500000/HZ),
- (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy);

- printed = true;
+ return lpj;
+}
+
+void __cpuinit calibrate_delay(void)
+{
+ loops_per_jiffy = __calibrate_delay(0, loops_per_jiffy);
}