Re: [PATCH 2/4] x86: fix cpu_mask_to_apicid_and to include cpu_online_mask

From: Rusty Russell
Date: Fri Dec 12 2008 - 06:06:57 EST


On Thursday 11 December 2008 21:58:08 Mike Travis wrote:
> Impact: fix potential problem.
>
> In determining the destination apicid, there are usually three cpumasks
> that are considered: the incoming cpumask arg, cfg->domain and the
> cpu_online_mask. Since we are just introducing the cpu_mask_to_apicid_and
> function, make sure it includes the cpu_online_mask in it's evaluation.

Yerk. Can we really "fail" cpu_mask_to_apicid_and with no repercussions?
And does it make sense to try to fix this there?

This is not a new problem with the cpumask patches is it? I toyed with a
patch which converts flush_tlb_others, and it actually ensures that those
cases never hand an offline mask to cpu_mask_to_apicid_and as a side
effect (others still might).

Patch below for your reading (x86:flush_tlb_others-cpumask-ptr.patch in
my series file).

Rusty.

x86: change flush_tlb_others to take a const struct cpumask *. FIXME: REVIEW

This is made a little more tricky by uv_flush_tlb_others which
actually alters its argument, for an IPI to be sent to the remaining
cpus in the mask.

I solve this by allocating a cpumask_var_t for this case and falling back
to IPI should this fail.

To eliminate temporaries in the caller, all flush_tlb_others implementations
now do the this-cpu-elimination step themselves.

Note also the curious "cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask)"
which has been there since pre-git and yet f->flush_cpumask is always zero
at this point.

diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_codiff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -244,7 +244,8 @@ struct pv_mmu_ops {
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr);
- void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
+ void (*flush_tlb_others)(const struct cpumask *cpus,
+ struct mm_struct *mm,
unsigned long va);

/* Hooks for allocating and freeing a pagetable top-level */
@@ -984,10 +985,11 @@ static inline void __flush_tlb_single(un
PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
}

-static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+static inline void flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm,
unsigned long va)
{
- PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
+ PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
}

static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -113,7 +113,7 @@ static inline void flush_tlb_range(struc
__flush_tlb();
}

-static inline void native_flush_tlb_others(const cpumask_t *cpumask,
+static inline void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long va)
{
@@ -142,8 +142,8 @@ static inline void flush_tlb_range(struc
flush_tlb_mm(vma->vm_mm);
}

-void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
- unsigned long va);
+void native_flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm, unsigned long va);

#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -325,7 +325,8 @@ static inline void bau_cpubits_clear(str
#define cpubit_isset(cpu, bau_local_cpumask) \
test_bit((cpu), (bau_local_cpumask).bits)

-extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
+extern int uv_flush_tlb_others(const struct cpumask *,
+ struct mm_struct *, unsigned long);
extern void uv_bau_message_intr1(void);
extern void uv_bau_timeout_intr1(void);

diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -20,7 +20,7 @@ DEFINE_PER_CPU(struct tlb_state, cpu_tlb
* Optimizations Manfred Spraul <manfred@xxxxxxxxxxxxxxxx>
*/

-static cpumask_t flush_cpumask;
+static cpumask_var_t flush_cpumask;
static struct mm_struct *flush_mm;
static unsigned long flush_va;
static DEFINE_SPINLOCK(tlbstate_lock);
@@ -93,7 +94,7 @@ void smp_invalidate_interrupt(struct pt_

cpu = get_cpu();

- if (!cpu_isset(cpu, flush_cpumask))
+ if (!cpumask_test_cpu(cpu, flush_cpumask))
goto out;
/*
* This was a BUG() but until someone can quote me the
@@ -115,34 +116,21 @@ void smp_invalidate_interrupt(struct pt_
}
ack_APIC_irq();
smp_mb__before_clear_bit();
- cpu_clear(cpu, flush_cpumask);
+ cpumask_clear_cpu(cpu, flush_cpumask);
smp_mb__after_clear_bit();
out:
put_cpu_no_resched();
__get_cpu_var(irq_stat).irq_tlb_count++;
}

-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
+void native_flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm, unsigned long va)
{
- cpumask_t cpumask = *cpumaskp;
-
/*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
* - mask must exist :)
*/
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+ BUG_ON(cpumask_empty(cpumask));
BUG_ON(!mm);
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (unlikely(cpus_empty(cpumask)))
- return;
-#endif

/*
* i'm not happy about this global shared spinlock in the
@@ -151,9 +139,17 @@ void native_flush_tlb_others(const cpuma
*/
spin_lock(&tlbstate_lock);

+ cpumask_andnot(flush_cpumask, cpumask, cpumask_of(smp_processor_id()));
+#ifdef CONFIG_HOTPLUG_CPU
+ /* If a CPU which we ran on has gone down, OK. */
+ cpumask_and(flush_cpumask, flush_cpumask, cpu_online_mask);
+ if (unlikely(cpumask_empty(flush_cpumask))) {
+ spin_unlock(&tlbstate_lock);
+ return;
+ }
+#endif
flush_mm = mm;
flush_va = va;
- cpus_or(flush_cpumask, cpumask, flush_cpumask);

/*
* Make the above memory operations globally visible before
@@ -164,9 +160,9 @@ void native_flush_tlb_others(const cpuma
* We have to send the IPI only to
* CPUs affected.
*/
- send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
+ send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR);

- while (!cpus_empty(flush_cpumask))
+ while (!cpumask_empty(flush_cpumask))
/* nothing. lockup detection does not belong here */
cpu_relax();

@@ -178,25 +174,17 @@ void flush_tlb_current_task(void)
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;

preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
preempt_enable();
}

void flush_tlb_mm(struct mm_struct *mm)
{
- cpumask_t cpu_mask;
-
preempt_disable();
- cpu_mask = *mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);

if (current->active_mm == mm) {
if (current->mm)
@@ -204,8 +192,8 @@ void flush_tlb_mm(struct mm_struct *mm)
else
leave_mm(smp_processor_id());
}
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);

preempt_enable();
}
@@ -213,12 +201,8 @@ void flush_tlb_page(struct vm_area_struc
void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
{
struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;

preempt_disable();
- cpu_mask = *mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
if (current->active_mm == mm) {
if (current->mm)
__flush_tlb_one(va);
@@ -226,9 +210,8 @@ void flush_tlb_page(struct vm_area_struc
leave_mm(smp_processor_id());
}

- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
-
+ if (cpumask_any_but(mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm->cpu_vm_mask, mm, va);
preempt_enable();
}
EXPORT_SYMBOL(flush_tlb_page);
@@ -255,3 +238,9 @@ void reset_lazy_tlbstate(void)
per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
}

+static int init_flush_cpumask(void)
+{
+ alloc_cpumask_var(&flush_cpumask, GFP_KERNEL);
+ return 0;
+}
+early_initcall(init_flush_cpumask);
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -43,10 +43,10 @@

union smp_flush_state {
struct {
- cpumask_t flush_cpumask;
struct mm_struct *flush_mm;
unsigned long flush_va;
spinlock_t tlbstate_lock;
+ DECLARE_BITMAP(flush_cpumask, NR_CPUS);
};
char pad[SMP_CACHE_BYTES];
} ____cacheline_aligned;
@@ -131,7 +131,7 @@ asmlinkage void smp_invalidate_interrupt
sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
f = &per_cpu(flush_state, sender);

- if (!cpu_isset(cpu, f->flush_cpumask))
+ if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
goto out;
/*
* This was a BUG() but until someone can quote me the
@@ -153,19 +153,15 @@ asmlinkage void smp_invalidate_interrupt
}
out:
ack_APIC_irq();
- cpu_clear(cpu, f->flush_cpumask);
+ cpumask_clear(cpu, to_cpumask(f->flush_cpumask));
add_pda(irq_tlb_count, 1);
}

-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
+static void flush_tlb_others_ipi(const struct cpumask *cpumask,
+ struct mm_struct *mm, unsigned long va)
{
int sender;
union smp_flush_state *f;
- cpumask_t cpumask = *cpumaskp;
-
- if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
- return;

/* Caller has disabled preemption */
sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
@@ -180,7 +176,8 @@ void native_flush_tlb_others(const cpuma

f->flush_mm = mm;
f->flush_va = va;
- cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
+ cpumask_andnot(to_cpumask(f->flush_cpumask),
+ cpumask, cpumask_of(smp_processor_id));

/*
* Make the above memory operations globally visible before
@@ -191,14 +188,32 @@ void native_flush_tlb_others(const cpuma
* We have to send the IPI only to
* CPUs affected.
*/
- send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);

- while (!cpus_empty(f->flush_cpumask))
+ while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
cpu_relax();

f->flush_mm = NULL;
f->flush_va = 0;
spin_unlock(&f->tlbstate_lock);
+
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm, unsigned long va)
+{
+ if (is_uv_system()) {
+ cpumask_var_t after_uv_flush;
+
+ if (alloc_cpumask_var(&after_uv_flush, GFP_ATOMIC)) {
+ cpumask_andnot(after_uv_flush,
+ cpumask, cpumask_of(smp_processor_id()));
+ if (!uv_flush_tlb_others(after_uv_flush, mm, va))
+ flush_tlb_others_ipi(after_uv_flush, mm, va);
+ free_cpumask_var(after_uv_flush);
+ return;
+ }
+ }
+ flush_tlb_others_ipi(cpumask, mm, va);
}

static int __cpuinit init_smp_flush(void)
@@ -215,34 +230,26 @@ void flush_tlb_current_task(void)
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;

preempt_disable();
- cpu_mask = *mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
preempt_enable();
}

void flush_tlb_mm(struct mm_struct *mm)
{
- cpumask_t cpu_mask;

preempt_disable();
- cpu_mask = *mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
if (current->active_mm == mm) {
if (current->mm)
local_flush_tlb();
else
leave_mm(smp_processor_id());
}
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);

preempt_enable();
}
@@ -250,11 +257,8 @@ void flush_tlb_page(struct vm_area_struc
void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
{
struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;

preempt_disable();
- cpu_mask = *mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);

if (current->active_mm == mm) {
if (current->mm)
@@ -263,8 +267,8 @@ void flush_tlb_page(struct vm_area_struc
leave_mm(smp_processor_id());
}

- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
+ if (cpumask_any_but(mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm->cpu_vm_mask, mm, va);

preempt_enable();
}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -212,11 +212,11 @@ static int uv_wait_completion(struct bau
* The cpumaskp mask contains the cpus the broadcast was sent to.
*
* Returns 1 if all remote flushing was done. The mask is zeroed.
- * Returns 0 if some remote flushing remains to be done. The mask is left
- * unchanged.
+ * Returns 0 if some remote flushing remains to be done. The mask will have
+ * some bits still set.
*/
int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
- cpumask_t *cpumaskp)
+ struct cpumask *cpumaskp)
{
int completion_status = 0;
int right_shift;
@@ -263,13 +263,13 @@ int uv_flush_send_and_wait(int cpu, int
* Success, so clear the remote cpu's from the mask so we don't
* use the IPI method of shootdown on them.
*/
- for_each_cpu_mask(bit, *cpumaskp) {
+ for_each_cpu(bit, cpumaskp) {
blade = uv_cpu_to_blade_id(bit);
if (blade == this_blade)
continue;
- cpu_clear(bit, *cpumaskp);
+ cpumask_clear_cpu(bit, cpumaskp);
}
- if (!cpus_empty(*cpumaskp))
+ if (!cpumask_empty(cpumaskp))
return 0;
return 1;
}
@@ -296,7 +296,7 @@ int uv_flush_send_and_wait(int cpu, int
* Returns 1 if all remote flushing was done.
* Returns 0 if some remote flushing remains to be done.
*/
-int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
+int uv_flush_tlb_others(struct cpumask *cpumaskp, struct mm_struct *mm,
unsigned long va)
{
int i;
@@ -315,7 +315,7 @@ int uv_flush_tlb_others(cpumask_t *cpuma
bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);

i = 0;
- for_each_cpu_mask(bit, *cpumaskp) {
+ for_each_cpu(bit, cpumaskp) {
blade = uv_cpu_to_blade_id(bit);
BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
if (blade == this_blade) {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -633,35 +633,27 @@ static void xen_flush_tlb_single(unsigne
preempt_enable();
}

-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
- unsigned long va)
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+ struct mm_struct *mm, unsigned long va)
{
struct {
struct mmuext_op op;
- cpumask_t mask;
+ DECLARE_BITMAP(mask, NR_CPUS);
} *args;
- cpumask_t cpumask = *cpus;
struct multicall_space mcs;

- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+ BUG_ON(cpumask_empty(cpus));
BUG_ON(!mm);
-
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (cpus_empty(cpumask))
- return;

mcs = xen_mc_entry(sizeof(*args));
args = mcs.args;
- args->mask = cpumask;
- args->op.arg2.vcpumask = &args->mask;
+ args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+ /* Remove us, and any offline CPUS. */
+ cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+ if (unlikely(cpumask_empty(to_cpumask(args->mask))))
+ goto issue;

if (va == TLB_FLUSH_ALL) {
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
@@ -672,6 +664,7 @@ static void xen_flush_tlb_others(const c

MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

+issue:
xen_mc_issue(PARAVIRT_LAZY_MMU);
}


¢éì®&Þ~º&¶¬–+-±éÝ¥Šw®žË±Êâmébžìdz¹Þ)í…æèw*jg¬±¨¶‰šŽŠÝj/êäz¹ÞŠà2ŠÞ¨è­Ú&¢)ß«a¶Úþø®G«éh®æj:+v‰¨Šwè†Ù>Wš±êÞiÛaxPjØm¶Ÿÿà -»+ƒùdš_