Re: [PATCH v2 5/5] perf/x86/amd/power: Add AMD accumulated power reporting mechanism

From: Huang Rui
Date: Thu Jan 21 2016 - 02:04:57 EST


On Wed, Jan 20, 2016 at 10:22:44AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 20, 2016 at 12:48:24PM +0800, Huang Rui wrote:
> > Hi Peter,
> >
> > Thanks so much for your comments.
> >
> > On Tue, Jan 19, 2016 at 01:12:50PM +0100, Peter Zijlstra wrote:
> > > On Thu, Jan 14, 2016 at 10:50:08AM +0800, Huang Rui wrote:
> > > > +struct power_pmu {
> > > > + spinlock_t lock;
> > >
> > > This should be a raw_spinlock_t, as it'll be nested under other
> > > raw_spinlock_t's.
> > >
> >
> > Do you mean that the following spinlock operations run with hardware
> > interrupts disabled, so I need to use raw_spinlock_t instead, right?
>
>
>                     mainline      -rt
>
> raw_spinlock_t      spin-waits    spin-waits
> spinlock_t          spin-waits    blocks (rt-mutex)
> struct mutex        blocks        blocks (rt-mutex)
>
>
> since these functions are themselves called with raw_spinlock_t held
> (perf_event_context::lock for example, but also rq::lock), any lock
> nested inside them must also be raw_spinlock_t.
>

I see, thank you. :-)

I just took a quick look at how spinlocks behave on -rt. The realtime
kernel provides two kinds of spinlock: the original spinlock_t is
replaced by one that is able to sleep, effectively like a mutex, while
the other one (the raw_spinlock_t you mentioned here) keeps the
non-sleeping behavior, i.e. it is the real spinlock.

And the lock here will also be nested under perf_event_context::lock,
right?
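
Just to check my understanding of that constraint, here is a rough
sketch of the nesting situation (illustrative only, the names are made
up and not from the patch):

#include <linux/spinlock.h>

/* Illustrative lock, standing in for power_pmu::lock. */
static DEFINE_RAW_SPINLOCK(example_pmu_lock);

static void example_pmu_callback(void)
{
	unsigned long flags;

	/*
	 * The perf core calls pmu::{start,stop,add,del} while already
	 * holding a raw_spinlock_t (perf_event_context::lock, for
	 * example) with IRQs disabled.  On -rt a plain spinlock_t
	 * becomes a sleeping rt-mutex, so taking one here would sleep
	 * inside that atomic region; a raw_spinlock_t keeps
	 * spin-waiting and is safe to nest at this point.
	 */
	raw_spin_lock_irqsave(&example_pmu_lock, flags);
	/* short, non-sleeping critical section */
	raw_spin_unlock_irqrestore(&example_pmu_lock, flags);
}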

> I have a lockdep patch somewhere that checks these ordering things; I
> should rebase and post that again.
>

Could you CC me when you post that patch again?

> > Use raw_spin_lock_irqsave/raw_spin_unlock_irqrestore?
>
> pmu::{start,stop,add,del} will be called with IRQs already disabled.
>
> > > > +static int power_cpu_init(int cpu)
> > > > +{
> > > > + int i, cu, ret = 0;
> > > > + cpumask_var_t mask, dummy_mask;
> > > > +
> > > > + cu = cpu / cores_per_cu;
> > > > +
> > > > + if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> > > > + return -ENOMEM;
> > > > +
> > > > + if (!zalloc_cpumask_var(&dummy_mask, GFP_KERNEL)) {
> > > > + ret = -ENOMEM;
> > > > + goto out;
> > > > + }
> > > > +
> > > > + for (i = 0; i < cores_per_cu; i++)
> > > > + cpumask_set_cpu(i, mask);
> > > > +
> > > > + cpumask_shift_left(mask, mask, cu * cores_per_cu);
> > > > +
> > > > + if (!cpumask_and(dummy_mask, mask, &cpu_mask))
> > > > + cpumask_set_cpu(cpu, &cpu_mask);
> > > > +
> > > > + free_cpumask_var(dummy_mask);
> > > > +out:
> > > > + free_cpumask_var(mask);
> > > > +
> > > > + return ret;
> > > > +}
> > >
> > > > +static int power_cpu_notifier(struct notifier_block *self,
> > > > + unsigned long action, void *hcpu)
> > > > +{
> > > > + unsigned int cpu = (long)hcpu;
> > > > +
> > > > + switch (action & ~CPU_TASKS_FROZEN) {
> > > > + case CPU_UP_PREPARE:
> > > > + if (power_cpu_prepare(cpu))
> > > > + return NOTIFY_BAD;
> > > > + break;
> > > > + case CPU_STARTING:
> > > > + if (power_cpu_init(cpu))
> > > > + return NOTIFY_BAD;
> > >
> > > this is called with IRQs disabled, which makes those GFP_KERNEL allocs
> > > above a pretty bad idea.
> > >
> >
> > Right, so should I use GFP_ATOMIC to allocate cpumask here?
>
> One should not use GFP_ATOMIC if at all possible, also no, -rt cannot do
> _any_ allocations from this site.
>

OK, that's because the allocation might sleep while IRQs are disabled,
which would be incorrect.

> > > Also, note that -rt cannot actually do _any_ allocations/frees from
> > > STARTING.
> > >
> > > Please move the allocs/frees to PREPARE/ONLINE.
> > >
> >
> > How about adding two cpumask_var_t fields to the power_pmu structure?
> > Then allocate the two cpumask_var_t (pmu->mask, pmu->dummy_mask) there,
> > and they can also be used in power_cpu_init.
>
> That would work.

I drafted an updated diff based on the original patch; please take a
look.

8<--------------------------------------------------------------------------

diff --git a/arch/x86/kernel/cpu/perf_event_amd_power.c b/arch/x86/kernel/cpu/perf_event_amd_power.c
index 69ef234..e71d993 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_power.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_power.c
@@ -46,10 +46,17 @@ static unsigned int cu_num;
static u64 max_cu_acc_power;

struct power_pmu {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct list_head active_list;
struct pmu *pmu; /* pointer to power_pmu_class */
local64_t cpu_sw_pwr_ptsc;
+ /*
+ * These two cpumasks are used to avoid allocations in the
+ * CPU_STARTING phase, because power_cpu_init() is called
+ * with IRQs disabled there.
+ */
+ cpumask_var_t mask;
+ cpumask_var_t tmp_mask;
};

static struct pmu pmu_class;
@@ -126,9 +133,9 @@ static void pmu_event_start(struct perf_event *event, int mode)
struct power_pmu *pmu = __this_cpu_read(amd_power_pmu);
unsigned long flags;

- spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&pmu->lock, flags);
__pmu_event_start(pmu, event);
- spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static void pmu_event_stop(struct perf_event *event, int mode)
@@ -137,7 +144,7 @@ static void pmu_event_stop(struct perf_event *event, int mode)
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;

- spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&pmu->lock, flags);

/* mark event as deactivated and stopped */
if (!(hwc->state & PERF_HES_STOPPED)) {
@@ -155,7 +162,7 @@ static void pmu_event_stop(struct perf_event *event, int mode)
hwc->state |= PERF_HES_UPTODATE;
}

- spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static int pmu_event_add(struct perf_event *event, int mode)
@@ -164,14 +171,14 @@ static int pmu_event_add(struct perf_event *event, int mode)
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;

- spin_lock_irqsave(&pmu->lock, flags);
+ raw_spin_lock_irqsave(&pmu->lock, flags);

hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

if (mode & PERF_EF_START)
__pmu_event_start(pmu, event);

- spin_unlock_irqrestore(&pmu->lock, flags);
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);

return 0;
}
@@ -297,89 +304,71 @@ static int power_cpu_exit(int cpu)
struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
int i, cu, ret = 0;
int target = nr_cpumask_bits;
- cpumask_var_t mask, tmp_mask;

cu = cpu / cores_per_cu;

- if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
-
- if (!zalloc_cpumask_var(&tmp_mask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out;
- }
+ cpumask_clear(pmu->mask);
+ cpumask_clear(pmu->tmp_mask);

for (i = 0; i < cores_per_cu; i++)
- cpumask_set_cpu(i, mask);
+ cpumask_set_cpu(i, pmu->mask);

- cpumask_shift_left(mask, mask, cu * cores_per_cu);
+ cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);

cpumask_clear_cpu(cpu, &cpu_mask);
- cpumask_clear_cpu(cpu, mask);
+ cpumask_clear_cpu(cpu, pmu->mask);

- if (!cpumask_and(tmp_mask, mask, cpu_online_mask))
- goto out1;
+ if (!cpumask_and(pmu->tmp_mask, pmu->mask, cpu_online_mask))
+ goto out;

/*
* find a new CPU on same compute unit, if was set in cpumask
* and still some CPUs on compute unit, then move to the new
* CPU
*/
- target = cpumask_any(tmp_mask);
+ target = cpumask_any(pmu->tmp_mask);
if (target < nr_cpumask_bits && target != cpu)
cpumask_set_cpu(target, &cpu_mask);

WARN_ON(cpumask_empty(&cpu_mask));

-out1:
+out:
/*
* migrate events and context to new CPU
*/
if (target < nr_cpumask_bits)
perf_pmu_migrate_context(pmu->pmu, cpu, target);

- free_cpumask_var(tmp_mask);
-out:
- free_cpumask_var(mask);
-
return ret;

}

static int power_cpu_init(int cpu)
{
- int i, cu, ret = 0;
- cpumask_var_t mask, dummy_mask;
-
- cu = cpu / cores_per_cu;
+ struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
+ int i, cu;

- if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
+ if (!pmu)
+ return 0;

- if (!zalloc_cpumask_var(&dummy_mask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out;
- }
+ cu = cpu / cores_per_cu;

for (i = 0; i < cores_per_cu; i++)
- cpumask_set_cpu(i, mask);
+ cpumask_set_cpu(i, pmu->mask);

- cpumask_shift_left(mask, mask, cu * cores_per_cu);
+ cpumask_shift_left(pmu->mask, pmu->mask, cu * cores_per_cu);

- if (!cpumask_and(dummy_mask, mask, &cpu_mask))
+ if (!cpumask_and(pmu->tmp_mask, pmu->mask, &cpu_mask))
cpumask_set_cpu(cpu, &cpu_mask);

- free_cpumask_var(dummy_mask);
-out:
- free_cpumask_var(mask);
-
- return ret;
+ return 0;
}

static int power_cpu_prepare(int cpu)
{
struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);
int phys_id = topology_physical_package_id(cpu);
+ int ret = 0;

if (pmu)
return 0;
@@ -391,7 +380,17 @@ static int power_cpu_prepare(int cpu)
if (!pmu)
return -ENOMEM;

- spin_lock_init(&pmu->lock);
+ if (!zalloc_cpumask_var(&pmu->mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!zalloc_cpumask_var(&pmu->tmp_mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out1;
+ }
+
+ raw_spin_lock_init(&pmu->lock);

INIT_LIST_HEAD(&pmu->active_list);

@@ -400,12 +399,21 @@ static int power_cpu_prepare(int cpu)
per_cpu(amd_power_pmu, cpu) = pmu;

return 0;
+
+out1:
+ free_cpumask_var(pmu->mask);
+out:
+ kfree(pmu);
+
+ return ret;
}

static void power_cpu_kfree(int cpu)
{
struct power_pmu *pmu = per_cpu(amd_power_pmu, cpu);

+ free_cpumask_var(pmu->mask);
+ free_cpumask_var(pmu->tmp_mask);
kfree(pmu);
}