[PATCH 4/5] MIPS: perf: Allocate per-core counters on demand

From: Matt Redfearn
Date: Tue Apr 03 2018 - 08:35:45 EST


Previously when performance counters are per-core, rather than
per-thread, the number available were divided by 2 on detection, and the
counters used by each thread in a core were "swizzled" to ensure
separation. However, this solution is suboptimal since it relies on a
couple of assumptions:
a) Always having 2 VPEs / core (number of counters was divided by 2)
b) Always having a number of counters implemented in the core that is
divisible by 2. For instance if an SoC implementation had a single
counter and 2 VPEs per core, then this logic would fail and no
performance counters would be available.
The mechanism also does not allow for one VPE in a core using more than
it's allocation of the per-core counters to count multiple events even
though other VPEs may not be using them.

Fix this situation by instead allocating (and releasing) per-core
performance counters when they are requested. This approach removes the
above assumptions and fixes the shortcomings.

In order to do this:
Add additional logic to mipsxx_pmu_alloc_counter() to detect if a
sibling is using a per-core counter, and to allocate a per-core counter
in all sibling CPUs.
Similarly, add a mipsxx_pmu_free_counter() function to release a
per-core counter in all sibling CPUs when it is finished with.
A new spinlock, core_counters_lock, is introduced to ensure exclusivity
when allocating and releasing per-core counters.
Since counters are now allocated per-core on demand, rather than being
reserved per-thread at boot, all of the "swizzling" of counters is
removed.

The upshot is that in an SoC with 2 counters / thread, counters are
reported as:
Performance counters: mips/interAptiv PMU enabled, 2 32-bit counters
available to each CPU, irq 18

Running an instance of a test program on each of 2 threads in a
core, both threads can use their 2 counters to count 2 events:

taskset 4 perf stat -e instructions:u,branches:u ./test_prog & taskset 8
perf stat -e instructions:u,branches:u ./test_prog

Performance counter stats for './test_prog':

30002 instructions:u
10000 branches:u

0.005164264 seconds time elapsed
Performance counter stats for './test_prog':

30002 instructions:u
10000 branches:u

0.006139975 seconds time elapsed

In an SoC with 2 counters / core (which can be forced by setting
cpu_has_mipsmt_pertccounters = 0), counters are reported as:
Performance counters: mips/interAptiv PMU enabled, 2 32-bit counters
available to each core, irq 18

Running an instance of a test program on each of 2 threads in a
core, now only one thread manages to secure the performance counters to
count 2 events. The other thread does not get any counters.

taskset 4 perf stat -e instructions:u,branches:u ./test_prog & taskset 8
perf stat -e instructions:u,branches:u ./test_prog

Performance counter stats for './test_prog':

<not counted> instructions:u
<not counted> branches:u

0.005179533 seconds time elapsed

Performance counter stats for './test_prog':

30002 instructions:u
10000 branches:u

0.005179467 seconds time elapsed

Signed-off-by: Matt Redfearn <matt.redfearn@xxxxxxxx>
---

arch/mips/kernel/perf_event_mipsxx.c | 130 ++++++++++++++++++++++++-----------
1 file changed, 88 insertions(+), 42 deletions(-)

diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 782a1b6c6352..bedb0d5ff3f2 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -131,6 +131,8 @@ static struct mips_pmu mipspmu;
#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
static int cpu_has_mipsmt_pertccounters;

+static DEFINE_SPINLOCK(core_counters_lock);
+
static DEFINE_RWLOCK(pmuint_rwlock);

#if defined(CONFIG_CPU_BMIPS5000)
@@ -141,20 +143,6 @@ static DEFINE_RWLOCK(pmuint_rwlock);
0 : cpu_vpe_id(&current_cpu_data))
#endif

-/* Copied from op_model_mipsxx.c */
-static unsigned int vpe_shift(void)
-{
- if (num_possible_cpus() > 1)
- return 1;
-
- return 0;
-}
-
-static unsigned int counters_total_to_per_cpu(unsigned int counters)
-{
- return counters >> vpe_shift();
-}
-
#else /* !CONFIG_MIPS_PERF_SHARED_TC_COUNTERS */
#define vpe_id() 0

@@ -165,17 +153,8 @@ static void pause_local_counters(void);
static irqreturn_t mipsxx_pmu_handle_irq(int, void *);
static int mipsxx_pmu_handle_shared_irq(void);

-static unsigned int mipsxx_pmu_swizzle_perf_idx(unsigned int idx)
-{
- if (vpe_id() == 1)
- idx = (idx + 2) & 3;
- return idx;
-}
-
static u64 mipsxx_pmu_read_counter(unsigned int idx)
{
- idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
switch (idx) {
case 0:
/*
@@ -197,8 +176,6 @@ static u64 mipsxx_pmu_read_counter(unsigned int idx)

static u64 mipsxx_pmu_read_counter_64(unsigned int idx)
{
- idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
switch (idx) {
case 0:
return read_c0_perfcntr0_64();
@@ -216,8 +193,6 @@ static u64 mipsxx_pmu_read_counter_64(unsigned int idx)

static void mipsxx_pmu_write_counter(unsigned int idx, u64 val)
{
- idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
switch (idx) {
case 0:
write_c0_perfcntr0(val);
@@ -236,8 +211,6 @@ static void mipsxx_pmu_write_counter(unsigned int idx, u64 val)

static void mipsxx_pmu_write_counter_64(unsigned int idx, u64 val)
{
- idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
switch (idx) {
case 0:
write_c0_perfcntr0_64(val);
@@ -256,8 +229,6 @@ static void mipsxx_pmu_write_counter_64(unsigned int idx, u64 val)

static unsigned int mipsxx_pmu_read_control(unsigned int idx)
{
- idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
switch (idx) {
case 0:
return read_c0_perfctrl0();
@@ -275,8 +246,6 @@ static unsigned int mipsxx_pmu_read_control(unsigned int idx)

static void mipsxx_pmu_write_control(unsigned int idx, unsigned int val)
{
- idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
switch (idx) {
case 0:
write_c0_perfctrl0(val);
@@ -296,7 +265,7 @@ static void mipsxx_pmu_write_control(unsigned int idx, unsigned int val)
static int mipsxx_pmu_alloc_counter(struct cpu_hw_events *cpuc,
struct hw_perf_event *hwc)
{
- int i;
+ int i, cpu = smp_processor_id();

/*
* We only need to care the counter mask. The range has been
@@ -315,14 +284,88 @@ static int mipsxx_pmu_alloc_counter(struct cpu_hw_events *cpuc,
* they can be dynamically swapped, they both feel happy.
* But here we leave this issue alone for now.
*/
- if (test_bit(i, &cntr_mask) &&
- !test_and_set_bit(i, cpuc->used_mask))
+ if (!test_bit(i, &cntr_mask))
+ continue;
+
+#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
+ /*
+ * When counters are per-core, check for use and allocate
+ * them in all sibling CPUs.
+ */
+ if (!cpu_has_mipsmt_pertccounters) {
+ struct cpu_hw_events *sibling_cpuc;
+ int sibling_cpu, allocated = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&core_counters_lock, flags);
+
+ for_each_cpu(sibling_cpu, &cpu_sibling_map[cpu]) {
+ sibling_cpuc = per_cpu_ptr(&cpu_hw_events,
+ sibling_cpu);
+
+ if (test_bit(i, sibling_cpuc->used_mask)) {
+ pr_debug("CPU%d already using core counter %d\n",
+ sibling_cpu, i);
+ goto next_counter;
+ }
+ }
+
+ /* Counter i is not used by any siblings - use it */
+ allocated = 1;
+ for_each_cpu(sibling_cpu, &cpu_sibling_map[cpu]) {
+ sibling_cpuc = per_cpu_ptr(&cpu_hw_events,
+ sibling_cpu);
+
+ set_bit(i, sibling_cpuc->used_mask);
+ /* sibling is using the counter */
+ pr_debug("CPU%d now using core counter %d\n",
+ sibling_cpu, i);
+ }
+next_counter:
+ spin_unlock_irqrestore(&core_counters_lock, flags);
+ if (allocated)
+ return i;
+ }
+ else
+#endif
+ if (!test_and_set_bit(i, cpuc->used_mask)) {
+ pr_debug("CPU%d now using counter %d\n", cpu, i);
return i;
+ }
}

return -EAGAIN;
}

+static void mipsxx_pmu_free_counter(struct cpu_hw_events *cpuc,
+ struct hw_perf_event *hwc)
+{
+#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
+ int sibling_cpu, cpu = smp_processor_id();
+
+ /* When counters are per-core, free them in all sibling CPUs */
+ if (!cpu_has_mipsmt_pertccounters) {
+ struct cpu_hw_events *sibling_cpuc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&core_counters_lock, flags);
+
+ for_each_cpu(sibling_cpu, &cpu_sibling_map[cpu]) {
+ sibling_cpuc = per_cpu_ptr(&cpu_hw_events, sibling_cpu);
+
+ clear_bit(hwc->idx, sibling_cpuc->used_mask);
+ pr_debug("CPU%d released core counter %d\n",
+ sibling_cpu, hwc->idx);
+ }
+
+ spin_unlock_irqrestore(&core_counters_lock, flags);
+ return;
+ }
+#endif
+ pr_debug("CPU%d released counter %d\n", cpu, hwc->idx);
+ clear_bit(hwc->idx, cpuc->used_mask);
+}
+
static void mipsxx_pmu_enable_event(struct hw_perf_event *evt, int idx)
{
struct perf_event *event = container_of(evt, struct perf_event, hw);
@@ -510,7 +553,7 @@ static void mipspmu_del(struct perf_event *event, int flags)

mipspmu_stop(event, PERF_EF_UPDATE);
cpuc->events[idx] = NULL;
- clear_bit(idx, cpuc->used_mask);
+ mipsxx_pmu_free_counter(cpuc, hwc);

perf_event_update_userpage(event);
}
@@ -1735,8 +1778,6 @@ init_hw_perf_events(void)

#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
cpu_has_mipsmt_pertccounters = probe_mipsmt_pertccounters();
- if (!cpu_has_mipsmt_pertccounters)
- counters = counters_total_to_per_cpu(counters);
#endif

if (get_c0_perfcount_int)
@@ -1860,9 +1901,14 @@ init_hw_perf_events(void)

on_each_cpu(reset_counters, (void *)(long)counters, 1);

- pr_cont("%s PMU enabled, %d %d-bit counters available to each "
- "CPU, irq %d%s\n", mipspmu.name, counters, counter_bits, irq,
- irq < 0 ? " (share with timer interrupt)" : "");
+ pr_cont("%s PMU enabled, %d %d-bit counters available to each %s, irq %d%s\n",
+ mipspmu.name, counters, counter_bits,
+#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
+ cpu_has_mipsmt_pertccounters ? "CPU" : "core",
+#else
+ "CPU",
+#endif
+ irq, irq < 0 ? " (share with timer interrupt)" : "");

perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);

--
2.7.4