[PATCH 02/14] perf/x86: output NMI overhead

From: kan.liang
Date: Wed Nov 23 2016 - 12:45:07 EST


From: Kan Liang <kan.liang@xxxxxxxxx>

The NMI handler is one of the most significant sources of perf overhead.

Many NMIs fire during sampling, so logging every single one would be very
expensive. Instead, the handler time is accumulated, and the total time and
NMI count are output when the event is about to be disabled or the task is
scheduled out.
The newly introduced flag PERF_EF_LOG tells the pmu::del() callback to emit
the overhead log.
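
For illustration, the accounting boils down to the accumulate-and-flush
pattern below. This is only a minimal userspace sketch; the structure and
function names are made up for the example and are not the kernel API.

#include <stdio.h>
#include <stdint.h>

struct overhead_entry {
	uint64_t nr;	/* number of handler invocations */
	uint64_t time;	/* accumulated handler time, in ns */
};

static struct overhead_entry nmi_overhead;

/* Called once per (simulated) NMI: cheap, no logging. */
static void account_nmi(uint64_t ns)
{
	nmi_overhead.nr++;
	nmi_overhead.time += ns;
}

/* Called when the event is disabled or the task schedules out. */
static void flush_overhead(void)
{
	if (!nmi_overhead.nr)
		return;
	printf("NMI overhead: %llu handlers, %llu ns total\n",
	       (unsigned long long)nmi_overhead.nr,
	       (unsigned long long)nmi_overhead.time);
	nmi_overhead.nr = 0;
	nmi_overhead.time = 0;
}

int main(void)
{
	account_nmi(1200);
	account_nmi(900);
	account_nmi(1500);
	flush_overhead();	/* one record instead of three */
	return 0;
}

The per-NMI path stays cheap (two additions); only the teardown path pays
the cost of emitting a record.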

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxx>
---
arch/x86/events/core.c | 19 ++++++++++++++-
arch/x86/events/perf_event.h | 2 ++
include/linux/perf_event.h | 1 +
include/uapi/linux/perf_event.h | 2 ++
kernel/events/core.c | 54 ++++++++++++++++++++++-------------------
5 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index d31735f..6c3b0ef 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1397,6 +1397,11 @@ static void x86_pmu_del(struct perf_event *event, int flags)

perf_event_update_userpage(event);

+ if ((flags & PERF_EF_LOG) && cpuc->nmi_overhead.nr) {
+ cpuc->nmi_overhead.cpu = smp_processor_id();
+ perf_log_overhead(event, PERF_NMI_OVERHEAD, &cpuc->nmi_overhead);
+ }
+
do_del:
if (x86_pmu.del) {
/*
@@ -1475,11 +1480,21 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}

+static void
+perf_calculate_nmi_overhead(u64 time)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ cpuc->nmi_overhead.nr++;
+ cpuc->nmi_overhead.time += time;
+}
+
static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
u64 start_clock;
u64 finish_clock;
+ u64 clock;
int ret;

/*
@@ -1492,8 +1507,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
start_clock = sched_clock();
ret = x86_pmu.handle_irq(regs);
finish_clock = sched_clock();
+ clock = finish_clock - start_clock;

- perf_sample_event_took(finish_clock - start_clock);
+ perf_calculate_nmi_overhead(clock);
+ perf_sample_event_took(clock);

return ret;
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5874d8d..c6b25ac 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -230,6 +230,8 @@ struct cpu_hw_events {
struct intel_excl_cntrs *excl_cntrs;
int excl_thread_id; /* 0 or 1 */

+ struct perf_overhead_entry nmi_overhead; /* accumulated NMI handler overhead */
+
/*
* AMD specific bits
*/
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3a530a7..632647f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -313,6 +313,7 @@ struct pmu {
#define PERF_EF_START 0x01 /* start the counter when adding */
#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
+#define PERF_EF_LOG 0x08 /* log the overhead when removing */

/*
* Adds/Removes a counter to/from the PMU, can be done inside a
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ae571be..071323d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -992,6 +992,8 @@ struct perf_branch_entry {
};

enum perf_record_overhead_type {
+ PERF_NMI_OVERHEAD = 0,
+
PERF_OVERHEAD_MAX,
};

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 590c399..d82e6ca 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1785,7 +1785,8 @@ event_filter_match(struct perf_event *event)
static void
event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ bool log_overhead)
{
u64 tstamp = perf_event_time(event);
u64 delta;
@@ -1810,9 +1811,8 @@ event_sched_out(struct perf_event *event,
return;

perf_pmu_disable(event->pmu);
-
event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
+ event->pmu->del(event, log_overhead ? PERF_EF_LOG : 0);
event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
@@ -1835,20 +1835,21 @@ event_sched_out(struct perf_event *event,
static void
group_sched_out(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ bool log_overhead)
{
struct perf_event *event;
int state = group_event->state;

perf_pmu_disable(ctx->pmu);

- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, cpuctx, ctx, log_overhead);

/*
* Schedule out siblings (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry)
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, cpuctx, ctx, log_overhead);

perf_pmu_enable(ctx->pmu);

@@ -1872,7 +1873,7 @@ __perf_remove_from_context(struct perf_event *event,
{
unsigned long flags = (unsigned long)info;

- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, cpuctx, ctx, false);
if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
@@ -1918,9 +1919,9 @@ static void __perf_event_disable(struct perf_event *event,
update_cgrp_time_from_event(event);
update_group_times(event);
if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, cpuctx, ctx, true);
else
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, cpuctx, ctx, true);
event->state = PERF_EVENT_STATE_OFF;
}

@@ -2141,10 +2142,10 @@ group_sched_in(struct perf_event *group_event,
event->tstamp_running += now - event->tstamp_stopped;
event->tstamp_stopped = now;
} else {
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, cpuctx, ctx, false);
}
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, cpuctx, ctx, false);

pmu->cancel_txn(pmu);

@@ -2198,7 +2199,8 @@ static void add_event_to_ctx(struct perf_event *event,

static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
+ enum event_type_t event_type,
+ bool log_overhead);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
@@ -2206,7 +2208,8 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task);

static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ bool log_overhead)
{
if (!cpuctx->task_ctx)
return;
@@ -2214,7 +2217,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;

- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL, log_overhead);
}

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2234,7 +2237,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
{
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx);
+ task_ctx_sched_out(cpuctx, task_ctx, false);
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
@@ -2281,7 +2284,7 @@ static int __perf_install_in_context(void *info)
}

if (activate) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, false);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx);
} else {
@@ -2396,7 +2399,7 @@ static void __perf_event_enable(struct perf_event *event,
return;

if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, false);

__perf_event_mark_enabled(event);

@@ -2611,7 +2614,8 @@ EXPORT_SYMBOL_GPL(perf_event_refresh);

static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ bool log_overhead)
{
int is_active = ctx->is_active;
struct perf_event *event;
@@ -2662,12 +2666,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
list_for_each_entry(event, &ctx->pinned_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, cpuctx, ctx, log_overhead);
}

if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry(event, &ctx->flexible_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, cpuctx, ctx, log_overhead);
}
perf_pmu_enable(ctx->pmu);
}
@@ -2850,7 +2854,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,

if (do_switch) {
raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(cpuctx, ctx);
+ task_ctx_sched_out(cpuctx, ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
@@ -2958,7 +2962,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, false);
}

static void
@@ -3350,7 +3354,7 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx)

cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, false);

rotate_ctx(&cpuctx->ctx);
if (ctx)
@@ -3415,7 +3419,7 @@ static void perf_event_enable_on_exec(int ctxn)

cpuctx = __get_cpu_context(ctx);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME, false);
list_for_each_entry(event, &ctx->event_list, event_entry)
enabled |= event_enable_on_exec(event, ctx);

@@ -10190,7 +10194,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+ task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, true);

/*
* Now that the context is inactive, destroy the task <-> ctx relation
--
2.5.5