[PATCH 2/2] perf,core: use parent avg sample period as child initial period

From: kan.liang
Date: Wed Jan 18 2017 - 15:17:55 EST


From: Kan Liang <kan.liang@xxxxxxxxx>

perf brings additional overhead when monitoring a task which
frequently spawns child tasks.

When inheriting an event from a parent task to a child task, the
sample_period of the original parent event (parent_event->parent) is
assigned to the child event as its initial period, which is usually the
default sample_period of 1. Too many very short periods like 1 increase
the overhead and may cause various problems.
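
For illustration only, a minimal sketch (plain C with made-up names, not
the kernel code itself) of the pre-patch behaviour described above:

/*
 * Illustration only: in freq mode a new event starts with
 * sample_period = 1, and a child inherits whatever period its
 * original parent currently carries -- usually still that default 1,
 * so every child starts sampling with an extremely short period.
 */
struct toy_event {
        unsigned long long sample_period;
};

static void toy_init_child_period(struct toy_event *child,
                                  const struct toy_event *orig_parent)
{
        child->sample_period = 1;       /* freq mode default */
        if (orig_parent)
                child->sample_period = orig_parent->sample_period;
}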

avg_sample_period is introduced to keep a running average of the sample
period, and avg_time_stamp records when it was last updated. Each child
event uses its original parent event's average period as its initial
sample period, which reduces the overhead.

The average is not updated more than once per tick, to avoid contention
on the shared parent state.
For each new child event the parent event's refcount is incremented, so
the parent will not go away until all of its children do; it is
therefore safe to access the parent here.
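
As an illustration of the averaging scheme (a standalone toy program
with simplified names and tick length, not the kernel change itself,
which is in perf_adjust_period() below):

/*
 * Minimal userspace sketch of the once-per-tick averaging rule, for
 * illustration only.  Names, types and the tick length are assumptions.
 */
#include <stdio.h>

#define TOY_TICK_NSEC 1000000ULL        /* pretend 1 ms ticks */

struct toy_avg {
        unsigned long long avg_sample_period;
        unsigned long long avg_time_stamp;      /* time of last update */
};

static void toy_update_avg(struct toy_avg *a, unsigned long long now,
                           unsigned long long sample_period)
{
        /* Update at most once per tick to keep contention low. */
        if (now - a->avg_time_stamp <= TOY_TICK_NSEC)
                return;

        a->avg_time_stamp = now;
        /* Equal-weight running average of old average and new period. */
        a->avg_sample_period = (a->avg_sample_period + sample_period) / 2;
}

int main(void)
{
        struct toy_avg a = { .avg_sample_period = 1 };
        unsigned long long now = 0;
        unsigned long long periods[] = { 2000, 3000, 2500 };

        for (int i = 0; i < 3; i++) {
                now += 2 * TOY_TICK_NSEC;       /* enough time has passed */
                toy_update_avg(&a, now, periods[i]);
                printf("avg_sample_period = %llu\n", a.avg_sample_period);
        }
        return 0;
}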

Here is some data from the overhead test on a Broadwell server:
perf record -e $TEST_EVENTS -- ./loop.sh 50000

loop.sh:
#!/bin/sh
# Fork/exec "date" $1 times and report the elapsed wall time in ns.
start=$(date +%s%N)
i=0
while [ "$i" -le "$1" ]
do
    date > /dev/null
    i=`expr $i + 1`
done
end=$(date +%s%N)
elapsed=`expr $end - $start`
echo "$elapsed"

Event#  Original elapsed time (ns)  Elapsed time with patch (ns)    Delta
1                  196,573,192,397               188,480,366,278   -4.12%
2                  257,567,753,013               242,256,126,043   -5.94%
3                  398,730,726,971               373,882,492,502   -6.23%
4                  824,983,761,120               750,906,525,917   -8.98%
5                1,883,411,923,498             1,648,192,098,897  -12.49%

(Delta = (patched - original) / original.)

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxx>
---
 include/linux/perf_event.h |  3 +++
 kernel/events/core.c       | 20 ++++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 78ed810..84b0f47 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -648,6 +648,9 @@ struct perf_event {
struct list_head child_list;
struct perf_event *parent;

+ atomic64_t avg_sample_period;
+ u64 avg_time_stamp;
+
int oncpu;
int cpu;

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 924268c..82a2c0e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3237,9 +3237,11 @@ static DEFINE_PER_CPU(u64, perf_throttled_seq);

static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
{
+ struct perf_event *head_event = (event->parent != NULL) ? event->parent : event;
struct hw_perf_event *hwc = &event->hw;
s64 period, sample_period;
s64 delta;
+ u64 now;

period = perf_calculate_period(event, nsec, count);

@@ -3253,6 +3255,15 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo

hwc->sample_period = sample_period;

+ now = perf_clock();
+ if ((now - head_event->avg_time_stamp) > TICK_NSEC) {
+ s64 avg_period;
+
+ head_event->avg_time_stamp = now;
+ avg_period = (atomic64_read(&head_event->avg_sample_period) + sample_period) / 2;
+ atomic64_set(&head_event->avg_sample_period, avg_period);
+ }
+
if (local64_read(&hwc->period_left) > 8*sample_period) {
if (disable)
event->pmu->stop(event, PERF_EF_UPDATE);
@@ -9231,8 +9242,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,

hwc = &event->hw;
hwc->sample_period = attr->sample_period;
- if (attr->freq && attr->sample_freq)
+ if (attr->freq && attr->sample_freq) {
hwc->sample_period = 1;
+ if (parent_event)
+ hwc->sample_period = atomic64_read(&parent_event->avg_sample_period);
+ else
+ atomic64_set(&event->avg_sample_period, hwc->sample_period);
+ }
hwc->last_period = hwc->sample_period;

local64_set(&hwc->period_left, hwc->sample_period);
@@ -10464,8 +10480,8 @@ inherit_event(struct perf_event *parent_event,
child_event->state = PERF_EVENT_STATE_OFF;

if (parent_event->attr.freq) {
- u64 sample_period = parent_event->hw.sample_period;
struct hw_perf_event *hwc = &child_event->hw;
+ u64 sample_period = atomic64_read(&parent_event->avg_sample_period);

hwc->sample_period = sample_period;
hwc->last_period = sample_period;
--
2.4.3