Re: [PATCH v6 2/3]: perf/core: use context tstamp_data for skipped events on mux interrupt

From: Alexey Budankov
Date: Thu Aug 10 2017 - 11:58:06 EST


Well, Ok.

I re-implemented this patch of patch set and also implemented a unit
test that IMHO mimics the case mentioned above to check if it solves the issue.

The test is a single thread application that creates 272 fds for every of
two INSTRUCTIONS_RETIRED events covering 272 cores of Intel Xeon Phi.

The test simultaneously executes 1 million of instructions several times
when the events are enabled and reads the events' counts, TOTAL_ENABLED and
TOTAL_RUNNING over read() system call.

The first event is allocated, enabled and disabled at the beginning and
the end of the test execution phase where as the second event allocation and
measurements are included into the measurement interval of the first event.

Below is what I am getting now when running the test on the patched kernel:

EID CPU COUNT T_ENABLED T_RUNNING SCALE
--- 0 enabled ---
--- million instructions ---
0 26 1334 1666671 137366 8.24
0 94 138637 1735872 527098 30.37
0 162 874166 1823695 1083785 59.43
--- million instructions ---
0 26 1334 3328832 137366 4.13
0 94 1164146 3318599 2109825 63.58
0 162 874166 3390716 1083785 31.96
--- million instructions ---
0 26 1334 4835955 137366 2.84
0 94 2189671 4820750 3611976 74.93
0 162 874166 4934513 1083785 21.96
--- 1 enabled ---
--- million instructions ---
0 26 1334 22661310 137366 0.61
0 94 3248450 22667165 21458391 94.67
0 162 874166 22742990 1083785 4.77
1 94 1033387 2150307 2150307 100.00
--- million instructions ---
0 26 1334 24878504 137366 0.55
0 94 4289784 24869644 23660870 95.14
0 162 874166 24943564 1083785 4.34
1 94 2074675 4370708 4370708 100.00
--- 1 disabled ---
--- million instructions ---
0 26 1334 27681611 137366 0.50
0 94 5337752 27675968 26467194 95.63
0 162 874166 27749528 1083785 3.91
1 94 2089278 5024218 5024218 100.00
--- 0 disabled ---
--- million instructions ---
0 26 1334 29004868 137366 0.47
0 94 5372734 28964075 27755301 95.83
0 162 874166 29005751 1083785 3.74
1 94 2089278 5024218 5024218 100.00

The output demonstrates that test thread migrated two time during execution
thus several fds were employed for measuring amount of executed instructions.

Also the output shows that T_RUNNING values of events updated and
maintained so that sums of SCALE values for every event are near 100%
(no multiplexing) after every million instructions execution.

Unit test code is attached for convenience.

The key thing in the patch is explicit updating of tstamp fields for
INACTIVE events in update_event_times().

Signed-off-by: Alexey Budankov <alexey.budankov@xxxxxxxxxxxxxxx>
---
kernel/events/core.c | 52 +++++++++++++++++++++++++++++++---------------------
1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0a4f619..d195fdc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1391,6 +1391,27 @@ static u64 perf_event_time(struct perf_event *event)
return ctx ? ctx->time : 0;
}

+void perf_event_tstamp_init(struct perf_event *event)
+{
+ u64 tstamp = perf_event_time(event);
+
+ event->tstamp_enabled = tstamp;
+ event->tstamp_running = tstamp;
+ event->tstamp_stopped = tstamp;
+}
+
+void perf_event_tstamp_update(struct perf_event *event)
+{
+ u64 tstamp, delta;
+
+ tstamp = perf_event_time(event);
+
+ delta = tstamp - event->tstamp_stopped;
+
+ event->tstamp_running += delta;
+ event->tstamp_stopped = tstamp;
+}
+
/*
* Update the total_time_enabled and total_time_running fields for a event.
*/
@@ -1405,6 +1426,9 @@ static void update_event_times(struct perf_event *event)
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;

+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ perf_event_tstamp_update(event);
+
/*
* in cgroup mode, time_enabled represents
* the time the event was enabled AND active
@@ -1430,7 +1454,6 @@ static void update_event_times(struct perf_event *event)
run_end = perf_event_time(event);

event->total_time_running = run_end - event->tstamp_running;
-
}

/*
@@ -1954,9 +1977,6 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
- u64 delta;
-
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);

@@ -1967,18 +1987,15 @@ event_sched_out(struct perf_event *event,
* via read() for time_enabled, time_running:
*/
if (event->state == PERF_EVENT_STATE_INACTIVE &&
- !event_filter_match(event)) {
- delta = tstamp - event->tstamp_stopped;
- event->tstamp_running += delta;
- event->tstamp_stopped = tstamp;
- }
+ !event_filter_match(event))
+ perf_event_tstamp_update(event);

if (event->state != PERF_EVENT_STATE_ACTIVE)
return;

perf_pmu_disable(event->pmu);

- event->tstamp_stopped = tstamp;
+ event->tstamp_stopped = perf_event_time(event);
event->pmu->del(event, 0);
event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
@@ -2294,7 +2311,6 @@ group_sched_in(struct perf_event *group_event,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = ctx->pmu;
- u64 now = ctx->time;
bool simulate = false;

if (group_event->state == PERF_EVENT_STATE_OFF)
@@ -2340,12 +2356,10 @@ group_sched_in(struct perf_event *group_event,
if (event == partial_group)
simulate = true;

- if (simulate) {
- event->tstamp_running += now - event->tstamp_stopped;
- event->tstamp_stopped = now;
- } else {
+ if (simulate)
+ perf_event_tstamp_update(event);
+ else
event_sched_out(event, cpuctx, ctx);
- }
}
event_sched_out(group_event, cpuctx, ctx);

@@ -2390,13 +2404,9 @@ static int group_can_go_on(struct perf_event *event,
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
-
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ perf_event_tstamp_init(event);
}

static void ctx_sched_out(struct perf_event_context *ctx,
/* check_multiplexing_read.c */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <unistd.h>
#include <errno.h>

#include <sys/mman.h>

#include <sys/ioctl.h>
#include <asm/unistd.h>

#include "perf_event.h"
#include "test_utils.h"
#include "perf_helpers.h"
#include "instructions_testcode.h"

#define NUM_CPUS 272
#define NUM_EVENTS 2

int fd[NUM_EVENTS][NUM_CPUS];
long long events[NUM_EVENTS]={
PERF_COUNT_HW_CPU_CYCLES,
PERF_COUNT_HW_INSTRUCTIONS
};

static long long base_results[NUM_CPUS][3];

#define TIME_ENABLED 1
#define TIME_RUNNING 2

static int test_routine(void) {

int i,result;

printf("--- million instructions ---\n");

for(i=0;i<1;i++) {
result=instructions_million();
}

return result;
}

void alloc_events(long long config, int fd[NUM_CPUS])
{
int ret,i,j;

struct perf_event_attr pe;

for(j=0;j<NUM_CPUS;j++) {
memset(&pe,0,sizeof(struct perf_event_attr));
pe.type=PERF_TYPE_HARDWARE;
pe.size=sizeof(struct perf_event_attr);
pe.config=config;
pe.disabled=1;
pe.exclude_kernel=1;
pe.exclude_hv=1;
pe.read_format=PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING;

fd[j]=perf_event_open(&pe,0,j,-1,0);
if (fd[j]<0) {
fprintf(stderr,"Failed adding mpx event 0 %s\n",
strerror(errno));
return -1;
}
}
}

void free_events(int fd[NUM_CPUS])
{
int j;
for(j=0;j<NUM_CPUS;j++) {
close(fd[j]);
}
}

void enable_events(int fd[NUM_CPUS])
{
int j,ret;
for(j=0;j<NUM_CPUS;j++) {
ret=ioctl(fd[j], PERF_EVENT_IOC_ENABLE,0);
if (ret<0) {
fprintf(stderr,"Error starting event fd[%d]\n",j);
}
}
}

void disable_events(int fd[NUM_CPUS])
{
int j,ret;
for(j=0;j<NUM_CPUS;j++) {
ret=ioctl(fd[j], PERF_EVENT_IOC_DISABLE,0);
if (ret<0) {
fprintf(stderr,"Error stopping event fd[%d]\n",j);
}
}
}

void read_events(int i, int fd[NUM_CPUS])
{
int j, ret;
for(j=0;j<NUM_CPUS;j++) {
ret=read(fd[j],&base_results[j],3*sizeof(long long));
if (ret<3*sizeof(long long)) {
fprintf(stderr,"Event fd[0][%d] unexpected read size %d\n",j,ret);
return;
}
if (base_results[j][0])
printf("%d\t%d\t%lld\t\t\t%lld\t\t\t%lld\t\t\t%.2f\n", i, j,
base_results[j][0],
base_results[j][TIME_ENABLED],
base_results[j][TIME_RUNNING],
(double)base_results[j][TIME_RUNNING]/
(double)base_results[j][TIME_ENABLED] * 100.);
}
}

int main(int argc, char** argv) {

int ret,quiet,i,j;

struct perf_event_attr pe;

char test_string[]="Testing ...";

printf("\nEID\tCPU\tCOUNT\t\t\tT_ENABLED\t\t\tT_RUNNING\t\t\tSCALE\n");

alloc_events(events[1], fd[0]);
enable_events(fd[0]);
printf("--- 1 enabled\n");

test_routine();
read_events(0, fd[0]);

test_routine();
read_events(0, fd[0]);

test_routine();
read_events(0, fd[0]);

alloc_events(events[1], fd[1]);
enable_events(fd[1]);
printf("--- 1 enabled\n");

test_routine();
read_events(0, fd[0]);
read_events(1, fd[1]);

test_routine();
read_events(0, fd[0]);
read_events(1, fd[1]);

disable_events(fd[1]);
printf("--- 1 disabled\n");

test_routine();
read_events(0,fd[0]);
read_events(1,fd[1]);

disable_events(fd[0]);
printf("--- 0 disabled\n");
test_routine();

read_events(0,fd[0]);
read_events(1,fd[1]);

free_events(fd[1]);
free_events(fd[0]);

test_pass(test_string);

return 0;
}