[PATCH] perf_event: use rdpmc rather than rdmsr when possible in kernel

From: Vince Weaver
Date: Mon Feb 20 2012 - 17:38:55 EST


Hello

The perfctr interface uses rdpmc rather than rdmsr when possible in the
kernel, as rdpmc tends to have lower latency. (One can look in
etc/costs in the perfctr-2.6 package to see a historical list of the
overheads measured on various machines.)
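
For reference, a rough sketch of what a raw rdpmc read looks like (the
helper name is just illustrative; the kernel's native_read_pmc() does
essentially this). The counter index goes in ECX and the 64-bit count
comes back in EDX:EAX, with bit 30 of the index selecting the
fixed-function counters:

static inline unsigned long long read_pmc(unsigned int idx)
{
	unsigned int lo, hi;

	/* idx selects the counter; set bit 30 for a fixed-function counter */
	__asm__ __volatile__("rdpmc" : "=a"(lo), "=d"(hi) : "c"(idx));
	return ((unsigned long long)hi << 32) | lo;
}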

I have done some tests on a 3.2 kernel:

                 rdmsr            rdpmc
Core2 T9900:     203.9 cycles     30.9 cycles
AMD fam0fh:       56.2 cycles      9.8 cycles
Atom 6/28/2:     129.7 cycles     50.6 cycles

As you can see, the speedup from using rdpmc is large (roughly 170 cycles
saved per counter read on the Core2), although granted it really is a drop
in the bucket compared to the other overheads involved.

I've attached a kernel module that can be used to test this on your own
personal x86 machine.

Below is a patch that changes perf_event to use rdpmc rather than rdmsr
when possible. It is probably possible (and desirable) to do this without
adding a new field to the hw_perf_event structure, but the fixed-function
counters make this tricky, since their rdpmc indices (bit 30 set) do not
correspond directly to their MSR addresses.
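
To spell out the index mapping the patch sets up (sketch only;
pmc_rdpmc_index() below is a hypothetical helper, not something the
patch adds):

static inline unsigned int pmc_rdpmc_index(int idx)
{
	/* fixed-function counters use a separate rdpmc index space,
	 * selected by bit 30 */
	if (idx >= X86_PMC_IDX_FIXED)
		return (idx - X86_PMC_IDX_FIXED) | (1 << 30);

	/* general-purpose counters: rdpmc index == counter number
	 * (the patch actually goes through x86_pmu_addr_offset() here) */
	return idx;
}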

Signed-off-by: Vince Weaver <vweaver1@xxxxxxxxxxxx>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce10..5550047 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -85,7 +85,7 @@ u64 x86_perf_event_update(struct perf_event *event)
 	 */
 again:
 	prev_raw_count = local64_read(&hwc->prev_count);
-	rdmsrl(hwc->event_base, new_raw_count);
+	new_raw_count = native_read_pmc(hwc->event_base_rdpmc);
 
 	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			    new_raw_count) != prev_raw_count)
@@ -768,9 +768,11 @@ static inline void x86_assign_hw_event(struct perf_event *event,
 	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
 		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
+		hwc->event_base_rdpmc = (hwc->idx - X86_PMC_IDX_FIXED) | 1<<30;
 	} else {
 		hwc->config_base = x86_pmu_config_addr(hwc->idx);
 		hwc->event_base = x86_pmu_event_addr(hwc->idx);
+		hwc->event_base_rdpmc = x86_pmu_addr_offset(hwc->idx);
 	}
 }

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index abb2776..432ac69 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -562,6 +562,7 @@ struct hw_perf_event {
 			u64		last_tag;
 			unsigned long	config_base;
 			unsigned long	event_base;
+			unsigned long	event_base_rdpmc;
 			int		idx;
 			int		last_cpu;
 			struct hw_perf_event_extra extra_reg;


obj-m := rdpmc-module.o
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)

default:
	$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules

#clean:
#	rm -f *.o *.ko *~ *.symvers *.order


/* Code taken from perfctr by Mikael Pettersson */
/* http://user.it.uu.se/~mikpe/linux/perfctr/ */
/* Slightly modified by Vince Weaver <vweaver1@xxxxxxxxxxxx> */

#include <linux/module.h>
#include <linux/init.h>

#include <asm/msr.h>
#include <asm/processor.h>	/* boot_cpu_data, sync_core() */

#define MSR_P5_CESR 0x11
#define MSR_P5_CTR0 0x12
#define P5_CESR_VAL (0x16 | (3<<6))

#define P6_EVNTSEL0_VAL (0xC0 | (3<<16) | (1<<22))

#define K7_EVNTSEL0_VAL (0xC0 | (3<<16) | (1<<22))

#define MSR_P4_IQ_COUNTER0 0x30C
#define P4_CRU_ESCR0_VAL ((2<<25) | (1<<9) | (0x3<<2))
#define P4_IQ_CCCR0_VAL ((0x3<<16) | (4<<13) | (1<<12))

#define CORE2_PMC_FIXED_CTR0 ((1<<30) | 0)


#define NITER 64
#define X2(S) S";"S
#define X8(S) X2(X2(X2(S)))

#ifdef __x86_64__
#define CR4MOV "movq"
#else
#define CR4MOV "movl"
#endif


#define rdtsc_low(low) \
	__asm__ __volatile__("rdtsc" : "=a"(low) : : "edx")


static void __init do_rdpmc(unsigned pmc, unsigned unused2)
{
	unsigned i;
	for(i = 0; i < NITER/8; ++i)
		__asm__ __volatile__(X8("rdpmc") : : "c"(pmc) : "eax", "edx");
}

static void __init do_rdmsr(unsigned msr, unsigned unused2)
{
	unsigned i;
	for(i = 0; i < NITER/8; ++i)
		__asm__ __volatile__(X8("rdmsr") : : "c"(msr) : "eax", "edx");
}

static void __init do_wrmsr(unsigned msr, unsigned data)
{
	unsigned i;
	for(i = 0; i < NITER/8; ++i)
		__asm__ __volatile__(X8("wrmsr") : : "c"(msr), "a"(data), "d"(0));
}

static void __init do_rdcr4(unsigned unused1, unsigned unused2)
{
	unsigned i;
	unsigned long dummy;
	for(i = 0; i < NITER/8; ++i)
		__asm__ __volatile__(X8(CR4MOV" %%cr4,%0") : "=r"(dummy));
}

static void __init do_wrcr4(unsigned cr4, unsigned unused2)
{
	unsigned i;
	for(i = 0; i < NITER/8; ++i)
		__asm__ __volatile__(X8(CR4MOV" %0,%%cr4") : : "r"((long)cr4));
}

static void __init do_rdtsc(unsigned unused1, unsigned unused2)
{
	unsigned i;
	for(i = 0; i < NITER/8; ++i)
		__asm__ __volatile__(X8("rdtsc") : : : "eax", "edx");
}

#if 0
static void __init do_wrlvtpc(unsigned val, unsigned unused2)
{
	unsigned i;
	for(i = 0; i < NITER/8; ++i) {
		apic_write(APIC_LVTPC, val);
		apic_write(APIC_LVTPC, val);
		apic_write(APIC_LVTPC, val);
		apic_write(APIC_LVTPC, val);
		apic_write(APIC_LVTPC, val);
		apic_write(APIC_LVTPC, val);
		apic_write(APIC_LVTPC, val);
		apic_write(APIC_LVTPC, val);
	}
}
#endif

static void __init do_sync_core(unsigned unused1, unsigned unused2)
{
	unsigned i;
	for(i = 0; i < NITER/8; ++i) {
		sync_core();
		sync_core();
		sync_core();
		sync_core();
		sync_core();
		sync_core();
		sync_core();
		sync_core();
	}
}


static void __init do_empty_loop(unsigned unused1, unsigned unused2)
{
	unsigned i;
	for(i = 0; i < NITER/8; ++i)
		__asm__ __volatile__("" : : "c"(0));
}

/* Time one call of *doit (NITER back-to-back operations) with rdtsc. */
static unsigned __init run(void (*doit)(unsigned, unsigned),
			   unsigned arg1, unsigned arg2)
{
	unsigned start, stop;
	sync_core();
	rdtsc_low(start);
	(*doit)(arg1, arg2); /* should take < 2^32 cycles to complete */
	sync_core();
	rdtsc_low(stop);
	return stop - start;
}




static void __init
measure_overheads(unsigned msr_evntsel0, unsigned evntsel0, unsigned msr_perfctr0,
		  unsigned msr_cccr, unsigned cccr_val, unsigned is_core2)
{
	int i;
	unsigned int loop, ticks[14];
	const char *name[14];

	if( msr_evntsel0 )
		wrmsr(msr_evntsel0, 0, 0);
	if( msr_cccr )
		wrmsr(msr_cccr, 0, 0);

	name[0] = "rdtsc";
	ticks[0] = run(do_rdtsc, 0, 0);
	name[1] = "rdpmc";
	ticks[1] = run(do_rdpmc, 1, 0);
	name[2] = "rdmsr (counter)";
	ticks[2] = msr_perfctr0 ? run(do_rdmsr, msr_perfctr0, 0) : 0;
	name[3] = msr_cccr ? "rdmsr (escr)" : "rdmsr (evntsel)";
	ticks[3] = msr_evntsel0 ? run(do_rdmsr, msr_evntsel0, 0) : 0;
	name[4] = "wrmsr (counter)";
	ticks[4] = msr_perfctr0 ? run(do_wrmsr, msr_perfctr0, 0) : 0;
	name[5] = msr_cccr ? "wrmsr (escr)" : "wrmsr (evntsel)";
	ticks[5] = msr_evntsel0 ? run(do_wrmsr, msr_evntsel0, evntsel0) : 0;
	name[6] = "read cr4";
	ticks[6] = run(do_rdcr4, 0, 0);
	name[7] = "write cr4";
	ticks[7] = run(do_wrcr4, read_cr4(), 0);
	name[8] = "rdpmc (fast)";
	ticks[8] = msr_cccr ? run(do_rdpmc, 0x80000001, 0) : 0;
	name[9] = "rdmsr (cccr)";
	ticks[9] = msr_cccr ? run(do_rdmsr, msr_cccr, 0) : 0;
	name[10] = "wrmsr (cccr)";
	ticks[10] = msr_cccr ? run(do_wrmsr, msr_cccr, cccr_val) : 0;
	name[11] = "sync_core";
	ticks[11] = run(do_sync_core, 0, 0);
	name[12] = "read fixed_ctr0";
	ticks[12] = is_core2 ? run(do_rdpmc, CORE2_PMC_FIXED_CTR0, 0) : 0;
	name[13] = "wrmsr fixed_ctr_ctrl";
	ticks[13] = is_core2 ? run(do_wrmsr, MSR_CORE_PERF_FIXED_CTR_CTRL, 0) : 0;
	/*
	name[14] = "write LVTPC";
	ticks[14] = (perfctr_info.cpu_features & PERFCTR_FEATURE_PCINT)
		? run(do_wrlvtpc, APIC_DM_NMI|APIC_LVT_MASKED, 0) : 0;
	*/

	loop = run(do_empty_loop, 0, 0);

	if( msr_evntsel0 )
		wrmsr(msr_evntsel0, 0, 0);
	if( msr_cccr )
		wrmsr(msr_cccr, 0, 0);

	printk(KERN_INFO "COUNTER_OVERHEAD: NITER == %u\n", NITER);
	printk(KERN_INFO "COUNTER_OVERHEAD: loop overhead is %u cycles\n", loop);
	for(i = 0; i < ARRAY_SIZE(ticks); ++i) {
		unsigned int x;
		if( !ticks[i] )
			continue;
		x = ((ticks[i] - loop) * 10) / NITER;
		printk(KERN_INFO "COUNTER_OVERHEAD: %s cost is %u.%u cycles (%u total)\n",
		       name[i], x/10, x%10, ticks[i]);
	}
}


static inline void perfctr_p5_init_tests(void)
{
	measure_overheads(MSR_P5_CESR, P5_CESR_VAL, MSR_P5_CTR0, 0, 0, 0);
}

static inline void perfctr_p6_init_tests(void)
{
	measure_overheads(MSR_P6_EVNTSEL0, P6_EVNTSEL0_VAL, MSR_P6_PERFCTR0, 0,
			  0, 0);
}

static inline void perfctr_core2_init_tests(void)
{
	measure_overheads(MSR_P6_EVNTSEL0, P6_EVNTSEL0_VAL, MSR_P6_PERFCTR0, 0,
			  0, 1);
}

static inline void perfctr_p4_init_tests(void)
{
	measure_overheads(MSR_P4_CRU_ESCR0, P4_CRU_ESCR0_VAL, MSR_P4_IQ_COUNTER0,
			  MSR_P4_IQ_CCCR0, P4_IQ_CCCR0_VAL, 0);
}

static inline void perfctr_k7_init_tests(void)
{
	measure_overheads(MSR_K7_EVNTSEL0, K7_EVNTSEL0_VAL, MSR_K7_PERFCTR0, 0, 0, 0);
}

static inline void perfctr_generic_init_tests(void)
{
	measure_overheads(0, 0, 0, 0, 0, 0);
}




static int __init mymodule_init(void)
{
	printk(KERN_INFO "COUNTER_OVERHEAD: vendor %u, family %u, model %u, stepping %u\n",
	       boot_cpu_data.x86_vendor,
	       boot_cpu_data.x86,
	       boot_cpu_data.x86_model,
	       boot_cpu_data.x86_mask);

	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
		perfctr_k7_init_tests();
	}
	else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
		if (boot_cpu_data.x86_model==5) perfctr_p5_init_tests();
		else if (boot_cpu_data.x86_model==6) perfctr_p6_init_tests();
		else if (boot_cpu_data.x86_model==15) perfctr_p4_init_tests();
		else perfctr_core2_init_tests();
	}
	else {
		perfctr_generic_init_tests();
	}
	return 0;
}

static void __exit mymodule_exit(void)
{
	printk(KERN_INFO "COUNTER_OVERHEAD: Unloading.\n");
}

module_init(mymodule_init);
module_exit(mymodule_exit);

MODULE_LICENSE("GPL");