Re: [PATCH V3 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE

From: Peter Zijlstra
Date: Thu Jan 31 2019 - 07:37:39 EST


On Wed, Jan 30, 2019 at 06:23:42AM -0800, kan.liang@xxxxxxxxxxxxxxx wrote:
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 374a197..03bf45d 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2578,3 +2578,45 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
>         cap->events_mask_len = x86_pmu.events_mask_len;
> }
> EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> +
> +/*
> + * map x86 page levels to perf page sizes
> + */
> +static const enum perf_page_size perf_page_size_map[PG_LEVEL_NUM] = {
> +        [PG_LEVEL_NONE] = PERF_PAGE_SIZE_NONE,
> +        [PG_LEVEL_4K] = PERF_PAGE_SIZE_4K,
> +        [PG_LEVEL_2M] = PERF_PAGE_SIZE_2M,
> +        [PG_LEVEL_1G] = PERF_PAGE_SIZE_1G,
> +        [PG_LEVEL_512G] = PERF_PAGE_SIZE_512G,
> +};
> +
> +u64 perf_get_page_size(u64 virt)
> +{
> +        unsigned long flags;
> +        unsigned int level;
> +        pte_t *pte;
> +
> +        if (!virt)
> +                return 0;
> +
> +        /*
> +         * Interrupts are disabled, so it prevents any tear down
> +         * of the page tables.
> +         * See the comment near struct mmu_table_batch.
> +         */
> +        local_irq_save(flags);
> +        if (virt >= TASK_SIZE)
> +                pte = lookup_address(virt, &level);
> +        else {
> +                if (current->mm)
> +                        pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> +                                                    virt, &level);

Aside from all the missing {}, I'm fairly sure this is broken, since this
runs in NMI context. The NMI can interrupt switch_mm() and things like
use_temporary_mm(), at which point current->mm no longer matches the page
tables that are actually loaded.
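
At the very least I'd expect a check like the below before poking at
current->mm (completely untested sketch; IIRC nmi_uaccess_okay() from
asm/tlbflush.h is the same test copy_from_user_nmi() does for exactly
this race):

        /*
         * Sketch only: if the NMI landed in the middle of a CR3 switch,
         * current->mm does not describe the page tables that are loaded,
         * so don't even try.
         */
        if (!nmi_uaccess_okay())
                return PERF_PAGE_SIZE_NONE;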

Also, why does this live in the x86 code and not in generic code?
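
Nothing in the walk itself is x86 specific. Completely untested sketch of
what a generic version in kernel/events/core.c might look like, assuming
the architecture provides pud_large()/pmd_large() (not all of them do);
note it also gives you the size in bytes for free:

static u64 perf_get_page_size(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset(mm, addr);
        if (pgd_none(*pgd))
                return 0;

        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d))
                return 0;

        pud = pud_offset(p4d, addr);
        if (pud_none(*pud))
                return 0;

        /* leaf at the PUD level: 1G pages on x86-64 */
        if (pud_large(*pud))
                return 1ULL << PUD_SHIFT;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;

        /* leaf at the PMD level: 2M/4M pages on x86 */
        if (pmd_large(*pmd))
                return 1ULL << PMD_SHIFT;

        /* everything else is mapped with base pages */
        return PAGE_SIZE;
}

(With &init_mm for kernel addresses, and modulo READ_ONCE() on the
entries plus the NMI issue above, obviously.)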

> +                else
> +                        level = PG_LEVEL_NUM;
> +        }
> +        local_irq_restore(flags);
> +        if (level >= PG_LEVEL_NUM)
> +                return PERF_PAGE_SIZE_NONE;
> +
> +        return (u64)perf_page_size_map[level];
> +}

> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 7198ddd..79daacd 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
>         PERF_SAMPLE_TRANSACTION = 1U << 17,
>         PERF_SAMPLE_REGS_INTR = 1U << 18,
>         PERF_SAMPLE_PHYS_ADDR = 1U << 19,
> +       PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 20,
>
> -       PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
> +       PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */
>
>         __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
> };
> @@ -863,6 +864,7 @@ enum perf_event_type {
>          * { u64 abi; # enum perf_sample_regs_abi
>          *   u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
>          * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> +        * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
>          * };
>          */
>         PERF_RECORD_SAMPLE = 9,
> @@ -1150,6 +1152,18 @@ union perf_mem_data_src {
> #define PERF_MEM_S(a, s) \
>         (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
>
> +
> +enum perf_page_size {
> +        PERF_PAGE_SIZE_NONE,
> +        PERF_PAGE_SIZE_4K,
> +        PERF_PAGE_SIZE_8K,
> +        PERF_PAGE_SIZE_16K,
> +        PERF_PAGE_SIZE_64K,
> +        PERF_PAGE_SIZE_2M,
> +        PERF_PAGE_SIZE_1G,
> +        PERF_PAGE_SIZE_512G,
> +};

Since you have a u64 to store this in, WTH do you use this limited enum?
Are you very sure this covers all the possible page sizes for all
architectures?

Why not simply report the page size in bytes?
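
IOW, keep the lookup_address() dance if you must, but translate the level
into bytes and be done with it; IIRC x86 already has page_level_size()
for exactly that, which kills the uapi enum and perf_page_size_map
entirely. Untested:

u64 perf_get_page_size(u64 virt)
{
        unsigned int level;
        pte_t *pte;

        /* ... same lookup_address() bits as above, filling @level ... */

        if (level == PG_LEVEL_NONE || level >= PG_LEVEL_NUM)
                return 0;

        return (u64)page_level_size(level);     /* size in bytes */
}

Then the uapi side only needs to say: data_page_size is the page size of
the sampled data address in bytes, 0 when unknown. That covers every
architecture, present and future.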