Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE

From: Kirill A. Shutemov
Date: Fri Feb 01 2019 - 05:34:41 EST


On Fri, Feb 01, 2019 at 10:22:40AM +0100, Peter Zijlstra wrote:
> On Thu, Jan 31, 2019 at 12:27:54PM -0800, kan.liang@xxxxxxxxxxxxxxx wrote:
> > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> > index 374a197..229a73b 100644
> > --- a/arch/x86/events/core.c
> > +++ b/arch/x86/events/core.c
> > @@ -2578,3 +2578,34 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
> > cap->events_mask_len = x86_pmu.events_mask_len;
> > }
> > EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> > +
> > +u64 perf_get_page_size(u64 virt)
> > +{
> > + unsigned long flags;
> > + unsigned int level;
> > + pte_t *pte;
> > +
> > + if (!virt)
> > + return 0;
> > +
> > + /*
> > + * Interrupts are disabled, so it prevents any tear down
> > + * of the page tables.
> > + * See the comment near struct mmu_table_batch.
> > + */
> > + local_irq_save(flags);
> > + if (virt >= TASK_SIZE)
> > + pte = lookup_address(virt, &level);
> > + else {
> > + if (current->mm) {
> > + pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> > + virt, &level);
> > + } else
> > + level = PG_LEVEL_NUM;
> > + }
> > + local_irq_restore(flags);
> > + if (level >= PG_LEVEL_NUM)
> > + return 0;
> > +
> > + return (u64)page_level_size(level);
> > +}
>
> *sigh* there really isn't anything x86 specific there.
>
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 236bb8d..d233f45 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6352,6 +6358,12 @@ static u64 perf_virt_to_phys(u64 virt)
> > return phys_addr;
> > }
> >
> > +/* Return page size of given virtual address. IRQ-safe required. */
> > +u64 __weak perf_get_page_size(u64 virt)
> > +{
> > + return 0;
> > +}
> > +
> > static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
> >
> > struct perf_callchain_entry *
>
> How about something like so instead?
>
> (completely untested, will likely make your grandma eat puppies)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6357,10 +6357,72 @@ static u64 perf_virt_to_phys(u64 virt)
> return phys_addr;
> }
>
> -/* Return page size of given virtual address. IRQ-safe required. */
> -u64 __weak perf_get_page_size(u64 virt)
> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> {
> - return 0;
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + pgd = pgd_offset(mm, addr);
> + if (pgd_none(*pgd))
> + return 0;
> +
> + p4d = p4d_offset(pgd, addr);
> + if (p4d_none(*p4d))
> + return 0;
> +
> + if (p4d_large(*p4d))

We don't have 512GiB pages yet.

> + return 1ULL << P4D_SHIFT;

return P4D_SIZE;

And the same P?D_SIZE below.

> +
> + if (!p4d_present(*p4d))
> + return 0;

No need to check p4d_none() *and* p4d_present(). Just p4d_present() should
be enough. A large entry is still supposed to be present. The same for other levels.

> +
> + pud = pud_offset(p4d, addr);
> + if (pud_none(*pud))
> + return 0;
> +
> + if (pud_large(*pud))
> + return 1ULL << PUD_SHIFT;
> +
> + if (!pud_present(*pud))
> + return 0;
> +
> + pmd = pmd_offset(pud, addr);
> + if (pmd_none(*pmd))
> + return 0;
> +
> + if (pmd_large(*pmd))
> + return 1ULL << PMD_SHIFT;
> +
> + if (!pmd_present(*pmd))
> + return 0;
> +
> + return 1ULL << PAGE_SHIFT;
> +}
> +
> +static u64 perf_get_page_size(unsigned long addr)
> +{
> + struct mm_struct *mm;
> + unsigned long flags;
> + u64 ret;
> +
> + /*
> + * Software page-table walkers must disable IRQs, see asm-generic/tlb.h.
> + */
> + local_irq_save(flags);
> + mm = current->mm;
> + if (!mm) {
> + /*
> + * For kernel threads and the like, use init_mm so that
> + * we can find kernel memory.
> + */
> + mm = &init_mm;
> + }
> + ret = __perf_get_page_size(mm, addr);
> + local_irq_restore(flags);
> +
> + return ret;
> }
>
> static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

--
Kirill A. Shutemov