Re: [kernel-hardening] [PATCH v5 04/10] arm64: Add __flush_tlb_one()
From: Tycho Andersen
Date: Tue Aug 29 2017 - 13:24:40 EST
Hi Mark,
On Thu, Aug 24, 2017 at 04:45:19PM +0100, Mark Rutland wrote:
> On Wed, Aug 23, 2017 at 11:13:02AM -0600, Tycho Andersen wrote:
> > On Wed, Aug 23, 2017 at 06:04:43PM +0100, Mark Rutland wrote:
> > > On Wed, Aug 23, 2017 at 10:58:42AM -0600, Tycho Andersen wrote:
> > > > Hi Mark,
> > > >
> > > > On Mon, Aug 14, 2017 at 05:50:47PM +0100, Mark Rutland wrote:
> > > > > That said, is there any reason not to use flush_tlb_kernel_range()
> > > > > directly?
> > > >
> > > > So it turns out that there is a difference between __flush_tlb_one() and
> > > > flush_tlb_kernel_range() on x86: flush_tlb_kernel_range() flushes all the TLBs
> > > > via on_each_cpu(), whereas __flush_tlb_one() only flushes the local TLB (which
> > > > I think is enough here).
> > >
> > > That sounds suspicious; I don't think that __flush_tlb_one() is
> > > sufficient.
> > >
> > > If you only do local TLB maintenance, then the page is left accessible
> > > to other CPUs via the (stale) kernel mappings. i.e. the page isn't
> > > exclusively mapped by userspace.
> >
> > I thought so too, so I tried to test it with something like the patch
> > below. But it correctly failed for me when using __flush_tlb_one(). I
> > suppose I'm doing something wrong in the test, but I'm not sure what.
>
> I suspect the issue is that you use a completion to synchronise the
> mapping.
>
> The reader thread will block (i.e. we go into schedule() and
> something else will run), and I guess that on x86, the
> context-switch this entails upon completion happens to invalidate the
> TLBs.
>
> Instead, you could serialise the update with the reader doing:
>
> /* spin until address is published to us */
> addr = smp_cond_load_acquire(arg->virt_addr, VAL != NULL);
>
> read_map(addr);
>
> ... and the writer doing:
>
> user_addr = do_map(...)
>
> ...
>
> smp_store_release(arg->virt_addr, user_addr);
>
> There would still be a chance of a context-switch, but it wouldn't be
> mandatory.
>
Thanks for this. However, I've tried several different variations
(including just raw spinning), and it still seems to fail correctly
for me.
I'll see about just cleaning up the series with this round of feedback
and post the whole thing again with CCs to the arm/x86 lists where
appropriate, and hopefully someone can tell me what I'm doing wrong.
Thanks,
Tycho
> As an aside, it looks like DEBUG_PAGEALLOC on x86 has the problem w.r.t.
> under-invalidating, judging by the comments in x86's
> __kernel_map_pages(). It only invalidates the local TLBs, even though
> it should do it on all CPUs.
>
> Thanks,
> Mark.
>
> >
> > Tycho
> >
> >
> > From 1d1b0a18d56cf1634072096231bfbaa96cb2aa16 Mon Sep 17 00:00:00 2001
> > From: Tycho Andersen <tycho@xxxxxxxxxx>
> > Date: Tue, 22 Aug 2017 18:07:12 -0600
> > Subject: [PATCH] add XPFO_SMP test
> >
> > Signed-off-by: Tycho Andersen <tycho@xxxxxxxxxx>
> > ---
> > drivers/misc/lkdtm.h | 1 +
> > drivers/misc/lkdtm_core.c | 1 +
> > drivers/misc/lkdtm_xpfo.c | 139 ++++++++++++++++++++++++++++++++++++++++++----
> > 3 files changed, 130 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/misc/lkdtm.h b/drivers/misc/lkdtm.h
> > index fc53546113c1..34a6ee37f216 100644
> > --- a/drivers/misc/lkdtm.h
> > +++ b/drivers/misc/lkdtm.h
> > @@ -67,5 +67,6 @@ void lkdtm_USERCOPY_KERNEL(void);
> > /* lkdtm_xpfo.c */
> > void lkdtm_XPFO_READ_USER(void);
> > void lkdtm_XPFO_READ_USER_HUGE(void);
> > +void lkdtm_XPFO_SMP(void);
> >
> > #endif
> > diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c
> > index 164bc404f416..9544e329de4b 100644
> > --- a/drivers/misc/lkdtm_core.c
> > +++ b/drivers/misc/lkdtm_core.c
> > @@ -237,6 +237,7 @@ struct crashtype crashtypes[] = {
> > CRASHTYPE(USERCOPY_KERNEL),
> > CRASHTYPE(XPFO_READ_USER),
> > CRASHTYPE(XPFO_READ_USER_HUGE),
> > + CRASHTYPE(XPFO_SMP),
> > };
> >
> >
> > diff --git a/drivers/misc/lkdtm_xpfo.c b/drivers/misc/lkdtm_xpfo.c
> > index c72509128eb3..7600fdcae22f 100644
> > --- a/drivers/misc/lkdtm_xpfo.c
> > +++ b/drivers/misc/lkdtm_xpfo.c
> > @@ -4,22 +4,27 @@
> >
> > #include "lkdtm.h"
> >
> > +#include <linux/cpumask.h>
> > #include <linux/mman.h>
> > #include <linux/uaccess.h>
> > #include <linux/xpfo.h>
> > +#include <linux/kthread.h>
> >
> > -void read_user_with_flags(unsigned long flags)
> > +#include <linux/delay.h>
> > +#include <linux/sched/task.h>
> > +
> > +#define XPFO_DATA 0xdeadbeef
> > +
> > +static unsigned long do_map(unsigned long flags)
> > {
> > - unsigned long user_addr, user_data = 0xdeadbeef;
> > - phys_addr_t phys_addr;
> > - void *virt_addr;
> > + unsigned long user_addr, user_data = XPFO_DATA;
> >
> > user_addr = vm_mmap(NULL, 0, PAGE_SIZE,
> > PROT_READ | PROT_WRITE | PROT_EXEC,
> > flags, 0);
> > if (user_addr >= TASK_SIZE) {
> > pr_warn("Failed to allocate user memory\n");
> > - return;
> > + return 0;
> > }
> >
> > if (copy_to_user((void __user *)user_addr, &user_data,
> > @@ -28,25 +33,61 @@ void read_user_with_flags(unsigned long flags)
> > goto free_user;
> > }
> >
> > + return user_addr;
> > +
> > +free_user:
> > + vm_munmap(user_addr, PAGE_SIZE);
> > + return 0;
> > +}
> > +
> > +static unsigned long *user_to_kernel(unsigned long user_addr)
> > +{
> > + phys_addr_t phys_addr;
> > + void *virt_addr;
> > +
> > phys_addr = user_virt_to_phys(user_addr);
> > if (!phys_addr) {
> > pr_warn("Failed to get physical address of user memory\n");
> > - goto free_user;
> > + return 0;
> > }
> >
> > virt_addr = phys_to_virt(phys_addr);
> > if (phys_addr != virt_to_phys(virt_addr)) {
> > pr_warn("Physical address of user memory seems incorrect\n");
> > - goto free_user;
> > + return 0;
> > }
> >
> > + return virt_addr;
> > +}
> > +
> > +static void read_map(unsigned long *virt_addr)
> > +{
> > pr_info("Attempting bad read from kernel address %p\n", virt_addr);
> > - if (*(unsigned long *)virt_addr == user_data)
> > - pr_info("Huh? Bad read succeeded?!\n");
> > + if (*(unsigned long *)virt_addr == XPFO_DATA)
> > + pr_err("FAIL: Bad read succeeded?!\n");
> > else
> > - pr_info("Huh? Bad read didn't fail but data is incorrect?!\n");
> > + pr_err("FAIL: Bad read didn't fail but data is incorrect?!\n");
> > +}
> > +
> > +static void read_user_with_flags(unsigned long flags)
> > +{
> > + unsigned long user_addr, *kernel;
> > +
> > + user_addr = do_map(flags);
> > + if (!user_addr) {
> > + pr_err("FAIL: map failed\n");
> > + return;
> > + }
> > +
> > + kernel = user_to_kernel(user_addr);
> > + if (!kernel) {
> > + pr_err("FAIL: user to kernel conversion failed\n");
> > + goto free_user;
> > + }
> > +
> > + read_map(kernel);
> >
> > - free_user:
> > +free_user:
> > vm_munmap(user_addr, PAGE_SIZE);
> > }
> >
> > @@ -60,3 +101,79 @@ void lkdtm_XPFO_READ_USER_HUGE(void)
> > {
> > read_user_with_flags(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB);
> > }
> > +
> > +struct smp_arg {
> > + struct completion map_done;
> > + unsigned long *virt_addr;
> > + unsigned int cpu;
> > +};
> > +
> > +static int smp_reader(void *parg)
> > +{
> > + struct smp_arg *arg = parg;
> > +
> > + if (arg->cpu != smp_processor_id()) {
> > + pr_err("FAIL: scheduled on wrong CPU?\n");
> > + return 0;
> > + }
> > +
> > + wait_for_completion(&arg->map_done);
> > +
> > + if (arg->virt_addr)
> > + read_map(arg->virt_addr);
> > +
> > + return 0;
> > +}
> > +
> > +/* The idea here is to read from the kernel's map on a different thread than
> > + * did the mapping (and thus the TLB flushing), to make sure that the page
> > + * faults on other cores too.
> > + */
> > +void lkdtm_XPFO_SMP(void)
> > +{
> > + unsigned long user_addr;
> > + struct task_struct *thread;
> > + int ret;
> > + struct smp_arg arg;
> > +
> > + init_completion(&arg.map_done);
> > +
> > + if (num_online_cpus() < 2) {
> > + pr_err("not enough to do a multi cpu test\n");
> > + return;
> > + }
> > +
> > + arg.cpu = (smp_processor_id() + 1) % num_online_cpus();
> > + thread = kthread_create(smp_reader, &arg, "lkdtm_xpfo_test");
> > + if (IS_ERR(thread)) {
> > + pr_err("couldn't create kthread? %ld\n", PTR_ERR(thread));
> > + return;
> > + }
> > +
> > + kthread_bind(thread, arg.cpu);
> > + get_task_struct(thread);
> > + wake_up_process(thread);
> > +
> > + user_addr = do_map(MAP_PRIVATE | MAP_ANONYMOUS);
> > + if (user_addr) {
> > + arg.virt_addr = user_to_kernel(user_addr);
> > + /* child thread checks for failure */
> > + }
> > +
> > + complete(&arg.map_done);
> > +
> > + /* there must be a better way to do this. */
> > + while (1) {
> > + if (thread->exit_state)
> > + break;
> > + msleep_interruptible(100);
> > + }
> > +
> > + ret = kthread_stop(thread);
> > + if (ret != SIGKILL)
> > + pr_err("FAIL: thread wasn't killed: %d\n", ret);
> > + put_task_struct(thread);
> > +
> > + if (user_addr)
> > + vm_munmap(user_addr, PAGE_SIZE);
> > +}
> > --
> > 2.11.0
> >