Re: [kernel-hardening] [PATCH v5 04/10] arm64: Add __flush_tlb_one()

From: Mark Rutland
Date: Thu Aug 24 2017 - 11:46:39 EST


On Wed, Aug 23, 2017 at 11:13:02AM -0600, Tycho Andersen wrote:
> On Wed, Aug 23, 2017 at 06:04:43PM +0100, Mark Rutland wrote:
> > On Wed, Aug 23, 2017 at 10:58:42AM -0600, Tycho Andersen wrote:
> > > Hi Mark,
> > >
> > > On Mon, Aug 14, 2017 at 05:50:47PM +0100, Mark Rutland wrote:
> > > > That said, is there any reason not to use flush_tlb_kernel_range()
> > > > directly?
> > >
> > > So it turns out that there is a difference between __flush_tlb_one() and
> > > flush_tlb_kernel_range() on x86: flush_tlb_kernel_range() flushes all the TLBs
> > > via on_each_cpu(), whereas __flush_tlb_one() only flushes the local TLB (which
> > > I think is enough here).
> >
> > That sounds suspicious; I don't think that __flush_tlb_one() is
> > sufficient.
> >
> > If you only do local TLB maintenance, then the page is left accessible
> > to other CPUs via the (stale) kernel mappings. i.e. the page isn't
> > exclusively mapped by userspace.
>
> I thought so too, so I tried to test it with something like the patch
> below. But it correctly failed for me when using __flush_tlb_one(). I
> suppose I'm doing something wrong in the test, but I'm not sure what.

I suspect the issue is that you use a completion to synchronise the
mapping.

The reader thread will block (i.e. it will go into schedule() and
something else will run), and I guess that on x86 the context switch
this entails upon completion happens to invalidate the TLBs.

Instead, you could serialise the update with the reader doing:

/* spin until address is published to us */
addr = smp_cond_load_acquire(&arg->virt_addr, VAL != NULL);

read_map(addr);

... and the writer doing:

user_addr = do_map(...);

...

smp_store_release(&arg->virt_addr, user_to_kernel(user_addr));

There would still be a chance of a context switch, but it wouldn't be
mandatory.
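
For example, the reader side could end up looking something like this
(a completely untested sketch against your patch below; XPFO_FAIL_ADDR
is a made-up sentinel that the writer would publish when do_map() or
user_to_kernel() fails, so that the reader doesn't spin forever):

/* hypothetical sentinel for a failed map */
#define XPFO_FAIL_ADDR	((unsigned long *)-1UL)

static int smp_reader(void *parg)
{
	struct smp_arg *arg = parg;
	unsigned long *addr;

	if (arg->cpu != smp_processor_id()) {
		pr_err("FAIL: scheduled on wrong CPU?\n");
		return 0;
	}

	/* spin until the parent publishes the kernel alias */
	addr = smp_cond_load_acquire(&arg->virt_addr, VAL != NULL);

	if (addr != XPFO_FAIL_ADDR)
		read_map(addr);

	return 0;
}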

As an aside, it looks like DEBUG_PAGEALLOC on x86 has the same problem
w.r.t. under-invalidation, judging by the comments in x86's
__kernel_map_pages(). It only invalidates the local TLBs, even though
it should do so on all CPUs.
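
For reference, the comment I mean reads roughly as follows (quoting
arch/x86/mm/pageattr.c from memory, so the wording may not be exact):

	/*
	 * We should perform an IPI and flush all tlbs,
	 * but that can deadlock->flush only current cpu:
	 */
	__flush_tlb_all();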

Thanks,
Mark.

>
> Tycho
>
>
> From 1d1b0a18d56cf1634072096231bfbaa96cb2aa16 Mon Sep 17 00:00:00 2001
> From: Tycho Andersen <tycho@xxxxxxxxxx>
> Date: Tue, 22 Aug 2017 18:07:12 -0600
> Subject: [PATCH] add XPFO_SMP test
>
> Signed-off-by: Tycho Andersen <tycho@xxxxxxxxxx>
> ---
> drivers/misc/lkdtm.h | 1 +
> drivers/misc/lkdtm_core.c | 1 +
> drivers/misc/lkdtm_xpfo.c | 139 ++++++++++++++++++++++++++++++++++++++++++----
> 3 files changed, 130 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/misc/lkdtm.h b/drivers/misc/lkdtm.h
> index fc53546113c1..34a6ee37f216 100644
> --- a/drivers/misc/lkdtm.h
> +++ b/drivers/misc/lkdtm.h
> @@ -67,5 +67,6 @@ void lkdtm_USERCOPY_KERNEL(void);
> /* lkdtm_xpfo.c */
> void lkdtm_XPFO_READ_USER(void);
> void lkdtm_XPFO_READ_USER_HUGE(void);
> +void lkdtm_XPFO_SMP(void);
>
> #endif
> diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c
> index 164bc404f416..9544e329de4b 100644
> --- a/drivers/misc/lkdtm_core.c
> +++ b/drivers/misc/lkdtm_core.c
> @@ -237,6 +237,7 @@ struct crashtype crashtypes[] = {
> CRASHTYPE(USERCOPY_KERNEL),
> CRASHTYPE(XPFO_READ_USER),
> CRASHTYPE(XPFO_READ_USER_HUGE),
> + CRASHTYPE(XPFO_SMP),
> };
>
>
> diff --git a/drivers/misc/lkdtm_xpfo.c b/drivers/misc/lkdtm_xpfo.c
> index c72509128eb3..7600fdcae22f 100644
> --- a/drivers/misc/lkdtm_xpfo.c
> +++ b/drivers/misc/lkdtm_xpfo.c
> @@ -4,22 +4,27 @@
>
> #include "lkdtm.h"
>
> +#include <linux/cpumask.h>
> #include <linux/mman.h>
> #include <linux/uaccess.h>
> #include <linux/xpfo.h>
> +#include <linux/kthread.h>
>
> -void read_user_with_flags(unsigned long flags)
> +#include <linux/delay.h>
> +#include <linux/sched/task.h>
> +
> +#define XPFO_DATA 0xdeadbeef
> +
> +static unsigned long do_map(unsigned long flags)
> {
> - unsigned long user_addr, user_data = 0xdeadbeef;
> - phys_addr_t phys_addr;
> - void *virt_addr;
> + unsigned long user_addr, user_data = XPFO_DATA;
>
> user_addr = vm_mmap(NULL, 0, PAGE_SIZE,
> PROT_READ | PROT_WRITE | PROT_EXEC,
> flags, 0);
> if (user_addr >= TASK_SIZE) {
> pr_warn("Failed to allocate user memory\n");
> - return;
> + return 0;
> }
>
> if (copy_to_user((void __user *)user_addr, &user_data,
> @@ -28,25 +33,61 @@ void read_user_with_flags(unsigned long flags)
> goto free_user;
> }
>
> + return user_addr;
> +
> +free_user:
> + vm_munmap(user_addr, PAGE_SIZE);
> + return 0;
> +}
> +
> +static unsigned long *user_to_kernel(unsigned long user_addr)
> +{
> + phys_addr_t phys_addr;
> + void *virt_addr;
> +
> phys_addr = user_virt_to_phys(user_addr);
> if (!phys_addr) {
> pr_warn("Failed to get physical address of user memory\n");
> - goto free_user;
> + return NULL;
> }
>
> virt_addr = phys_to_virt(phys_addr);
> if (phys_addr != virt_to_phys(virt_addr)) {
> pr_warn("Physical address of user memory seems incorrect\n");
> - goto free_user;
> + return NULL;
> }
>
> + return virt_addr;
> +}
> +
> +static void read_map(unsigned long *virt_addr)
> +{
> pr_info("Attempting bad read from kernel address %p\n", virt_addr);
> - if (*(unsigned long *)virt_addr == user_data)
> - pr_info("Huh? Bad read succeeded?!\n");
> + if (*(unsigned long *)virt_addr == XPFO_DATA)
> + pr_err("FAIL: Bad read succeeded?!\n");
> else
> - pr_info("Huh? Bad read didn't fail but data is incorrect?!\n");
> + pr_err("FAIL: Bad read didn't fail but data is incorrect?!\n");
> +}
> +
> +static void read_user_with_flags(unsigned long flags)
> +{
> + unsigned long user_addr, *kernel;
> +
> + user_addr = do_map(flags);
> + if (!user_addr) {
> + pr_err("FAIL: map failed\n");
> + return;
> + }
> +
> + kernel = user_to_kernel(user_addr);
> + if (!kernel) {
> + pr_err("FAIL: user to kernel conversion failed\n");
> + goto free_user;
> + }
> +
> + read_map(kernel);
>
> - free_user:
> +free_user:
> vm_munmap(user_addr, PAGE_SIZE);
> }
>
> @@ -60,3 +101,79 @@ void lkdtm_XPFO_READ_USER_HUGE(void)
> {
> read_user_with_flags(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB);
> }
> +
> +struct smp_arg {
> + struct completion map_done;
> + unsigned long *virt_addr;
> + unsigned int cpu;
> +};
> +
> +static int smp_reader(void *parg)
> +{
> + struct smp_arg *arg = parg;
> +
> + if (arg->cpu != smp_processor_id()) {
> + pr_err("FAIL: scheduled on wrong CPU?\n");
> + return 0;
> + }
> +
> + wait_for_completion(&arg->map_done);
> +
> + if (arg->virt_addr)
> + read_map(arg->virt_addr);
> +
> + return 0;
> +}
> +
> +/* The idea here is to read from the kernel's map on a different thread than
> + * did the mapping (and thus the TLB flushing), to make sure that the page
> + * faults on other cores too.
> + */
> +void lkdtm_XPFO_SMP(void)
> +{
> + unsigned long user_addr;
> + struct task_struct *thread;
> + int ret;
> + struct smp_arg arg = { .virt_addr = NULL };
> +
> + init_completion(&arg.map_done);
> +
> + if (num_online_cpus() < 2) {
> + pr_err("not enough to do a multi cpu test\n");
> + return;
> + }
> +
> + arg.cpu = (smp_processor_id() + 1) % num_online_cpus();
> + thread = kthread_create(smp_reader, &arg, "lkdtm_xpfo_test");
> + if (IS_ERR(thread)) {
> + pr_err("couldn't create kthread? %ld\n", PTR_ERR(thread));
> + return;
> + }
> +
> + kthread_bind(thread, arg.cpu);
> + get_task_struct(thread);
> + wake_up_process(thread);
> +
> + user_addr = do_map(MAP_PRIVATE | MAP_ANONYMOUS);
> + if (user_addr) {
> + arg.virt_addr = user_to_kernel(user_addr);
> + /* child thread checks for failure */
> + }
> +
> + complete(&arg.map_done);
> +
> + /* there must be a better way to do this. */
> + while (1) {
> + if (thread->exit_state)
> + break;
> + msleep_interruptible(100);
> + }
> +
> + ret = kthread_stop(thread);
> + if (ret != SIGKILL)
> + pr_err("FAIL: thread wasn't killed: %d\n", ret);
> + put_task_struct(thread);
> +
> + if (user_addr)
> + vm_munmap(user_addr, PAGE_SIZE);
> +}
> --
> 2.11.0
>