Re: [PATCH 2/6] __wr_after_init: write rare for static allocation
From: Andy Lutomirski
Date: Wed Dec 05 2018 - 18:14:15 EST
I added some s390 and powerpc people.
On Tue, Dec 4, 2018 at 4:18 AM Igor Stoppa <igor.stoppa@xxxxxxxxx> wrote:
>
> Implementation of write rare for statically allocated data, located in a
> specific memory section through the use of the __write_rare label.
>
> The basic functions are:
> - wr_memset(): write rare counterpart of memset()
> - wr_memcpy(): write rare counterpart of memcpy()
> - wr_assign(): write rare counterpart of the assignment ('=') operator
> - wr_rcu_assign_pointer(): write rare counterpart of rcu_assign_pointer()
>
> The implementation is based on code from Andy Lutomirski and Nadav Amit
> for patching the text on x86 [here goes reference to commits, once merged]
>
> The modification of write protected data is done through an alternate
> mapping of the same pages, as writable.
> This mapping is local to each core and is active only for the duration
> of each write operation.
> Local interrupts are disabled, while the alternate mapping is active.
>
> In theory, it could introduce a non-predictable delay, in a preemptible
> system, however the amount of data to be altered is likely to be far
> smaller than a page.
>
> Signed-off-by: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
>
> CC: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> CC: Nadav Amit <nadav.amit@xxxxxxxxx>
> CC: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> CC: Kees Cook <keescook@xxxxxxxxxxxx>
> CC: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> CC: linux-integrity@xxxxxxxxxxxxxxx
> CC: kernel-hardening@xxxxxxxxxxxxxxxxxx
> CC: linux-mm@xxxxxxxxx
> CC: linux-kernel@xxxxxxxxxxxxxxx
> ---
> include/linux/prmem.h | 133 ++++++++++++++++++++++++++++++++++++++++++
> init/main.c | 2 +
> mm/Kconfig | 4 ++
> mm/Makefile | 1 +
> mm/prmem.c | 124 +++++++++++++++++++++++++++++++++++++++
> 5 files changed, 264 insertions(+)
> create mode 100644 include/linux/prmem.h
> create mode 100644 mm/prmem.c
>
> diff --git a/include/linux/prmem.h b/include/linux/prmem.h
> new file mode 100644
> index 000000000000..b0131c1f5dc0
> --- /dev/null
> +++ b/include/linux/prmem.h
> @@ -0,0 +1,133 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * prmem.h: Header for memory protection library
> + *
> + * (C) Copyright 2018 Huawei Technologies Co. Ltd.
> + * Author: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
> + *
> + * Support for:
> + * - statically allocated write rare data
> + */
> +
> +#ifndef _LINUX_PRMEM_H
> +#define _LINUX_PRMEM_H
> +
> +#include <linux/set_memory.h>
> +#include <linux/mm.h>
> +#include <linux/vmalloc.h>
> +#include <linux/string.h>
> +#include <linux/slab.h>
> +#include <linux/mutex.h>
> +#include <linux/compiler.h>
> +#include <linux/irqflags.h>
> +
> +/**
> + * memtst() - test n bytes of the source to match the c value
> + * @p: beginning of the memory to test
> + * @c: byte to compare against
> + * @len: amount of bytes to test
> + *
> + * Returns 0 on success, non-zero otherwise.
> + */
> +static inline int memtst(void *p, int c, __kernel_size_t len)
> +{
> + __kernel_size_t i;
> +
> + for (i = 0; i < len; i++) {
> + u8 d = *(i + (u8 *)p) - (u8)c;
> +
> + if (unlikely(d))
> + return d;
> + }
> + return 0;
> +}
> +
> +
> +#ifndef CONFIG_PRMEM
> +
> +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> +{
> + return memset(p, c, len);
> +}
> +
> +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> +{
> + return memcpy(p, q, size);
> +}
> +
> +#define wr_assign(var, val) ((var) = (val))
> +
> +#define wr_rcu_assign_pointer(p, v) \
> + rcu_assign_pointer(p, v)
> +
> +#else
> +
> +enum wr_op_type {
> + WR_MEMCPY,
> + WR_MEMSET,
> + WR_RCU_ASSIGN_PTR,
> + WR_OPS_NUMBER,
> +};
> +
> +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> + enum wr_op_type op);
> +
> +/**
> + * wr_memset() - sets n bytes of the destination to the c value
> + * @p: beginning of the memory to write to
> + * @c: byte to replicate
> + * @len: amount of bytes to copy
> + *
> + * Returns pointer to the destination
> + */
> +static inline void *wr_memset(void *p, int c, __kernel_size_t len)
> +{
> + return __wr_op((unsigned long)p, (unsigned long)c, len, WR_MEMSET);
> +}
> +
> +/**
> + * wr_memcpy() - copies n bytes from source to destination
> + * @dst: beginning of the memory to write to
> + * @src: beginning of the memory to read from
> + * @n_bytes: amount of bytes to copy
> + *
> + * Returns pointer to the destination
> + */
> +static inline void *wr_memcpy(void *p, const void *q, __kernel_size_t size)
> +{
> + return __wr_op((unsigned long)p, (unsigned long)q, size, WR_MEMCPY);
> +}
> +
> +/**
> + * wr_assign() - sets a write-rare variable to a specified value
> + * @var: the variable to set
> + * @val: the new value
> + *
> + * Returns: the variable
> + *
> + * Note: it might be possible to optimize this, to use wr_memset in some
> + * cases (maybe with NULL?).
> + */
> +
> +#define wr_assign(var, val) ({ \
> + typeof(var) tmp = (typeof(var))val; \
> + \
> + wr_memcpy(&var, &tmp, sizeof(var)); \
> + var; \
> +})
> +
> +/**
> + * wr_rcu_assign_pointer() - initialize a pointer in rcu mode
> + * @p: the rcu pointer
> + * @v: the new value
> + *
> + * Returns the value assigned to the rcu pointer.
> + *
> + * It is provided as macro, to match rcu_assign_pointer()
> + */
> +#define wr_rcu_assign_pointer(p, v) ({ \
> + __wr_op((unsigned long)&p, v, sizeof(p), WR_RCU_ASSIGN_PTR); \
> + p; \
> +})
> +#endif
> +#endif
> diff --git a/init/main.c b/init/main.c
> index a461150adfb1..a36f2e54f937 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -498,6 +498,7 @@ void __init __weak thread_stack_cache_init(void)
> void __init __weak mem_encrypt_init(void) { }
>
> void __init __weak poking_init(void) { }
> +void __init __weak wr_poking_init(void) { }
>
> bool initcall_debug;
> core_param(initcall_debug, initcall_debug, bool, 0644);
> @@ -734,6 +735,7 @@ asmlinkage __visible void __init start_kernel(void)
> delayacct_init();
>
> poking_init();
> + wr_poking_init();
> check_bugs();
>
> acpi_subsystem_init();
> diff --git a/mm/Kconfig b/mm/Kconfig
> index d85e39da47ae..9b09339c027f 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -142,6 +142,10 @@ config ARCH_DISCARD_MEMBLOCK
> config MEMORY_ISOLATION
> bool
>
> +config PRMEM
> + def_bool n
> + depends on STRICT_KERNEL_RWX && X86_64
> +
> #
> # Only be set on architectures that have completely implemented memory hotplug
> # feature. If you are not sure, don't touch it.
> diff --git a/mm/Makefile b/mm/Makefile
> index d210cc9d6f80..ef3867c16ce0 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -58,6 +58,7 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
> obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> obj-$(CONFIG_SLOB) += slob.o
> obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> +obj-$(CONFIG_PRMEM) += prmem.o
> obj-$(CONFIG_KSM) += ksm.o
> obj-$(CONFIG_PAGE_POISONING) += page_poison.o
> obj-$(CONFIG_SLAB) += slab.o
> diff --git a/mm/prmem.c b/mm/prmem.c
> new file mode 100644
> index 000000000000..e8ab76701831
> --- /dev/null
> +++ b/mm/prmem.c
> @@ -0,0 +1,124 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * prmem.c: Memory Protection Library
> + *
> + * (C) Copyright 2017-2018 Huawei Technologies Co. Ltd.
> + * Author: Igor Stoppa <igor.stoppa@xxxxxxxxxx>
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/string.h>
> +#include <linux/compiler.h>
> +#include <linux/slab.h>
> +#include <linux/mmu_context.h>
> +#include <linux/rcupdate.h>
> +#include <linux/prmem.h>
> +
> +static __ro_after_init bool wr_ready;
> +static __ro_after_init struct mm_struct *wr_poking_mm;
> +static __ro_after_init unsigned long wr_poking_base;
> +
> +/*
> + * The following two variables are statically allocated by the linker
> + * script at the boundaries of the memory region (rounded up to
> + * multiples of PAGE_SIZE) reserved for __wr_after_init.
> + */
> +extern long __start_wr_after_init;
> +extern long __end_wr_after_init;
> +
> +static inline bool is_wr_after_init(unsigned long ptr, __kernel_size_t size)
> +{
> + unsigned long start = (unsigned long)&__start_wr_after_init;
> + unsigned long end = (unsigned long)&__end_wr_after_init;
> + unsigned long low = ptr;
> + unsigned long high = ptr + size;
> +
> + return likely(start <= low && low <= high && high <= end);
> +}
> +
> +
> +void *__wr_op(unsigned long dst, unsigned long src, __kernel_size_t len,
> + enum wr_op_type op)
> +{
You might end up wanting something like:
#ifdef __arch_wr_op
return __arch_wr_op(...);
#endif
if an arch (s390? powerpc?) decides to have a totally different
implementation of this.
Hi s390 and powerpc people: it would be nice if this generic
implementation *worked* on your architectures and if it gave you
some straightforward way to add a better arch-specific
implementation if you think that would be better.
--Andy
> + temporary_mm_state_t prev;
> + unsigned long flags;
> + unsigned long offset;
> + unsigned long wr_poking_addr;
> +
> + /* Confirm that the writable mapping exists. */
> + BUG_ON(!wr_ready);
> +
> + if (WARN_ONCE(op >= WR_OPS_NUMBER, "Invalid WR operation.") ||
> + WARN_ONCE(!is_wr_after_init(dst, len), "Invalid WR range."))
> + return (void *)dst;
> +
> + offset = dst - (unsigned long)&__start_wr_after_init;
> + wr_poking_addr = wr_poking_base + offset;
> + local_irq_save(flags);
> + prev = use_temporary_mm(wr_poking_mm);
> +
> + kasan_disable_current();
> + if (op == WR_MEMCPY)
> + memcpy((void *)wr_poking_addr, (void *)src, len);
> + else if (op == WR_MEMSET)
> + memset((u8 *)wr_poking_addr, (u8)src, len);
> + else if (op == WR_RCU_ASSIGN_PTR)
> + /* generic version of rcu_assign_pointer */
> + smp_store_release((void **)wr_poking_addr,
> + RCU_INITIALIZER((void **)src));
> + kasan_enable_current();
Hmm. I suspect this will explode quite badly on sane architectures
like s390. (In my book, despite how weird s390 is, it has a vastly
nicer model of "user" memory than any other architecture I know
of...). I think you should use copy_to_user(), etc, instead. I'm not
entirely sure what the best smp_store_release() replacement is.
Making this change may also mean you can get rid of the
kasan_disable_current().
> +
> + barrier(); /* XXX redundant? */
I think it's redundant. If unuse_temporary_mm() allows earlier stores
to hit the wrong address space, then something is very very wrong, and
something is also very very wrong if the optimizer starts moving
stores across a function call that is most definitely a barrier.
> +
> + unuse_temporary_mm(prev);
> + /* XXX make the verification optional? */
> + if (op == WR_MEMCPY)
> + BUG_ON(memcmp((void *)dst, (void *)src, len));
> + else if (op == WR_MEMSET)
> + BUG_ON(memtst((void *)dst, (u8)src, len));
> + else if (op == WR_RCU_ASSIGN_PTR)
> + BUG_ON(*(unsigned long *)dst != src);
Hmm. If you allowed cmpxchg or even plain xchg, then these bug_ons
would be thoroughly buggy, but maybe they're okay. But they should,
at most, be WARN_ON_ONCE(), given that you can trigger them by writing
the same addresses from two threads at once, and this isn't even
entirely obviously bogus given the presence of smp_store_release().
> + local_irq_restore(flags);
> + return (void *)dst;
> +}
> +
> +struct mm_struct *copy_init_mm(void);
> +void __init wr_poking_init(void)
> +{
> + unsigned long start = (unsigned long)&__start_wr_after_init;
> + unsigned long end = (unsigned long)&__end_wr_after_init;
> + unsigned long i;
> + unsigned long wr_range;
> +
> + wr_poking_mm = copy_init_mm();
> + BUG_ON(!wr_poking_mm);
> +
> + /* XXX What if it's too large to fit in the task unmapped mem? */
> + wr_range = round_up(end - start, PAGE_SIZE);
> +
> + /* Randomize the poking address base*/
> + wr_poking_base = TASK_UNMAPPED_BASE +
> + (kaslr_get_random_long("Write Rare Poking") & PAGE_MASK) %
> + (TASK_SIZE - (TASK_UNMAPPED_BASE + wr_range));
> +
> + /* Create alternate mapping for the entire wr_after_init range. */
> + for (i = start; i < end; i += PAGE_SIZE) {
> + struct page *page;
> + spinlock_t *ptl;
> + pte_t pte;
> + pte_t *ptep;
> + unsigned long wr_poking_addr;
> +
> + BUG_ON(!(page = virt_to_page(i)));
> + wr_poking_addr = i - start + wr_poking_base;
> +
> + /* The lock is not needed, but avoids open-coding. */
> + ptep = get_locked_pte(wr_poking_mm, wr_poking_addr, &ptl);
> + VM_BUG_ON(!ptep);
> +
> + pte = mk_pte(page, PAGE_KERNEL);
> + set_pte_at(wr_poking_mm, wr_poking_addr, ptep, pte);
> + spin_unlock(ptl);
> + }
> + wr_ready = true;
> +}
> --
> 2.19.1
>