Re: [PATCH 26/26] x86, pkeys: Documentation

From: Dave Hansen
Date: Thu Oct 01 2015 - 18:33:19 EST


On 10/01/2015 01:39 PM, Kees Cook wrote:
> On Thu, Oct 1, 2015 at 4:17 AM, Ingo Molnar <mingo@xxxxxxxxxx> wrote:
>> So could we try to add an (opt-in) kernel option that enables this transparently
>> and automatically for all PROT_EXEC && !PROT_WRITE mappings, without any
>> user-space changes and syscalls necessary?
>
> I would like this very much. :)

Here it is in a quite fugly form (well, it's not opt-in). Init crashes
if I boot with this, though.

I'll see if I can turn it into a bit more of an opt-in and see what's
actually going wrong.




---

b/arch/x86/include/asm/fpu/internal.h | 4 ++++
b/arch/x86/kernel/fpu/core.c | 4 ++++
b/arch/x86/kernel/fpu/xstate.c | 16 +++++++++++++++-
b/arch/x86/mm/fault.c | 8 ++++++--
b/include/linux/mm_types.h | 1 +
b/kernel/fork.c | 3 ++-
b/kernel/sched/core.c | 3 +++
b/mm/mmap.c | 8 +++++++-
b/mm/mprotect.c | 27 ++++++++++++++++++++++++++-
9 files changed, 68 insertions(+), 6 deletions(-)

diff -puN mm/mprotect.c~pkeys-95-rewire-mprotect-to-use-pkeys mm/mprotect.c
--- a/mm/mprotect.c~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.183874598 -0700
+++ b/mm/mprotect.c 2015-10-01 15:28:14.741262888 -0700
@@ -24,6 +24,7 @@
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
+#include <linux/debugfs.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -453,10 +454,34 @@ out:
return error;
}

+u32 __read_mostly mprotect_hack_pkey = 1;
+int mprotect_hack_pkey_init(void)
+{
+ debugfs_create_u32("mprotect_hack_pkey", S_IRUSR | S_IWUSR,
+ NULL, &mprotect_hack_pkey);
+ return 0;
+}
+late_initcall(mprotect_hack_pkey_init);
+
+int pkey_for_access_protect = 1;
+int pkey_for_write_protect = 2;
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
- return do_mprotect_key(start, len, prot, 0);
+ int ret;
+ unsigned long newprot = prot;
+ u32 pkey_hack = READ_ONCE(mprotect_hack_pkey);
+ u16 pkey = 0;
+
+ if (!pkey_hack)
+ return do_mprotect_key(start, len, prot, 0);
+
+ if ((prot & PROT_EXEC) && !(prot & PROT_WRITE))
+ pkey = pkey_for_access_protect;
+
+ ret = do_mprotect_key(start, len, newprot, pkey);
+
+ return ret;
}

SYSCALL_DEFINE4(mprotect_key, unsigned long, start, size_t, len,
diff -puN include/linux/mm_types.h~pkeys-95-rewire-mprotect-to-use-pkeys include/linux/mm_types.h
--- a/include/linux/mm_types.h~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.185874687 -0700
+++ b/include/linux/mm_types.h 2015-10-01 15:21:25.227876573 -0700
@@ -486,6 +486,7 @@ struct mm_struct {
/* address of the bounds directory */
void __user *bd_addr;
#endif
+ u32 fake_mprotect_pkey;
};

static inline void mm_init_cpumask(struct mm_struct *mm)
diff -puN kernel/fork.c~pkeys-95-rewire-mprotect-to-use-pkeys kernel/fork.c
--- a/kernel/fork.c~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.187874777 -0700
+++ b/kernel/fork.c 2015-10-01 15:21:25.228876618 -0700
@@ -927,6 +927,7 @@ static struct mm_struct *dup_mm(struct t

mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
+ mm->fake_mprotect_pkey = 0;

if (mm->binfmt && !try_module_get(mm->binfmt->module))
goto free_pt;
@@ -1700,7 +1701,7 @@ long _do_fork(unsigned long clone_flags,
struct task_struct *p;
int trace = 0;
long nr;
-
+ //printk("%s()\n", __func__);
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
diff -puN arch/x86/kernel/fpu/xstate.c~pkeys-95-rewire-mprotect-to-use-pkeys arch/x86/kernel/fpu/xstate.c
--- a/arch/x86/kernel/fpu/xstate.c~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.197875226 -0700
+++ b/arch/x86/kernel/fpu/xstate.c 2015-10-01 15:21:25.228876618 -0700
@@ -41,6 +41,17 @@ u64 xfeatures_mask __read_mostly;
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+void hack_fpstate_for_pkru(struct xregs_state *xstate)
+{
+ void *__pkru;
+ xstate->header.xfeatures |= XFEATURE_MASK_PKRU;
+ __pkru = ((char *)xstate) + xstate_offsets[XFEATURE_PKRU];
+ /*
+ * Access disable PKEY 1 and
+ * Write disable PKEY 2
+ */
+ *(u32 *)__pkru = 0x00000024;
+}

/*
* Clear all of the X86_FEATURE_* bits that are unavailable
@@ -321,7 +332,10 @@ static void __init setup_init_fpu_buf(vo
init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
init_fpstate.xsave.header.xfeatures = xfeatures_mask;
}
-
+ {
+ void hack_fpstate_for_pkru(struct xregs_state *xstate);
+ hack_fpstate_for_pkru(&init_fpstate.xsave);
+ }
/*
* Init all the features state with header_bv being 0x0
*/
diff -puN arch/x86/mm/fault.c~pkeys-95-rewire-mprotect-to-use-pkeys arch/x86/mm/fault.c
--- a/arch/x86/mm/fault.c~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.204875540 -0700
+++ b/arch/x86/mm/fault.c 2015-10-01 15:21:25.229876663 -0700
@@ -902,8 +902,10 @@ static inline bool bad_area_access_from_
{
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return false;
- if (error_code & PF_PK)
+ if (error_code & PF_PK) {
+ printk("%s() PF_PK\n", __func__);
return true;
+ }
/* this checks permission keys on the VMA: */
if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE)))
return true;
@@ -1095,8 +1097,10 @@ access_error(unsigned long error_code, s
* to, for instance, confuse a protection-key-denied
* write with one for which we should do a COW.
*/
- if (error_code & PF_PK)
+ if (error_code & PF_PK) {
+ printk("%s() PF_PK\n", __func__);
return 1;
+ }
/*
* Make sure to check the VMA so that we do not perform
* faults just to hit a PF_PK as soon as we fill in a
diff -puN arch/x86/kernel/fpu/core.c~pkeys-95-rewire-mprotect-to-use-pkeys arch/x86/kernel/fpu/core.c
--- a/arch/x86/kernel/fpu/core.c~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.207875675 -0700
+++ b/arch/x86/kernel/fpu/core.c 2015-10-01 15:21:25.229876663 -0700
@@ -262,6 +262,10 @@ static void fpu_copy(struct fpu *dst_fpu
fpregs_deactivate(src_fpu);
}
preempt_enable();
+ {
+ void hack_fpstate_for_pkru(struct xregs_state *xstate);
+ hack_fpstate_for_pkru(&dst_fpu->state.xsave);
+ }
}

int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
diff -puN arch/x86/include/asm/fpu/internal.h~pkeys-95-rewire-mprotect-to-use-pkeys arch/x86/include/asm/fpu/internal.h
--- a/arch/x86/include/asm/fpu/internal.h~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.209875765 -0700
+++ b/arch/x86/include/asm/fpu/internal.h 2015-10-01 15:21:25.230876707 -0700
@@ -335,6 +335,10 @@ static inline void copy_xregs_to_kernel(

/* We should never fault when copying to a kernel buffer: */
WARN_ON_FPU(err);
+ {
+ void hack_fpstate_for_pkru(struct xregs_state *xstate);
+ hack_fpstate_for_pkru(xstate);
+ }
}

/*
diff -puN kernel/sched/core.c~pkeys-95-rewire-mprotect-to-use-pkeys kernel/sched/core.c
--- a/kernel/sched/core.c~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.216876079 -0700
+++ b/kernel/sched/core.c 2015-10-01 15:21:25.232876797 -0700
@@ -2644,6 +2644,9 @@ context_switch(struct rq *rq, struct tas
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
+ if (read_pkru() && printk_ratelimit()) {
+ printk("pid: %d pkru: 0x%x\n", current->pid, read_pkru());
+ }

return finish_task_switch(prev);
}
diff -puN mm/mmap.c~pkeys-95-rewire-mprotect-to-use-pkeys mm/mmap.c
--- a/mm/mmap.c~pkeys-95-rewire-mprotect-to-use-pkeys 2015-10-01 15:21:25.223876393 -0700
+++ b/mm/mmap.c 2015-10-01 15:25:44.327508557 -0700
@@ -1267,6 +1267,8 @@ unsigned long do_mmap(struct file *file,
unsigned long flags, vm_flags_t vm_flags,
unsigned long pgoff, unsigned long *populate)
{
+ extern u16 pkey_for_access_protect;
+ u16 pkey = 0;
struct mm_struct *mm = current->mm;

*populate = 0;
@@ -1311,7 +1313,11 @@ unsigned long do_mmap(struct file *file,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) |
+ if ((prot & PROT_EXEC) && !(prot & PROT_WRITE)) {
+ pkey = pkey_for_access_protect;
+ trace_printk("hacking mmap() to use pkey %d\n", pkey);
+ }
+ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

if (flags & MAP_LOCKED)
_