Re: [PATCH v5 29/32] x86/mm: Add support to encrypt the kernel in-place

From: Borislav Petkov
Date: Thu May 18 2017 - 08:46:43 EST


On Tue, Apr 18, 2017 at 04:21:49PM -0500, Tom Lendacky wrote:
> Add the support to encrypt the kernel in-place. This is done by creating
> new page mappings for the kernel - a decrypted write-protected mapping
> and an encrypted mapping. The kernel is encrypted by copying it through
> a temporary buffer.
>
> Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
> ---
> arch/x86/include/asm/mem_encrypt.h | 6 +
> arch/x86/mm/Makefile | 2
> arch/x86/mm/mem_encrypt.c | 262 ++++++++++++++++++++++++++++++++++++
> arch/x86/mm/mem_encrypt_boot.S | 151 +++++++++++++++++++++
> 4 files changed, 421 insertions(+)
> create mode 100644 arch/x86/mm/mem_encrypt_boot.S
>
> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
> index b406df2..8f6f9b4 100644
> --- a/arch/x86/include/asm/mem_encrypt.h
> +++ b/arch/x86/include/asm/mem_encrypt.h
> @@ -31,6 +31,12 @@ static inline u64 sme_dma_mask(void)
> return ((u64)sme_me_mask << 1) - 1;
> }
>
> +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
> + unsigned long decrypted_kernel_vaddr,
> + unsigned long kernel_len,
> + unsigned long encryption_wa,
> + unsigned long encryption_pgd);
> +
> void __init sme_early_encrypt(resource_size_t paddr,
> unsigned long size);
> void __init sme_early_decrypt(resource_size_t paddr,
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 9e13841..0633142 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -38,3 +38,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
> obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
> obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
> obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
> +
> +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 30b07a3..0ff41a4 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -24,6 +24,7 @@
> #include <asm/setup.h>
> #include <asm/bootparam.h>
> #include <asm/cacheflush.h>
> +#include <asm/sections.h>
>
> /*
> * Since SME related variables are set early in the boot process they must
> @@ -216,8 +217,269 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
> set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
> }
>
> +void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,

static

> + unsigned long end)
> +{
> + unsigned long addr = start;
> + pgdval_t *pgd_p;
> +
> + while (addr < end) {
> + unsigned long pgd_end;
> +
> + pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
> + if (pgd_end > end)
> + pgd_end = end;
> +
> + pgd_p = (pgdval_t *)pgd_base + pgd_index(addr);
> + *pgd_p = 0;

Hmm, so this is a contiguous range from [start:end] which translates to
8-byte PGD pointers in the PGD page so you can simply memset that range,
no?

Instead of iterating over each one?

> +
> + addr = pgd_end;
> + }
> +}
> +
> +#define PGD_FLAGS _KERNPG_TABLE_NOENC
> +#define PUD_FLAGS _KERNPG_TABLE_NOENC
> +#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
> +
> +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
> + unsigned long vaddr, pmdval_t pmd_val)
> +{
> + pgdval_t pgd, *pgd_p;
> + pudval_t pud, *pud_p;
> + pmdval_t pmd, *pmd_p;

You should use the enclosing type, not the underlying one. I.e.,

pgd_t *pgd;
pud_t *pud;
...

and then the macros native_p*d_val(), p*d_offset() and so on. I say
native_* because we don't want to have any paravirt nastyness here.
I believe your previous version was using the proper interfaces.

And the kernel has gotten 5-level pagetables support in
the meantime, so this'll need to start at p4d AFAICT.
arch/x86/mm/fault.c::dump_pagetable() looks like a good example to stare
at.

> + pgd_p = (pgdval_t *)pgd_base + pgd_index(vaddr);
> + pgd = *pgd_p;
> + if (pgd) {
> + pud_p = (pudval_t *)(pgd & ~PTE_FLAGS_MASK);
> + } else {
> + pud_p = pgtable_area;
> + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
> + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
> +
> + *pgd_p = (pgdval_t)pud_p + PGD_FLAGS;
> + }
> +
> + pud_p += pud_index(vaddr);
> + pud = *pud_p;
> + if (pud) {
> + if (pud & _PAGE_PSE)
> + goto out;
> +
> + pmd_p = (pmdval_t *)(pud & ~PTE_FLAGS_MASK);
> + } else {
> + pmd_p = pgtable_area;
> + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
> + pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
> +
> + *pud_p = (pudval_t)pmd_p + PUD_FLAGS;
> + }
> +
> + pmd_p += pmd_index(vaddr);
> + pmd = *pmd_p;
> + if (!pmd || !(pmd & _PAGE_PSE))
> + *pmd_p = pmd_val;
> +
> +out:
> + return pgtable_area;
> +}
> +
> +static unsigned long __init sme_pgtable_calc(unsigned long len)
> +{
> + unsigned long pud_tables, pmd_tables;
> + unsigned long total = 0;
> +
> + /*
> + * Perform a relatively simplistic calculation of the pagetable
> + * entries that are needed. That mappings will be covered by 2MB
> + * PMD entries so we can conservatively calculate the required
> + * number of PUD and PMD structures needed to perform the mappings.
> + * Incrementing the count for each covers the case where the
> + * addresses cross entries.
> + */
> + pud_tables = ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE;
> + pud_tables++;
> + pmd_tables = ALIGN(len, PUD_SIZE) / PUD_SIZE;
> + pmd_tables++;
> +
> + total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
> + total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
> +
> + /*
> + * Now calculate the added pagetable structures needed to populate
> + * the new pagetables.
> + */

Nice commenting, helps following what's going on.

> + pud_tables = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
> + pmd_tables = ALIGN(total, PUD_SIZE) / PUD_SIZE;
> +
> + total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
> + total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
> +
> + return total;
> +}
> +
> void __init sme_encrypt_kernel(void)
> {
> + pgd_t *pgd;
> + void *pgtable_area;
> + unsigned long kernel_start, kernel_end, kernel_len;
> + unsigned long workarea_start, workarea_end, workarea_len;
> + unsigned long execute_start, execute_end, execute_len;
> + unsigned long pgtable_area_len;
> + unsigned long decrypted_base;
> + unsigned long paddr, pmd_flags;


Please sort function local variables declaration in a reverse christmas
tree order:

<type> longest_variable_name;
<type> shorter_var_name;
<type> even_shorter;
<type> i;

> +
> + if (!sme_active())
> + return;

...

> diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
> new file mode 100644
> index 0000000..fb58f9f
> --- /dev/null
> +++ b/arch/x86/mm/mem_encrypt_boot.S
> @@ -0,0 +1,151 @@
> +/*
> + * AMD Memory Encryption Support
> + *
> + * Copyright (C) 2016 Advanced Micro Devices, Inc.
> + *
> + * Author: Tom Lendacky <thomas.lendacky@xxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/pgtable.h>
> +#include <asm/page.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr-index.h>
> +
> + .text
> + .code64
> +ENTRY(sme_encrypt_execute)
> +
> + /*
> + * Entry parameters:
> + * RDI - virtual address for the encrypted kernel mapping
> + * RSI - virtual address for the decrypted kernel mapping
> + * RDX - length of kernel
> + * RCX - virtual address of the encryption workarea, including:
> + * - stack page (PAGE_SIZE)
> + * - encryption routine page (PAGE_SIZE)
> + * - intermediate copy buffer (PMD_PAGE_SIZE)
> + * R8 - physcial address of the pagetables to use for encryption
> + */
> +
> + push %rbp
> + push %r12
> +
> + /* Set up a one page stack in the non-encrypted memory area */
> + movq %rsp, %rbp /* Save current stack pointer */
> + movq %rcx, %rax /* Workarea stack page */
> + movq %rax, %rsp /* Set new stack pointer */
> + addq $PAGE_SIZE, %rsp /* Stack grows from the bottom */
> + addq $PAGE_SIZE, %rax /* Workarea encryption routine */
> +
> + movq %rdi, %r10 /* Encrypted kernel */
> + movq %rsi, %r11 /* Decrypted kernel */
> + movq %rdx, %r12 /* Kernel length */
> +
> + /* Copy encryption routine into the workarea */
> + movq %rax, %rdi /* Workarea encryption routine */
> + leaq .Lenc_start(%rip), %rsi /* Encryption routine */
> + movq $(.Lenc_stop - .Lenc_start), %rcx /* Encryption routine length */
> + rep movsb
> +
> + /* Setup registers for call */
> + movq %r10, %rdi /* Encrypted kernel */
> + movq %r11, %rsi /* Decrypted kernel */
> + movq %r8, %rdx /* Pagetables used for encryption */
> + movq %r12, %rcx /* Kernel length */
> + movq %rax, %r8 /* Workarea encryption routine */
> + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
> +
> + call *%rax /* Call the encryption routine */
> +
> + movq %rbp, %rsp /* Restore original stack pointer */
> +
> + pop %r12
> + pop %rbp
> +
> + ret
> +ENDPROC(sme_encrypt_execute)
> +
> +.Lenc_start:
> +ENTRY(sme_enc_routine)

A function called a "routine"? Why do we need the global symbol?
Nothing's referencing it AFAICT.

> +/*
> + * Routine used to encrypt kernel.
> + * This routine must be run outside of the kernel proper since
> + * the kernel will be encrypted during the process. So this
> + * routine is defined here and then copied to an area outside
> + * of the kernel where it will remain and run decrypted
> + * during execution.
> + *
> + * On entry the registers must be:
> + * RDI - virtual address for the encrypted kernel mapping
> + * RSI - virtual address for the decrypted kernel mapping
> + * RDX - address of the pagetables to use for encryption
> + * RCX - length of kernel
> + * R8 - intermediate copy buffer
> + *
> + * RAX - points to this routine
> + *
> + * The kernel will be encrypted by copying from the non-encrypted
> + * kernel space to an intermediate buffer and then copying from the
> + * intermediate buffer back to the encrypted kernel space. The physical
> + * addresses of the two kernel space mappings are the same which
> + * results in the kernel being encrypted "in place".
> + */
> + /* Enable the new page tables */
> + mov %rdx, %cr3
> +
> + /* Flush any global TLBs */
> + mov %cr4, %rdx
> + andq $~X86_CR4_PGE, %rdx
> + mov %rdx, %cr4
> + orq $X86_CR4_PGE, %rdx
> + mov %rdx, %cr4
> +
> + /* Set the PAT register PA5 entry to write-protect */
> + push %rcx
> + movl $MSR_IA32_CR_PAT, %ecx
> + rdmsr
> + push %rdx /* Save original PAT value */
> + andl $0xffff00ff, %edx /* Clear PA5 */
> + orl $0x00000500, %edx /* Set PA5 to WP */

Maybe check first whether PA5 is already set correctly and avoid the
WRMSR and the restoring below too?

> + wrmsr
> + pop %rdx /* RDX contains original PAT value */
> + pop %rcx
> +
> + movq %rcx, %r9 /* Save kernel length */
> + movq %rdi, %r10 /* Save encrypted kernel address */
> + movq %rsi, %r11 /* Save decrypted kernel address */
> +
> + wbinvd /* Invalidate any cache entries */
> +
> + /* Copy/encrypt 2MB at a time */
> +1:
> + movq %r11, %rsi /* Source - decrypted kernel */
> + movq %r8, %rdi /* Dest - intermediate copy buffer */
> + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
> + rep movsb

not movsQ?

> + movq %r8, %rsi /* Source - intermediate copy buffer */
> + movq %r10, %rdi /* Dest - encrypted kernel */
> + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
> + rep movsb
> +
> + addq $PMD_PAGE_SIZE, %r11
> + addq $PMD_PAGE_SIZE, %r10
> + subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */
> + jnz 1b /* Kernel length not zero? */
> +
> + /* Restore PAT register */
> + push %rdx /* Save original PAT value */
> + movl $MSR_IA32_CR_PAT, %ecx
> + rdmsr
> + pop %rdx /* Restore original PAT value */
> + wrmsr
> +
> + ret
> +ENDPROC(sme_enc_routine)
> +.Lenc_stop:
>

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.