Re: Yet another KPTI regression with 4.14.x series in a VM

From: Andy Lutomirski
Date: Sat Jan 13 2018 - 15:52:33 EST


On Sat, Jan 13, 2018 at 12:45 PM, Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote:
> On Sat, 13 Jan 2018, Andy Lutomirski wrote:
>> Trying to inventory this stuff scattered all over the place:
>>
>> #define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
>> #define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
>> # define X86_CR3_PTI_SWITCH_BIT 11
>> #define PTI_SWITCH_MASK
>> (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
>>
>> Blech. I wouldn't be terribly surprised if I missed a few as well. How about:
>>
>> PTI_USER_PGTABLE_BIT = PAGE_SHIFT
>> PTI_USER_PGTABLE_MASK = 1 << PTI_USER_PGTABLE_BIT
>> PTI_USER_PCID_BIT = 11
>> PTI_USER_PCID_MASK = 1 << PTI_USER_PCID_BIT
>> PTI_USER_PGTABLE_AND_PCID_MASK = PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK
>>
>> This naming would make the apparently buggy code look fishy, as it
>> should. I will give this a shot some time soon if no one beats me to
>> it.
>
> Well, the thing we tripped over is that we trusted the SDM that bit 11 is
> ignored. Seems its not and the AMD APM says that reserved bit should be
> cleared. Next time I surely stare into both....
>
> So something like the below should make it clear. I've not done the
> alternatives thing yet...
>

Looks generally sane to me.

>
> 8<-------------------
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -198,8 +198,11 @@ For 32-bit we have the following convent
> * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
> * halves:
> */
> -#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
> -#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
> +#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
> +#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT)
> +#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT
> +#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
> +#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
>
> .macro SET_NOFLUSH_BIT reg:req
> bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
> @@ -208,7 +211,7 @@ For 32-bit we have the following convent
> .macro ADJUST_KERNEL_CR3 reg:req
> ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
> /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
> - andq $(~PTI_SWITCH_MASK), \reg
> + andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
> .endm
>
> .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
> @@ -239,15 +242,18 @@ For 32-bit we have the following convent
> /* Flush needed, clear the bit */
> btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
> movq \scratch_reg2, \scratch_reg
> - jmp .Lwrcr3_\@
> + jmp .Lwrcr3_pcid_\@
>
> .Lnoflush_\@:
> movq \scratch_reg2, \scratch_reg
> SET_NOFLUSH_BIT \scratch_reg
>
> +.Lwcr3_pcid_\@:
> + orq $(PTI_USER_PCID_MASK), \scratch_reg
> +
> .Lwrcr3_\@:
> /* Flip the PGD and ASID to the user version */
> - orq $(PTI_SWITCH_MASK), \scratch_reg
> + orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
> mov \scratch_reg, %cr3
> .Lend_\@:
> .endm
> @@ -272,7 +278,7 @@ For 32-bit we have the following convent
> *
> * That indicates a kernel CR3 value, not a user CR3.
> */
> - testq $(PTI_SWITCH_MASK), \scratch_reg
> + testq $(PTI_USER_PGTABLE_MASK), \scratch_reg
> jz .Ldone_\@
>
> ADJUST_KERNEL_CR3 \scratch_reg
> @@ -290,7 +296,7 @@ For 32-bit we have the following convent
> * KERNEL pages can always resume with NOFLUSH as we do
> * explicit flushes.
> */
> - bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
> + bt $PTI_USER_PGTABLE_BIT, \save_reg
> jnc .Lnoflush_\@
>
> /*
> --- a/arch/x86/include/asm/processor-flags.h
> +++ b/arch/x86/include/asm/processor-flags.h
> @@ -40,7 +40,7 @@
> #define CR3_NOFLUSH BIT_ULL(63)
>
> #ifdef CONFIG_PAGE_TABLE_ISOLATION
> -# define X86_CR3_PTI_SWITCH_BIT 11
> +# define X86_CR3_PTI_PCID_USER_BIT 11
> #endif
>
> #else
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -81,13 +81,13 @@ static inline u16 kern_pcid(u16 asid)
> * Make sure that the dynamic ASID space does not confict with the
> * bit we are using to switch between user and kernel ASIDs.
> */
> - BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
> + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
>
> /*
> * The ASID being passed in here should have respected the
> * MAX_ASID_AVAILABLE and thus never have the switch bit set.
> */
> - VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
> + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
> #endif
> /*
> * The dynamically-assigned ASIDs that get passed in are small
> @@ -112,7 +112,7 @@ static inline u16 user_pcid(u16 asid)
> {
> u16 ret = kern_pcid(asid);
> #ifdef CONFIG_PAGE_TABLE_ISOLATION
> - ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
> + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
> #endif
> return ret;
> }
>
>
>