Re: [patch 7/9] x86/iopl: Restrict iopl() permission scope

From: Andy Lutomirski
Date: Sun Nov 10 2019 - 12:26:10 EST


On 11/6/19 11:35 AM, Thomas Gleixner wrote:
> The access to the full I/O port range can be also provided by the TSS I/O
> bitmap, but that would require to copy 8k of data on scheduling in the
> task. As shown with the sched out optimization TSS.io_bitmap_base can be
> used to switch the incoming task to a preallocated I/O bitmap which has all
> bits zero, i.e. allows access to all I/O ports.
>
> Implementing this allows to provide an iopl() emulation mode which restricts
> the IOPL level 3 permissions to I/O port access but removes the STI/CLI
> permission which is coming with the hardware IOPL mechansim.
>
> Provide a config option to switch IOPL to emulation mode, make it the
> default and while at it also provide an option to disable IOPL completely.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
> arch/x86/Kconfig | 32 ++++++++++++++++
> arch/x86/include/asm/processor.h | 24 +++++++++---
> arch/x86/kernel/cpu/common.c | 4 +-
> arch/x86/kernel/ioport.c | 75 ++++++++++++++++++++++++++++++---------
> arch/x86/kernel/process.c | 15 +++++--
> 5 files changed, 123 insertions(+), 27 deletions(-)
>
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1254,6 +1254,38 @@ config X86_VSYSCALL_EMULATION
> Disabling this option saves about 7K of kernel size and
> possibly 4K of additional runtime pagetable memory.
>
> +choice
> + prompt "IOPL"
> + default X86_IOPL_EMULATION
> +
> +config X86_IOPL_EMULATION
> + bool "IOPL Emulation"
> + ---help---
> + Legacy IOPL support is an overbroad mechanism which allows user
> + space aside of accessing all 65536 I/O ports also to disable
> + interrupts. To gain this access the caller needs CAP_SYS_RAWIO
> + capabilities and permission from eventually active security
> + modules.
> +
> + The emulation restricts the functionality of the syscall to
> + only allowing the full range I/O port access, but prevents the
> + ability to disable interrupts from user space.
> +
> +config X86_IOPL_LEGACY
> + bool "IOPL Legacy"
> + ---help---
> + Allow the full IOPL permissions, i.e. user space access to all
> + 65536 I/O ports and also the ability to disable interrupts, which
> + is overbroad and can result in system lockups.
> +
> +config X86_IOPL_NONE
> + bool "IOPL None"
> + ---help---
> + Disable the IOPL permission syscall. That's the safest option as
> + no sane application should depend on this functionality.
> +
> +endchoice
> +
> config TOSHIBA
> tristate "Toshiba Laptop support"
> depends on X86_32
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -331,8 +331,12 @@ struct x86_hw_tss {
> #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
> #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
>
> -#define IO_BITMAP_OFFSET_VALID \
> - (offsetof(struct tss_struct, io_bitmap) - \
> +#define IO_BITMAP_OFFSET_VALID_MAP \
> + (offsetof(struct tss_struct, io_bitmap_map) - \
> + offsetof(struct tss_struct, x86_tss))
> +
> +#define IO_BITMAP_OFFSET_VALID_ALL \
> + (offsetof(struct tss_struct, io_bitmap_all) - \
> offsetof(struct tss_struct, x86_tss))
>
> /*
> @@ -343,7 +347,7 @@ struct x86_hw_tss {
> * last valid byte
> */
> #define __KERNEL_TSS_LIMIT \
> - (IO_BITMAP_OFFSET_VALID + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
> + (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
>
> /* Base offset outside of TSS_LIMIT so unpriviledged IO causes #GP */
> #define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1)
> @@ -379,7 +383,8 @@ struct tss_struct {
> * byte beyond the end of the I/O permission bitmap. The extra byte
> * must have all bits set and must be within the TSS limit.
> */
> - unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
> + unsigned long io_bitmap_map[IO_BITMAP_LONGS + 1];
> + unsigned long io_bitmap_all[IO_BITMAP_LONGS + 1];
> } __aligned(PAGE_SIZE);
>
> DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
> @@ -495,7 +500,6 @@ struct thread_struct {
> #endif
> /* IO permissions: */
> unsigned long *io_bitmap_ptr;
> - unsigned long iopl;
> /*
> * The byte range in the I/O permission bitmap which contains zero
> * bits.
> @@ -503,6 +507,13 @@ struct thread_struct {
> unsigned int io_zerobits_start;
> unsigned int io_zerobits_end;
>
> + /*
> + * IOPL. Priviledge level dependent I/O permission which includes
> + * user space CLI/STI when granted.
> + */
> + unsigned long iopl;
> + unsigned long iopl_emul;
> +
> mm_segment_t addr_limit;
>
> unsigned int sig_on_uaccess_err:1;
> @@ -524,6 +535,9 @@ static inline void arch_thread_struct_wh
> *size = fpu_kernel_xstate_size;
> }
>
> +extern void tss_update_io_bitmap(struct tss_struct *tss,
> + struct thread_struct *thread);
> +
> /*
> * Thread-synchronous status.
> *
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -1866,7 +1866,9 @@ void cpu_init(void)
> tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
> tss->io_zerobits_start = IO_BITMAP_BYTES;
> tss->io_zerobits_end = 0;
> - memset(tss->io_bitmap, 0xff, sizeof(tss->io_bitmap));
> + memset(tss->io_bitmap_map, 0xff, sizeof(tss->io_bitmap_map));
> + /* Invalidate the extra tail entry of the allow all I/O bitmap */
> + tss->io_bitmap_all[IO_BITMAP_LONGS] = ~0UL;
> set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
>
> load_TR_desc();
> --- a/arch/x86/kernel/ioport.c
> +++ b/arch/x86/kernel/ioport.c
> @@ -109,7 +109,7 @@ long ksys_ioperm(unsigned long from, uns
> }
>
> /* Copy the changed range over to the TSS bitmap */
> - dst = (char *)tss->io_bitmap;
> + dst = (char *)tss->io_bitmap_map;
> src = (char *)bitmap;
> memcpy(dst + copy_start, src + copy_start, copy_len);
>
> @@ -130,7 +130,7 @@ long ksys_ioperm(unsigned long from, uns
> * is correct. It might have been wreckaged by a VMEXiT.
> */
> set_thread_flag(TIF_IO_BITMAP);
> - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID;
> + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_VALID_MAP;
> refresh_tss_limit();
> }
>
> @@ -184,36 +184,77 @@ SYSCALL_DEFINE3(ioperm, unsigned long, f
> */
> SYSCALL_DEFINE1(iopl, unsigned int, level)
> {
> - struct pt_regs *regs = current_pt_regs();
> struct thread_struct *t = &current->thread;
> + struct pt_regs *regs = current_pt_regs();
> + unsigned int old;
>
> /*
> * Careful: the IOPL bits in regs->flags are undefined under Xen PV
> * and changing them has no effect.
> */
> - unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
> + if (IS_ENABLED(CONFIG_X86_IOPL_NONE))
> + return -ENOSYS;
>
> if (level > 3)
> return -EINVAL;
> +
> + if (IS_ENABLED(CONFIG_X86_IOPL_EMULATION))
> + old = t->iopl_emul;
> + else
> + old = t->iopl >> X86_EFLAGS_IOPL_BIT;
> +
> + /* No point in going further if nothing changes */
> + if (level == old)
> + return 0;
> +
> /* Trying to gain more privileges? */
> if (level > old) {
> if (!capable(CAP_SYS_RAWIO) ||
> security_locked_down(LOCKDOWN_IOPORT))
> return -EPERM;
> }
> - /*
> - * Change the flags value on the return stack, which has been set
> - * up on system-call entry. See also the fork and signal handling
> - * code how this is handled.
> - */
> - regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
> - (level << X86_EFLAGS_IOPL_BIT);
> - /* Store the new level in the thread struct */
> - t->iopl = level << X86_EFLAGS_IOPL_BIT;
> - /*
> - * X86_32 switches immediately and XEN handles it via emulation.
> - */
> - set_iopl_mask(t->iopl);
> +
> + if (IS_ENABLED(CONFIG_X86_IOPL_EMULATION)) {
> + struct tss_struct *tss;
> + unsigned int tss_base;
> +
> + /* Prevent racing against a task switch */
> + preempt_disable();
> + tss = this_cpu_ptr(&cpu_tss_rw);
> + if (level == 3) {
> + /* Grant access to all I/O ports */
> + set_thread_flag(TIF_IO_BITMAP);
> + tss_base = IO_BITMAP_OFFSET_VALID_ALL;

Where is the actual TSS updated?

> + } else if (t->io_bitmap_ptr) {
> + /* Thread has a I/O bitmap */
> + tss_update_io_bitmap(tss, t);
> + set_thread_flag(TIF_IO_BITMAP);
> + tss_base = IO_BITMAP_OFFSET_VALID_MAP;
> + } else {
> + /* Take it out of the context switch work burden */
> + clear_thread_flag(TIF_IO_BITMAP);
> + tss_base = IO_BITMAP_OFFSET_INVALID;

Ditto.

I think what you need to do is have a single function, called by
exit_thread(), switch_to(), and here, that updates the TSS to match a
given task's IO bitmap state. This is probably switch_to_bitmap() or
similar.

(Maybe it already is, but I swear I checked all the patches in the
series and I can't find the body of tss_update_io_bitmap(). But you
should call it in all branches of this if-else thing.)