[PATCH] tile: support KVM for tilegx

From: Chris Metcalf
Date: Mon Aug 12 2013 - 12:16:40 EST


This change provides the initial framework support for KVM on tilegx.
Basic virtual disk and networking are supported.

Signed-off-by: Chris Metcalf <cmetcalf@xxxxxxxxxx>
---
arch/tile/Kconfig | 19 +-
arch/tile/Makefile | 1 +
arch/tile/include/asm/io.h | 2 +
arch/tile/include/asm/kvm.h | 29 +
arch/tile/include/asm/kvm_host.h | 119 +++
arch/tile/include/asm/kvm_para.h | 20 +
arch/tile/include/asm/kvm_virtio.h | 26 +
arch/tile/include/asm/module.h | 9 +-
arch/tile/include/asm/page.h | 56 +-
arch/tile/include/asm/pgtable_32.h | 2 +-
arch/tile/include/asm/pgtable_64.h | 3 +-
arch/tile/include/asm/processor.h | 6 +-
arch/tile/include/asm/ptrace.h | 2 +-
arch/tile/include/asm/switch_to.h | 25 +-
arch/tile/include/asm/thread_info.h | 17 +-
arch/tile/include/asm/timex.h | 8 +
arch/tile/include/hv/hypervisor.h | 183 +++-
arch/tile/include/uapi/arch/sim.h | 19 +
arch/tile/include/uapi/arch/sim_def.h | 8 +
arch/tile/include/uapi/arch/spr_def_32.h | 15 +
arch/tile/include/uapi/arch/spr_def_64.h | 25 +
arch/tile/include/uapi/asm/Kbuild | 2 +
arch/tile/include/uapi/asm/kvm.h | 249 +++++
arch/tile/include/uapi/asm/kvm_virtio.h | 60 ++
arch/tile/kernel/Makefile | 1 +
arch/tile/kernel/asm-offsets.c | 7 +
arch/tile/kernel/early_printk.c | 17 +
arch/tile/kernel/head_32.S | 4 +-
arch/tile/kernel/head_64.S | 6 +-
arch/tile/kernel/hvglue.S | 8 +-
arch/tile/kernel/hvglue_trace.c | 14 +
arch/tile/kernel/intvec_32.S | 18 +-
arch/tile/kernel/intvec_64.S | 226 +++--
arch/tile/kernel/kvm_virtio.c | 430 ++++++++
arch/tile/kernel/process.c | 40 +-
arch/tile/kernel/relocate_kernel_64.S | 9 +-
arch/tile/kernel/setup.c | 21 +-
arch/tile/kernel/smp.c | 28 +-
arch/tile/kernel/stack.c | 2 +-
arch/tile/kernel/sysfs.c | 4 +
arch/tile/kernel/time.c | 14 +-
arch/tile/kernel/traps.c | 2 +-
arch/tile/kernel/vmlinux.lds.S | 10 +-
arch/tile/kvm/Kconfig | 3 -
arch/tile/kvm/Makefile | 12 +
arch/tile/kvm/entry.S | 91 ++
arch/tile/kvm/kvm-tile.c | 1585 ++++++++++++++++++++++++++++++
arch/tile/lib/exports.c | 20 +-
arch/tile/mm/elf.c | 2 +
arch/tile/mm/fault.c | 4 +-
arch/tile/mm/init.c | 8 +-
arch/tile/mm/pgtable.c | 35 +-
include/uapi/linux/kvm.h | 3 +
virt/kvm/kvm_main.c | 7 +-
54 files changed, 3338 insertions(+), 198 deletions(-)
create mode 100644 arch/tile/include/asm/kvm.h
create mode 100644 arch/tile/include/asm/kvm_host.h
create mode 100644 arch/tile/include/asm/kvm_para.h
create mode 100644 arch/tile/include/asm/kvm_virtio.h
create mode 100644 arch/tile/include/uapi/asm/kvm.h
create mode 100644 arch/tile/include/uapi/asm/kvm_virtio.h
create mode 100644 arch/tile/kernel/kvm_virtio.c
create mode 100644 arch/tile/kvm/Makefile
create mode 100644 arch/tile/kvm/entry.S
create mode 100644 arch/tile/kvm/kvm-tile.c

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index ecff467..bbb6d51 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -5,7 +5,6 @@ config TILE
def_bool y
select HAVE_DMA_ATTRS
select HAVE_DMA_API_DEBUG
- select HAVE_KVM if !TILEGX
select GENERIC_FIND_FIRST_BIT
select SYSCTL_EXCEPTION_TRACE
select USE_GENERIC_SMP_HELPERS
@@ -113,6 +112,7 @@ config SMP
def_bool y

config HVC_TILE
+ depends on !KVM_GUEST
depends on TTY
select HVC_DRIVER
select HVC_IRQ if TILEGX
@@ -127,6 +127,7 @@ config TILEGX
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KPROBES
select HAVE_KRETPROBES
+ select HAVE_KVM if !KVM_GUEST

config TILEPRO
def_bool !TILEGX
@@ -366,11 +367,23 @@ config HARDWALL
bool "Hardwall support to allow access to user dynamic network"
default y

+config KVM_GUEST
+ bool "Build kernel as guest for KVM"
+ default n
+ depends on TILEGX
+ select VIRTIO
+ select VIRTIO_RING
+ select VIRTIO_CONSOLE
+ ---help---
+ This will build a kernel that runs at a lower protection level
+ than the default kernel and is suitable to run under KVM.
+
+# TILEPro kernels run at PL1; TILE-Gx runs at PL2 unless it's a KVM guest.
config KERNEL_PL
int "Processor protection level for kernel"
range 1 2
- default 2 if TILEGX
- default 1 if !TILEGX
+ default 2 if TILEGX && !KVM_GUEST
+ default 1 if !TILEGX || KVM_GUEST
---help---
Since MDE 4.2, the Tilera hypervisor runs the kernel
at PL2 by default. If running under an older hypervisor,
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 3d15364..8e7f852 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -62,6 +62,7 @@ libs-y += $(LIBGCC_PATH)

# See arch/tile/Kbuild for content of core part of the kernel
core-y += arch/tile/
+core-$(CONFIG_KVM) += arch/tile/kvm/

core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/

diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 9fe4349..023659b 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -43,6 +43,8 @@
* long before casting it to a pointer to avoid compiler warnings.
*/
#if CHIP_HAS_MMIO()
+extern void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags, pgprot_t prot);
extern void __iomem *ioremap(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
pgprot_t pgprot);
diff --git a/arch/tile/include/asm/kvm.h b/arch/tile/include/asm/kvm.h
new file mode 100644
index 0000000..2ea6c41
--- /dev/null
+++ b/arch/tile/include/asm/kvm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_H
+#define _ASM_TILE_KVM_H
+
+#include <hv/hypervisor.h>
+#include <uapi/asm/kvm.h>
+
+#ifndef __ASSEMBLER__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+#define USER_EMULATE(name) [HV_SYS_##name] = kvm_deliver_to_user,
+#define NO_EMULATE(name) [HV_SYS_##name] = kvm_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = kvm_deliver_to_user,
+#endif
+#endif /* _ASM_TILE_KVM_H */
diff --git a/arch/tile/include/asm/kvm_host.h b/arch/tile/include/asm/kvm_host.h
new file mode 100644
index 0000000..8241f50
--- /dev/null
+++ b/arch/tile/include/asm/kvm_host.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _ASM_TILE_KVM_HOST_H
+#define _ASM_TILE_KVM_HOST_H
+
+#define KVM_MAX_VCPUS 64
+#define KVM_USER_MEM_SLOTS 32
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+/* For now, claim we have no huge pages. */
+#define KVM_HPAGE_GFN_SHIFT(x) 0
+#define KVM_NR_PAGE_SIZES 1
+#define KVM_PAGES_PER_HPAGE(x) 1
+
+/* Max number of message tags for hv_send/receive_message() */
+#define MAX_MSG_TAG (sizeof(unsigned long) * 8)
+
+/* Bits in pending_downcalls */
+#define DOWNCALL_MESSAGE_RCV 0x01 /**< Message receive */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+struct kvm_vcpu_stat {
+ /* None yet. */
+};
+
+struct kvm_vcpu_arch {
+ struct pt_regs regs;
+ unsigned long host_sp; /* Host "real" sp during vmresume. */
+ HV_Context guest_context;
+ unsigned long pending_msgs; /* Pending guest messages */
+ unsigned long ipi_events; /* Pending guest ipi events. */
+ unsigned long ipi_gpa; /* pa for hv_get_ipi_pte() */
+ pte_t ipi_gpte; /* pte for hv_get_ipi_pte() */
+ unsigned long fault_addr; /* addr for VPGTABLE_MISS faults */
+ int suspended; /* true for cores not yet started by host */
+ unsigned long timer_control; /* AUX_TILE_TIMER_CONTROL value */
+ unsigned long vmexit_cycles; /* cycle count of last vmexit */
+
+#define FOR_EACH_GUEST_SPR(f) \
+ f(INTERRUPT_MASK_1); \
+ f(INTERRUPT_VECTOR_BASE_1); \
+ f(EX_CONTEXT_1_0); \
+ f(EX_CONTEXT_1_1); \
+ f(SYSTEM_SAVE_1_0); \
+ f(SYSTEM_SAVE_1_1); \
+ f(SYSTEM_SAVE_1_2); \
+ f(SYSTEM_SAVE_1_3); \
+ f(INTCTRL_1_STATUS); \
+ f(IPI_MASK_1); \
+ f(IPI_EVENT_1); \
+ f(SINGLE_STEP_CONTROL_1); \
+ f(SINGLE_STEP_EN_1_1); \
+
+#define DECLARE_SPR(f) unsigned long f
+ FOR_EACH_GUEST_SPR(DECLARE_SPR)
+#undef DECLARE_SPR
+};
+
+struct kvm_vm_stat {
+ /*
+ * FIXME - does this make sense for us? It's used in common KVM
+ * code.
+ */
+ u32 remote_tlb_flush;
+};
+
+struct kvm_arch_memory_slot {
+};
+
+struct kvm_arch {
+ pgd_t *vpgd;
+ unsigned long resv_gpa_start; /* For special purpose. */
+ struct completion smp_start;
+};
+
+struct kvm_vcpu;
+
+extern void kvm_vmresume(struct pt_regs *guest,
+ unsigned long *host_sp_ptr);
+extern void kvm_vmexit(unsigned long host_sp);
+extern void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason);
+extern void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num);
+extern void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+ unsigned long, unsigned long);
+extern void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num);
+
+extern void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address)
+
+#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud))
+
+#define gpmd_offset(kvm, pud, address) \
+ ((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address))
+
+#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd))
+
+#define gpte_offset_kernel(kvm, pmd, address) \
+ ((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address))
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_TILE_KVM_HOST_H */
diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h
new file mode 100644
index 0000000..c8c31d5
--- /dev/null
+++ b/arch/tile/include/asm/kvm_para.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_PARA_H
+#define _ASM_TILE_KVM_PARA_H
+
+#include <uapi/asm/kvm_para.h>
+
+int hcall_virtio(unsigned long instrument, unsigned long mem);
+#endif /* _ASM_TILE_KVM_PARA_H */
diff --git a/arch/tile/include/asm/kvm_virtio.h b/arch/tile/include/asm/kvm_virtio.h
new file mode 100644
index 0000000..8faa959
--- /dev/null
+++ b/arch/tile/include/asm/kvm_virtio.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_VIRTIO_H
+#define _ASM_TILE_KVM_VIRTIO_H
+
+#include <uapi/asm/kvm_virtio.h>
+
+
+struct kvm_device {
+ struct virtio_device vdev;
+ struct kvm_device_desc *desc;
+ unsigned long desc_pa;
+};
+
+#endif /* _ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
index 44ed07c..927c97f 100644
--- a/arch/tile/include/asm/module.h
+++ b/arch/tile/include/asm/module.h
@@ -28,6 +28,13 @@
# define MODULE_PGSZ ""
#endif

+/* Tag guest Linux, since it uses different SPRs, etc. */
+#if CONFIG_KERNEL_PL == 2
+#define MODULE_PL ""
+#else
+#define MODULE_PL " guest"
+#endif
+
/* We don't really support no-SMP so tag if someone tries. */
#ifdef CONFIG_SMP
#define MODULE_NOSMP ""
@@ -35,6 +42,6 @@
#define MODULE_NOSMP " nosmp"
#endif

-#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_PL MODULE_NOSMP

#endif /* _ASM_TILE_MODULE_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index b4f96c0..65ee752 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -148,8 +148,17 @@ static inline __attribute_const__ int get_order(unsigned long size)
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif

+#ifdef CONFIG_KVM_GUEST
+/* Paravirtualized guests get half the VA, and thus half the PA. */
+#define MAX_PA_WIDTH (CHIP_PA_WIDTH() - 1)
+#define MAX_VA_WIDTH (CHIP_VA_WIDTH() - 1)
+#else
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+#endif
+
/* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
#define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
#define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
#define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -160,7 +169,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
* We reserve the lower half of memory for user-space programs, and the
* upper half for system code. We re-map all of physical memory in the
* upper half, which takes a quarter of our VA space. Then we have
- * the vmalloc regions. The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions. The supervisor code sits below KERNEL_HIGH_VADDR,
* with the hypervisor above that.
*
* Loadable kernel modules are placed immediately after the static
@@ -172,26 +181,25 @@ static inline __attribute_const__ int get_order(unsigned long size)
* Similarly, for now we don't play any struct page mapping games.
*/

-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
# error Too much PA to map with the VA available!
#endif
-#define HALF_VA_SPACE (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))

-#define MEM_LOW_END (HALF_VA_SPACE - 1) /* low half */
-#define MEM_HIGH_START (-HALF_VA_SPACE) /* high half */
-#define PAGE_OFFSET MEM_HIGH_START
-#define FIXADDR_BASE _AC(0xfffffff400000000, UL) /* 4 GB */
-#define FIXADDR_TOP _AC(0xfffffff500000000, UL) /* 4 GB */
+#ifdef CONFIG_KVM_GUEST
+#define PAGE_OFFSET (_AC(1, UL) << (MAX_VA_WIDTH - 1))
+#define KERNEL_HIGH_VADDR (_AC(1, UL) << MAX_VA_WIDTH)
+#else
+#define PAGE_OFFSET (-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR _AC(0xfffffff800000000, UL) /* high 32GB */
+#endif
+
+#define FIXADDR_BASE (KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
+#define FIXADDR_TOP (KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
#define _VMALLOC_START FIXADDR_TOP
-#define HUGE_VMAP_BASE _AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START _AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT MEM_SV_START
-#define MEM_MODULE_START _AC(0xfffffff710000000, UL) /* 256 MB */
+#define HUGE_VMAP_BASE (KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define MEM_SV_START (KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START (MEM_SV_START + (256*1024*1024)) /* 256 MB */
#define MEM_MODULE_END (MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START _AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR MEM_SV_START

#else /* !__tilegx__ */

@@ -213,8 +221,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
* values, and after that, we show "typical" values, since the actual
* addresses depend on kernel #defines.
*
- * MEM_HV_INTRPT 0xfe000000
- * MEM_SV_INTRPT (kernel code) 0xfd000000
+ * MEM_HV_START 0xfe000000
+ * MEM_SV_START (kernel code) 0xfd000000
* MEM_USER_INTRPT (user vector) 0xfc000000
* FIX_KMAP_xxx 0xf8000000 (via NR_CPUS * KM_TYPE_NR)
* PKMAP_BASE 0xf7000000 (via LAST_PKMAP)
@@ -224,14 +232,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
*/

#define MEM_USER_INTRPT _AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT _AC(0xfd000000, UL)
-#define MEM_HV_INTRPT _AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT _AC(0xfd000000, UL)
-#define MEM_SV_INTRPT _AC(0xfe000000, UL)
-#define MEM_HV_INTRPT _AC(0xff000000, UL)
-#endif
+#define MEM_SV_START _AC(0xfd000000, UL)
+#define MEM_HV_START _AC(0xfe000000, UL)

#define INTRPT_SIZE 0x4000

diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index e5bdc0e..63142ab 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -89,7 +89,7 @@ static inline int pud_huge_page(pud_t pud) { return 0; }
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_INTRPT;
+ return addr >= MEM_HV_START;
}

/*
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index 7cb8d35..3421177 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -140,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_START ||
- (addr > MEM_LOW_END && addr < MEM_HIGH_START);
+ return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
}

/*
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 230b830..5aa5431 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
#ifndef _ASM_TILE_PROCESSOR_H
#define _ASM_TILE_PROCESSOR_H

+#include <arch/chip.h>
+
#ifndef __ASSEMBLY__

/*
@@ -25,7 +27,6 @@
#include <asm/ptrace.h>
#include <asm/percpu.h>

-#include <arch/chip.h>
#include <arch/spr_def.h>

struct task_struct;
@@ -167,7 +168,7 @@ struct thread_struct {
#ifndef __ASSEMBLY__

#ifdef __tilegx__
-#define TASK_SIZE_MAX (MEM_LOW_END + 1)
+#define TASK_SIZE_MAX (_AC(1, UL) << (MAX_VA_WIDTH - 1))
#else
#define TASK_SIZE_MAX PAGE_OFFSET
#endif
@@ -347,7 +348,6 @@ extern int kdata_huge;

/*
* Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
*/
#define USER_PL 0
#if CONFIG_KERNEL_PL == 2
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index 0d25c21..b9620c0 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t;
#define user_stack_pointer(regs) ((regs)->sp)

/* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)

/* Fill in a struct pt_regs with the current kernel registers. */
struct pt_regs *get_pt_regs(struct pt_regs *);
diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
index b8f888c..8e9150f 100644
--- a/arch/tile/include/asm/switch_to.h
+++ b/arch/tile/include/asm/switch_to.h
@@ -50,16 +50,31 @@ extern struct task_struct *__switch_to(struct task_struct *prev,
extern unsigned long get_switch_to_pc(void);

/*
+ * Normally we notify the simulator whenever we change from one pid
+ * to another, so it can track symbol files appropriately on the fly.
+ * For now, we don't do this for the guest Linux, since we don't
+ * have a way to tell the simulator that we are entering a separate
+ * pid space when we are in the guest.
+ */
+#ifdef CONFIG_KVM_GUEST
+#define notify_sim_task_change(prev) do { } while (0)
+#else
+#define notify_sim_task_change(prev) do { \
+ if (unlikely((prev)->state == TASK_DEAD)) \
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
+ ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
+ (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+} while (0)
+#endif
+
+/*
* Kernel threads can check to see if they need to migrate their
* stack whenever they return from a context switch; for user
* threads, we defer until they are returning to user-space.
*/
#define finish_arch_switch(prev) do { \
- if (unlikely((prev)->state == TASK_DEAD)) \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
- ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
- (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+ notify_sim_task_change(prev); \
if (current->mm == NULL && !kstack_hash && \
current_thread_info()->homecache_cpu != smp_processor_id()) \
homecache_migrate_kthread(); \
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index b8aa6df..1c26cdf 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -18,7 +18,9 @@

#include <asm/processor.h>
#include <asm/page.h>
+
#ifndef __ASSEMBLY__
+struct kvm_vcpu;

/*
* Low level task data that assembly code needs immediate access to.
@@ -44,6 +46,9 @@ struct thread_info {
unsigned long unalign_jit_tmp[4]; /* temp r0..r3 storage */
void __user *unalign_jit_base; /* unalign fixup JIT base */
#endif
+#ifdef CONFIG_KVM
+ struct kvm_vcpu *vcpu; /* vcpu during vmresume */
+#endif
};

/*
@@ -117,8 +122,8 @@ extern void _cpu_idle(void);

/*
* Thread information flags that various assembly files may need to access.
- * Keep flags accessed frequently in low bits, particular since it makes
- * it easier to build constants in assembly.
+ * Keep flags accessed frequently in low bits, since it makes it
+ * easier to build constants in assembly.
*/
#define TIF_SIGPENDING 0 /* signal pending */
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
@@ -131,6 +136,7 @@ extern void _cpu_idle(void);
#define TIF_MEMDIE 7 /* OOM killer at work */
#define TIF_NOTIFY_RESUME 8 /* callback before returning to user */
#define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */
+#define TIF_VIRT_EXIT 10 /* force exit of task in vmresume */

#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
@@ -142,11 +148,12 @@ extern void _cpu_idle(void);
#define _TIF_MEMDIE (1<<TIF_MEMDIE)
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
+#define _TIF_VIRT_EXIT (1<<TIF_VIRT_EXIT)

/* Work to do on any return to user space. */
-#define _TIF_ALLWORK_MASK \
- (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
- _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
+#define _TIF_ALLWORK_MASK \
+ (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP| \
+ _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME|_TIF_VIRT_EXIT)

/* Work to do at syscall entry. */
#define _TIF_SYSCALL_ENTRY_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
diff --git a/arch/tile/include/asm/timex.h b/arch/tile/include/asm/timex.h
index edbd7e4..0417617 100644
--- a/arch/tile/include/asm/timex.h
+++ b/arch/tile/include/asm/timex.h
@@ -27,6 +27,14 @@

typedef unsigned long long cycles_t;

+#ifdef CONFIG_KVM_GUEST
+#define INT_LINUX_TIMER INT_AUX_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_AUX_TILE_TIMER_CONTROL
+#else
+#define INT_LINUX_TIMER INT_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_TILE_TIMER_CONTROL
+#endif
+
#if CHIP_HAS_SPLIT_CYCLE()
cycles_t get_cycles(void);
#define get_cycles_low() __insn_mfspr(SPR_CYCLE_LOW)
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index f71b08e..71abe38 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -321,6 +321,18 @@
/** hv_set_speed */
#define HV_DISPATCH_SET_SPEED 58

+/** hv_install_virt_context */
+#define HV_DISPATCH_INSTALL_VIRT_CONTEXT 59
+
+/** hv_inquire_virt_context */
+#define HV_DISPATCH_INQUIRE_VIRT_CONTEXT 60
+
+/** hv_install_guest_context */
+#define HV_DISPATCH_INSTALL_GUEST_CONTEXT 61
+
+/** hv_inquire_guest_context */
+#define HV_DISPATCH_INQUIRE_GUEST_CONTEXT 62
+
/** hv_console_set_ipi */
#define HV_DISPATCH_CONSOLE_SET_IPI 63

@@ -783,12 +795,15 @@ HV_SetSpeed hv_set_speed(unsigned long speed, __hv64 start_cycle,
* new page table does not need to contain any mapping for the
* hv_install_context address itself.
*
- * At most one HV_CTX_PG_SM_* flag may be specified in "flags";
+ * At most one HV_CTX_PG_SM_* flag may be specified in the flags argument;
* if multiple flags are specified, HV_EINVAL is returned.
* Specifying none of the flags results in using the default page size.
* All cores participating in a given client must request the same
* page size, or the results are undefined.
*
+ * To disable an installed page table, install HV_CTX_NONE. The access
+ * and asid fields are ignored.
+ *
* @param page_table Root of the page table.
* @param access PTE providing info on how to read the page table. This
* value must be consistent between multiple tiles sharing a page table,
@@ -804,16 +819,101 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,

#endif /* !__ASSEMBLER__ */

+#define HV_CTX_NONE ((HV_PhysAddr)-1) /**< Disable page table. */
+
#define HV_CTX_DIRECTIO 0x1 /**< Direct I/O requests are accepted from
PL0. */

+#define HV_CTX_GUEST_CACHE 0x4 /**< Let guest control caching flags (only
+ usable with hv_install_virt_context.) */
+
#define HV_CTX_PG_SM_4K 0x10 /**< Use 4K small pages, if available. */
#define HV_CTX_PG_SM_16K 0x20 /**< Use 16K small pages, if available. */
#define HV_CTX_PG_SM_64K 0x40 /**< Use 64K small pages, if available. */
#define HV_CTX_PG_SM_MASK 0xf0 /**< Mask of all possible small pages. */

+
#ifndef __ASSEMBLER__

+/** Install a virtualization context.
+ *
+ * When a virtualization context is installed, all faults from PL0 or
+ * PL1 are handled via a "guest context" and then post-processed by
+ * the "virtualization context"; faults at PL2 are still handled by
+ * the normal context. For guest faults, the "guest PAs" produced by
+ * the guest page table are passed through the virtualization page
+ * table as pseudo-VAs, generating the true CPA as a result. See the
+ * individual HV_PTE_xxx bits for the effect the bits have when
+ * present in the virtualization page table. The ASID is currently
+ * ignored in this syscall, but it might be used later, so the API
+ * includes it. The HV_CTX_GUEST_CACHE flag indicates that all
+ * cache-related flags should be taken from the primary page table,
+ * not the virtualization page table.
+ *
+ * Once the virtualization context is installed, a guest context
+ * should also be installed; otherwise a VA-equals-PA context will be
+ * used for accesses at PL 0 or 1, i.e. VAs will be passed directly to
+ * the virtualization context to generate CPAs.
+ *
+ * When entering client PL after being at guest or user PL, the
+ * client is expected to call hv_flush_all() to clear any TLB mappings
+ * that might otherwise conflict. Similarly, hv_flush_all() should
+ * be called before returning to guest or user PL with a virtualization
+ * context installed, so that any TLB mappings are cleared. Future
+ * work may include adding a "vpid" or similar namespace so that
+ * the TLBs may be managed independently.
+ *
+ * Subsequent guest page table installations will have their root PA
+ * and PTE cached after translating through the virtualization
+ * context, so if entries in the virtualization page table are
+ * modified or removed, the guest context should be re-installed.
+ * This, in conjunction with flushing the TLB on return to the guest,
+ * will ensure that the new virtualization entries are honored.
+ *
+ * @param page_table Root of the page table.
+ * @param access PTE providing info on how to read the page table. This
+ * value must be consistent between multiple tiles sharing a page table,
+ * and must also be consistent with any virtual mappings the client
+ * may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for (currently ignored).
+ * @param flags Context flags, denoting attributes or privileges of the
+ * current virtualization context (see below).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_virt_context(HV_PhysAddr page_table, HV_PTE access,
+ HV_ASID asid, __hv32 flags);
+
+
+
+/** Install a guest context.
+ *
+ * The guest context is only consulted when a virtualization context
+ * is also installed, and only for faults that occur below the client's
+ * PL. If no guest context is installed in that situation, a VA=PA
+ * context is used instead.
+ *
+ * The access PTE will only be honored if the virtualization table was
+ * installed with HV_CTX_GUEST_CACHE.
+ *
+ * A virtualization context must already be installed prior to
+ * installing the guest context.
+ *
+ * @param page_table Root of the page table; the value is the guest's
+ * physical address (GPA), not a CPA.
+ * @param access PTE providing info on how to read the page table. This
+ * value must be consistent between multiple tiles sharing a page table,
+ * and must also be consistent with any virtual mappings the client
+ * may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for.
+ * @param flags Context flags, denoting attributes or privileges of the
+ * current context (HV_CTX_xxx).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_guest_context(HV_PhysAddr page_table, HV_PTE access,
+ HV_ASID asid, __hv32 flags);
+

/** Set the number of pages ganged together by HV_PTE_SUPER at a
* particular level of the page table.
@@ -823,7 +923,7 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
* "super" page size must be less than the span of the next level in
* the page table. The largest size that can be requested is 64GB.
*
- * The shift value is initially "0" for all page table levels,
+ * The shift value is initially 0 for all page table levels,
* indicating that the HV_PTE_SUPER bit is effectively ignored.
*
* If you change the count from one non-zero value to another, the
@@ -854,11 +954,26 @@ typedef struct
} HV_Context;

/** Retrieve information about the currently installed context.
- * @return The data passed to the last successful hv_install_context call.
+ * @return The data passed to the last successful call to
+ * hv_install_context().
*/
HV_Context hv_inquire_context(void);


+/** Retrieve information about the currently installed virtualization context.
+ * @return The data passed to the last successful call to
+ * hv_install_virt_context().
+ */
+HV_Context hv_inquire_virt_context(void);
+
+
+/** Retrieve information about the currently installed guest context.
+ * @return The data passed to the last successful call to
+ * hv_install_guest_context().
+ */
+HV_Context hv_inquire_guest_context(void);
+
+
/** Flushes all translations associated with the named address space
* identifier from the TLB and any other hypervisor data structures.
* Translations installed with the "global" bit are not flushed.
@@ -917,7 +1032,7 @@ int hv_flush_pages(HV_VirtAddr start, HV_PageSize page_size,
/** Flushes all non-global translations (if preserve_global is true),
* or absolutely all translations (if preserve_global is false).
*
- * @param preserve_global Non-zero if we want to preserve "global" mappings.
+ * @param preserve_global Non-zero if we want to preserve global mappings.
* @return Zero on success, or a hypervisor error code on failure.
*/
int hv_flush_all(int preserve_global);
@@ -991,7 +1106,11 @@ typedef enum {
HV_INQ_TILES_HFH_CACHE = 2,

/** The set of tiles that can be legally used as a LOTAR for a PTE. */
- HV_INQ_TILES_LOTAR = 3
+ HV_INQ_TILES_LOTAR = 3,
+
+ /** The set of "shared" driver tiles that the hypervisor may
+ * periodically interrupt. */
+ HV_INQ_TILES_SHARED = 4
} HV_InqTileSet;

/** Returns specific information about various sets of tiles within the
@@ -1271,14 +1390,21 @@ void hv_downcall_dispatch(void);
*/
/** Message receive downcall interrupt vector */
#define INT_MESSAGE_RCV_DWNCL INT_BOOT_ACCESS
+/** Device interrupt downcall interrupt vector */
+#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
+#ifdef __tilegx__
+/** Virtualization page table miss downcall interrupt vector */
+#define INT_VPGTABLE_MISS_DWNCL INT_I_ASID
+/** Virtualization guest illegal page table downcall interrupt vector */
+#define INT_VGUEST_FATAL_DWNCL INT_D_ASID
+#else
/** DMA TLB miss downcall interrupt vector */
#define INT_DMATLB_MISS_DWNCL INT_DMA_ASID
-/** Static nework processor instruction TLB miss interrupt vector */
-#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
/** DMA TLB access violation downcall interrupt vector */
#define INT_DMATLB_ACCESS_DWNCL INT_DMA_CPL
-/** Device interrupt downcall interrupt vector */
-#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
+/** Static network processor instruction TLB miss interrupt vector */
+#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
+#endif

#ifndef __ASSEMBLER__

@@ -2041,8 +2167,16 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
#define HV_PTE_PTFN_BITS 29 /**< Number of bits in a PTFN */

/*
- * Legal values for the PTE's mode field
+ * Legal values for the PTE's mode field.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
+ * Note that if HV_CTX_GUEST_CACHE is not set, guests will only be able
+ * to access MMIO resources via pseudo PAs that map to MMIO in the
+ * virtualization page table.
*/
+
/** Data is not resident in any caches; loads and stores access memory
* directly.
*/
@@ -2161,6 +2295,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the primary page table if a virtualization
+ * page table is installed.
*/
#define HV_PTE_GLOBAL (__HV_PTE_ONE << HV_PTE_INDEX_GLOBAL)

@@ -2174,6 +2310,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the virtualization page table.
*/
#define HV_PTE_USER (__HV_PTE_ONE << HV_PTE_INDEX_USER)

@@ -2185,7 +2322,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* has been cleared, subsequent references are not guaranteed to set
* it again until the translation has been flushed from the TLB.
*
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
*/
#define HV_PTE_ACCESSED (__HV_PTE_ONE << HV_PTE_INDEX_ACCESSED)

@@ -2197,7 +2334,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* has been cleared, subsequent references are not guaranteed to set
* it again until the translation has been flushed from the TLB.
*
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
*/
#define HV_PTE_DIRTY (__HV_PTE_ONE << HV_PTE_INDEX_DIRTY)

@@ -2239,6 +2376,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit determines how the
* level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NC (__HV_PTE_ONE << HV_PTE_INDEX_NC)

@@ -2252,6 +2393,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit
* determines how the level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NO_ALLOC_L1 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L1)

@@ -2265,6 +2410,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit determines how the
* level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NO_ALLOC_L2 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L2)

@@ -2284,6 +2433,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* the page map directly to memory.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_CACHED_PRIORITY (__HV_PTE_ONE << \
HV_PTE_INDEX_CACHED_PRIORITY)
@@ -2297,6 +2450,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* It is illegal for this bit to be clear if the Writable bit is set.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Readable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_READABLE (__HV_PTE_ONE << HV_PTE_INDEX_READABLE)

@@ -2307,6 +2462,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* PTE.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Writable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_WRITABLE (__HV_PTE_ONE << HV_PTE_INDEX_WRITABLE)

@@ -2319,6 +2476,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* than one.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Executable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_EXECUTABLE (__HV_PTE_ONE << HV_PTE_INDEX_EXECUTABLE)

diff --git a/arch/tile/include/uapi/arch/sim.h b/arch/tile/include/uapi/arch/sim.h
index e54b7b0..36fb24c 100644
--- a/arch/tile/include/uapi/arch/sim.h
+++ b/arch/tile/include/uapi/arch/sim.h
@@ -611,6 +611,25 @@ sim_profiler_chip_clear(unsigned int mask)
__insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask));
}

+/**
+ * Set vCPU number for a given task.
+ * @param vcpu Virtual cpu to set.
+ */
+static __inline void
+sim_set_vcpu(int vcpu)
+{
+ __insn_mtspr(SPR_SIM_CONTROL,
+ SIM_CONTROL_VCPU | (vcpu << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+/** Clear a task's vCPU tag: vcpu -1, built without shifting a negative. */
+static __inline void
+sim_clear_vcpu(void)
+{
+ __insn_mtspr(SPR_SIM_CONTROL,
+ SIM_CONTROL_VCPU | (-(1 << _SIM_CONTROL_OPERATOR_BITS)));
+}
+

/*
* Event support.
diff --git a/arch/tile/include/uapi/arch/sim_def.h b/arch/tile/include/uapi/arch/sim_def.h
index 4b44a2b..b9aad66 100644
--- a/arch/tile/include/uapi/arch/sim_def.h
+++ b/arch/tile/include/uapi/arch/sim_def.h
@@ -221,6 +221,14 @@
*/
#define SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE 36

+/**
+ * If written to SPR_SIM_CONTROL, combined with a signed virtual cpu
+ * number shifted by 8, will tag any identification of the cpu that the
+ * task is running on with the given virtual cpu number. If the
+ * virtual cpu number is -1, the tag is removed.
+ */
+#define SIM_CONTROL_VCPU 37
+

/*
* Syscall numbers for use with "sim_syscall()".
diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
index c689446..4644c8d 100644
--- a/arch/tile/include/uapi/arch/spr_def_32.h
+++ b/arch/tile/include/uapi/arch/spr_def_32.h
@@ -121,6 +121,9 @@
#define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
#define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
#define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
+#define SPR_MPL_GPV_SET_0 0x0600
+#define SPR_MPL_GPV_SET_1 0x0601
+#define SPR_MPL_GPV_SET_2 0x0602
#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -142,6 +145,9 @@
#define SPR_MPL_IDN_TIMER_SET_0 0x3400
#define SPR_MPL_IDN_TIMER_SET_1 0x3401
#define SPR_MPL_IDN_TIMER_SET_2 0x3402
+#define SPR_MPL_ILL_SET_0 0x0400
+#define SPR_MPL_ILL_SET_1 0x0401
+#define SPR_MPL_ILL_SET_2 0x0402
#define SPR_MPL_INTCTRL_0_SET_0 0x4a00
#define SPR_MPL_INTCTRL_0_SET_1 0x4a01
#define SPR_MPL_INTCTRL_0_SET_2 0x4a02
@@ -166,6 +172,12 @@
#define SPR_MPL_SN_NOTIFY_SET_0 0x2a00
#define SPR_MPL_SN_NOTIFY_SET_1 0x2a01
#define SPR_MPL_SN_NOTIFY_SET_2 0x2a02
+#define SPR_MPL_SWINT_0_SET_0 0x1c00
+#define SPR_MPL_SWINT_0_SET_1 0x1c01
+#define SPR_MPL_SWINT_0_SET_2 0x1c02
+#define SPR_MPL_SWINT_1_SET_0 0x1a00
+#define SPR_MPL_SWINT_1_SET_1 0x1a01
+#define SPR_MPL_SWINT_1_SET_2 0x1a02
#define SPR_MPL_UDN_ACCESS_SET_0 0x0c00
#define SPR_MPL_UDN_ACCESS_SET_1 0x0c01
#define SPR_MPL_UDN_ACCESS_SET_2 0x0c02
@@ -187,6 +199,9 @@
#define SPR_MPL_UDN_TIMER_SET_0 0x3600
#define SPR_MPL_UDN_TIMER_SET_1 0x3601
#define SPR_MPL_UDN_TIMER_SET_2 0x3602
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1e00
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1e01
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1e02
#define SPR_MPL_WORLD_ACCESS_SET_0 0x4e00
#define SPR_MPL_WORLD_ACCESS_SET_1 0x4e01
#define SPR_MPL_WORLD_ACCESS_SET_2 0x4e02
diff --git a/arch/tile/include/uapi/arch/spr_def_64.h b/arch/tile/include/uapi/arch/spr_def_64.h
index 67a6c17..727cda7 100644
--- a/arch/tile/include/uapi/arch/spr_def_64.h
+++ b/arch/tile/include/uapi/arch/spr_def_64.h
@@ -21,6 +21,10 @@
#define SPR_AUX_PERF_COUNT_1 0x2106
#define SPR_AUX_PERF_COUNT_CTL 0x2107
#define SPR_AUX_PERF_COUNT_STS 0x2108
+#define SPR_AUX_TILE_TIMER_CONTROL 0x1705
+#define SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK 0xffffffff
+#define SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT 62
+#define SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT 63
#define SPR_CMPEXCH_VALUE 0x2780
#define SPR_CYCLE 0x2781
#define SPR_DONE 0x2705
@@ -101,6 +105,9 @@
#define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
#define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
#define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
+#define SPR_MPL_GPV_SET_0 0x0900
+#define SPR_MPL_GPV_SET_1 0x0901
+#define SPR_MPL_GPV_SET_2 0x0902
#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -116,6 +123,12 @@
#define SPR_MPL_IDN_TIMER_SET_0 0x1800
#define SPR_MPL_IDN_TIMER_SET_1 0x1801
#define SPR_MPL_IDN_TIMER_SET_2 0x1802
+#define SPR_MPL_ILL_SET_0 0x0800
+#define SPR_MPL_ILL_SET_1 0x0801
+#define SPR_MPL_ILL_SET_2 0x0802
+#define SPR_MPL_ILL_TRANS_SET_0 0x1000
+#define SPR_MPL_ILL_TRANS_SET_1 0x1001
+#define SPR_MPL_ILL_TRANS_SET_2 0x1002
#define SPR_MPL_INTCTRL_0_SET_0 0x2500
#define SPR_MPL_INTCTRL_0_SET_1 0x2501
#define SPR_MPL_INTCTRL_0_SET_2 0x2502
@@ -140,6 +153,15 @@
#define SPR_MPL_PERF_COUNT_SET_0 0x2000
#define SPR_MPL_PERF_COUNT_SET_1 0x2001
#define SPR_MPL_PERF_COUNT_SET_2 0x2002
+#define SPR_MPL_SINGLE_STEP_1_SET_0 0x0300
+#define SPR_MPL_SINGLE_STEP_1_SET_1 0x0301
+#define SPR_MPL_SINGLE_STEP_1_SET_2 0x0302
+#define SPR_MPL_SWINT_0_SET_0 0x0f00
+#define SPR_MPL_SWINT_0_SET_1 0x0f01
+#define SPR_MPL_SWINT_0_SET_2 0x0f02
+#define SPR_MPL_SWINT_1_SET_0 0x0e00
+#define SPR_MPL_SWINT_1_SET_1 0x0e01
+#define SPR_MPL_SWINT_1_SET_2 0x0e02
#define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
#define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
#define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
@@ -155,6 +177,9 @@
#define SPR_MPL_UDN_TIMER_SET_0 0x1900
#define SPR_MPL_UDN_TIMER_SET_1 0x1901
#define SPR_MPL_UDN_TIMER_SET_2 0x1902
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1100
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1101
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1102
#define SPR_MPL_WORLD_ACCESS_SET_0 0x2700
#define SPR_MPL_WORLD_ACCESS_SET_1 0x2701
#define SPR_MPL_WORLD_ACCESS_SET_2 0x2702
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
index c20db8e..f07cc24 100644
--- a/arch/tile/include/uapi/asm/Kbuild
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -6,7 +6,9 @@ header-y += bitsperlong.h
header-y += byteorder.h
header-y += cachectl.h
header-y += hardwall.h
+header-y += kvm.h
header-y += kvm_para.h
+header-y += kvm_virtio.h
header-y += mman.h
header-y += ptrace.h
header-y += setup.h
diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
new file mode 100644
index 0000000..25ca8ce
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_H
+#define _UAPI_ASM_TILE_KVM_H
+
+#ifndef __ASSEMBLER__
+#include <linux/ptrace.h>
+#endif
+
+#include <arch/abi.h>
+
+/*
+ * For Hypervisor syscalls. Note this comes from the hv: syscall.h,
+ * with small modifications: Remove HV_SYS_fence_incoherent.
+ */
+/* Syscall allowed from guest PL bit mask. */
+#define HV_SYS_GUEST_SHIFT 12
+#define HV_SYS_GUEST_MASK (1 << HV_SYS_GUEST_SHIFT)
+/* downcall_dispatch; this syscall number must be zero */
+#define HV_SYS_downcall_dispatch 0
+/* install_context */
+#define HV_SYS_install_context 1
+/* sysconf */
+#define HV_SYS_sysconf 2
+/* get_rtc */
+#define HV_SYS_get_rtc 3
+/* set_rtc */
+#define HV_SYS_set_rtc 4
+/* flush_asid */
+#define HV_SYS_flush_asid 5
+/* flush_page */
+#define HV_SYS_flush_page 6
+/* flush_pages */
+#define HV_SYS_flush_pages 7
+/* restart */
+#define HV_SYS_restart 8
+/* halt */
+#define HV_SYS_halt 9
+/* power_off */
+#define HV_SYS_power_off 10
+/* inquire_physical */
+#define HV_SYS_inquire_physical 11
+/* inquire_memory_controller */
+#define HV_SYS_inquire_memory_controller 12
+/* inquire_virtual */
+#define HV_SYS_inquire_virtual 13
+/* inquire_asid */
+#define HV_SYS_inquire_asid 14
+/* console_read_if_ready */
+#define HV_SYS_console_read_if_ready 15
+/* console_write */
+#define HV_SYS_console_write 16
+/* init */
+#define HV_SYS_init 17
+/* inquire_topology */
+#define HV_SYS_inquire_topology 18
+/* fs_findfile */
+#define HV_SYS_fs_findfile 19
+/* fs_fstat */
+#define HV_SYS_fs_fstat 20
+/* fs_pread */
+#define HV_SYS_fs_pread 21
+/* physaddr_read64 */
+#define HV_SYS_physaddr_read64 22
+/* physaddr_write64 */
+#define HV_SYS_physaddr_write64 23
+/* get_command_line */
+#define HV_SYS_get_command_line 24
+/* set_caching */
+#define HV_SYS_set_caching 25
+/* bzero_page */
+#define HV_SYS_bzero_page 26
+/* register_message_state */
+#define HV_SYS_register_message_state 27
+/* send_message */
+#define HV_SYS_send_message 28
+/* receive_message */
+#define HV_SYS_receive_message 29
+/* inquire_context */
+#define HV_SYS_inquire_context 30
+/* start_all_tiles */
+#define HV_SYS_start_all_tiles 31
+/* dev_open */
+#define HV_SYS_dev_open 32
+/* dev_close */
+#define HV_SYS_dev_close 33
+/* dev_pread */
+#define HV_SYS_dev_pread 34
+/* dev_pwrite */
+#define HV_SYS_dev_pwrite 35
+/* dev_poll */
+#define HV_SYS_dev_poll 36
+/* dev_poll_cancel */
+#define HV_SYS_dev_poll_cancel 37
+/* dev_preada */
+#define HV_SYS_dev_preada 38
+/* dev_pwritea */
+#define HV_SYS_dev_pwritea 39
+/* flush_remote */
+#define HV_SYS_flush_remote 40
+/* console_putc */
+#define HV_SYS_console_putc 41
+/* inquire_tiles */
+#define HV_SYS_inquire_tiles 42
+/* confstr */
+#define HV_SYS_confstr 43
+/* reexec */
+#define HV_SYS_reexec 44
+/* set_command_line */
+#define HV_SYS_set_command_line 45
+
+/* store_mapping */
+#define HV_SYS_store_mapping 52
+/* inquire_realpa */
+#define HV_SYS_inquire_realpa 53
+/* flush_all */
+#define HV_SYS_flush_all 54
+/* get_ipi_pte */
+#define HV_SYS_get_ipi_pte 55
+/* set_pte_super_shift */
+#define HV_SYS_set_pte_super_shift 56
+/* set_speed */
+#define HV_SYS_set_speed 57
+/* install_virt_context */
+#define HV_SYS_install_virt_context 58
+/* inquire_virt_context */
+#define HV_SYS_inquire_virt_context 59
+/* install_guest_context */
+#define HV_SYS_install_guest_context 60
+/* inquire_guest_context */
+#define HV_SYS_inquire_guest_context 61
+
+/*
+ * First hypercall number (guest OS to host OS) that is not an hv_*() call.
+ * The 128 entries below it are reserved for the usual hv_*() calls
+ * as defined in hypervisor.h.
+ */
+#define KVM_OTHER_HCALL 128
+
+/* Hypercall index for virtio. */
+#define KVM_HCALL_virtio 128
+
+/* One greater than the maximum hypercall number. */
+#define KVM_NUM_HCALLS 256
+
+#ifndef __ASSEMBLER__
+
+struct kvm_regs {
+ struct pt_regs regs;
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+#ifndef __KERNEL__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
+#endif
+
+#define HCALL_DEFS \
+ /* For hv_*() */ \
+ KVM_EMULATE(init) \
+ NO_EMULATE(install_context) \
+ KVM_EMULATE(sysconf) \
+ KVM_EMULATE(get_rtc) \
+ KVM_EMULATE(set_rtc) \
+ NO_EMULATE(flush_asid) \
+ NO_EMULATE(flush_page) \
+ NO_EMULATE(flush_pages) \
+ USER_EMULATE(restart) \
+ USER_EMULATE(halt) \
+ USER_EMULATE(power_off) \
+ USER_EMULATE(inquire_physical) \
+ USER_EMULATE(inquire_memory_controller) \
+ KVM_EMULATE(inquire_virtual) \
+ KVM_EMULATE(inquire_asid) \
+ NO_EMULATE(console_read_if_ready) \
+ NO_EMULATE(console_write) \
+ NO_EMULATE(downcall_dispatch) \
+ KVM_EMULATE(inquire_topology) \
+ USER_EMULATE(fs_findfile) \
+ USER_EMULATE(fs_fstat) \
+ USER_EMULATE(fs_pread) \
+ KVM_EMULATE(physaddr_read64) \
+ KVM_EMULATE(physaddr_write64) \
+ USER_EMULATE(get_command_line) \
+ USER_EMULATE(set_caching) \
+ NO_EMULATE(bzero_page) \
+ KVM_EMULATE(register_message_state) \
+ KVM_EMULATE(send_message) \
+ KVM_EMULATE(receive_message) \
+ KVM_EMULATE(inquire_context) \
+ KVM_EMULATE(start_all_tiles) \
+ USER_EMULATE(dev_open) \
+ USER_EMULATE(dev_close) \
+ USER_EMULATE(dev_pread) \
+ USER_EMULATE(dev_pwrite) \
+ USER_EMULATE(dev_poll) \
+ USER_EMULATE(dev_poll_cancel) \
+ USER_EMULATE(dev_preada) \
+ USER_EMULATE(dev_pwritea) \
+ USER_EMULATE(flush_remote) \
+ NO_EMULATE(console_putc) \
+ KVM_EMULATE(inquire_tiles) \
+ KVM_EMULATE(confstr) \
+ USER_EMULATE(reexec) \
+ USER_EMULATE(set_command_line) \
+ USER_EMULATE(store_mapping) \
+ NO_EMULATE(inquire_realpa) \
+ NO_EMULATE(flush_all) \
+ KVM_EMULATE(get_ipi_pte) \
+ KVM_EMULATE(set_pte_super_shift) \
+ KVM_EMULATE(set_speed) \
+ /* For others */ \
+ USER_HCALL(virtio)
+
+#endif
+
+#endif /* _UAPI_ASM_TILE_KVM_H */
diff --git a/arch/tile/include/uapi/asm/kvm_virtio.h b/arch/tile/include/uapi/asm/kvm_virtio.h
new file mode 100644
index 0000000..d94f535
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm_virtio.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_VIRTIO_H
+#define _UAPI_ASM_TILE_KVM_VIRTIO_H
+
+#include <linux/types.h>
+
+#define KVM_VIRTIO_UNKNOWN 0
+#define KVM_VIRTIO_NOTIFY 1
+#define KVM_VIRTIO_RESET 2
+#define KVM_VIRTIO_SET_STATUS 3
+
+struct kvm_device_desc {
+ /* The device type: console, network, disk etc. Type 0 terminates. */
+ __u8 type;
+ /* The number of virtqueues (first in config array) */
+ __u8 num_vq;
+ /*
+ * The number of bytes of feature bits. Multiply by 2: one for host
+ * features and one for Guest acknowledgements.
+ */
+ __u8 feature_len;
+ /* The number of bytes of the config array after virtqueues. */
+ __u8 config_len;
+ /* A status byte, written by the Guest. */
+ __u8 status;
+ __u64 config[0];
+};
+
+struct kvm_vqinfo {
+ /* Pointer to the information contained in the device config. */
+ struct kvm_vqconfig *config;
+ /* The address where we mapped the virtio ring, so we can unmap it. */
+ void *pages;
+};
+
+struct kvm_vqconfig {
+ /* The physical address of the virtio ring */
+ __u64 pa;
+ /* The number of entries in the virtio_ring */
+ __u64 num;
+ /* The interrupt we get when something happens. Set by the guest. */
+ __u32 irq;
+
+};
+
+
+#endif /* _UAPI_ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index b7c8b5e..b638d3e 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -29,5 +29,6 @@ obj-$(CONFIG_TILE_USB) += usb.o
obj-$(CONFIG_TILE_HVGLUE_TRACE) += hvglue_trace.o
obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_KVM_GUEST) += kvm_virtio.o

obj-y += vdso/
diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
index 97ea6ac..0a04a16 100644
--- a/arch/tile/kernel/asm-offsets.c
+++ b/arch/tile/kernel/asm-offsets.c
@@ -20,6 +20,9 @@
#include <linux/hardirq.h>
#include <linux/ptrace.h>
#include <hv/hypervisor.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm_host.h>
+#endif

/* Check for compatible compiler early in the build. */
#ifdef CONFIG_TILEGX
@@ -68,6 +71,10 @@ void foo(void)
DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
offsetof(struct thread_info, unalign_jit_tmp));
#endif
+#ifdef CONFIG_KVM
+ DEFINE(THREAD_INFO_VCPU_OFFSET,
+ offsetof(struct thread_info, vcpu));
+#endif

DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
offsetof(struct task_struct, thread.ksp));
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index b608e00..0393689 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -18,11 +18,27 @@
#include <linux/string.h>
#include <linux/irqflags.h>
#include <linux/printk.h>
+#ifdef CONFIG_KVM_GUEST
+#include <linux/virtio_console.h>
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+#endif
#include <asm/setup.h>
#include <hv/hypervisor.h>

+
static void early_hv_write(struct console *con, const char *s, unsigned n)
{
+#ifdef CONFIG_KVM_GUEST
+ char buf[512];
+
+ if (n > sizeof(buf) - 1)
+ n = sizeof(buf) - 1;
+ memcpy(buf, s, n);
+ buf[n] = '\0';
+
+ hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
+#else
tile_console_write(s, n);

/*
@@ -32,6 +48,7 @@ static void early_hv_write(struct console *con, const char *s, unsigned n)
*/
if (n && s[n-1] == '\n')
tile_console_write("\r", 1);
+#endif
}

static struct console early_hv_console = {
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index f3f17b0..8d5b40f 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -162,8 +162,8 @@ ENTRY(swapper_pg_dir)
.set addr, addr + PGDIR_SIZE
.endr

- /* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
- PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+ /* The true text VAs are mapped as VA = PA + MEM_SV_START */
+ PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
(1 << (HV_PTE_INDEX_EXECUTABLE - 32))
.org swapper_pg_dir + PGDIR_SIZE
END(swapper_pg_dir)
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 652b814..bd0e12f 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -135,9 +135,9 @@ ENTRY(_start)
1:

/* Install the interrupt base. */
- moveli r0, hw2_last(MEM_SV_START)
- shl16insli r0, r0, hw1(MEM_SV_START)
- shl16insli r0, r0, hw0(MEM_SV_START)
+ moveli r0, hw2_last(intrpt_start)
+ shl16insli r0, r0, hw1(intrpt_start)
+ shl16insli r0, r0, hw0(intrpt_start)
mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0

/* Get our processor number and save it away in SAVE_K_0. */
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
index 16576c6..2914a9e 100644
--- a/arch/tile/kernel/hvglue.S
+++ b/arch/tile/kernel/hvglue.S
@@ -71,5 +71,11 @@ gensym hv_flush_all, 0x6e0, 32
gensym hv_get_ipi_pte, 0x700, 32
gensym hv_set_pte_super_shift, 0x720, 32
gensym hv_set_speed, 0x740, 32
+gensym hv_install_virt_context, 0x760, 32
+gensym hv_inquire_virt_context, 0x780, 32
+gensym hv_install_guest_context, 0x7a0, 32
+gensym hv_inquire_guest_context, 0x7c0, 32
gensym hv_console_set_ipi, 0x7e0, 32
-gensym hv_glue_internals, 0x800, 30720
+gensym hv_glue_internals, 0x800, 2048
+gensym hcall_virtio, 0x1000, 32
+gensym hv_hcall_internals, 0x1020, 28640
diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
index 16ef6c1..3b15c76 100644
--- a/arch/tile/kernel/hvglue_trace.c
+++ b/arch/tile/kernel/hvglue_trace.c
@@ -75,6 +75,10 @@
#define hv_get_ipi_pte _hv_get_ipi_pte
#define hv_set_pte_super_shift _hv_set_pte_super_shift
#define hv_set_speed _hv_set_speed
+#define hv_install_virt_context _hv_install_virt_context
+#define hv_inquire_virt_context _hv_inquire_virt_context
+#define hv_install_guest_context _hv_install_guest_context
+#define hv_inquire_guest_context _hv_inquire_guest_context
#define hv_console_set_ipi _hv_console_set_ipi
#include <hv/hypervisor.h>
#undef hv_init
@@ -135,6 +139,10 @@
#undef hv_get_ipi_pte
#undef hv_set_pte_super_shift
#undef hv_set_speed
+#undef hv_install_virt_context
+#undef hv_inquire_virt_context
+#undef hv_install_guest_context
+#undef hv_inquire_guest_context
#undef hv_console_set_ipi

/*
@@ -209,8 +217,14 @@ HV_WRAP3(HV_SetSpeed, hv_set_speed, unsigned long, speed, __hv64, start_cycle,
unsigned long, flags)
HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_virt_context, HV_PhysAddr, page_table, HV_PTE, access,
+ HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_guest_context, HV_PhysAddr, page_table, HV_PTE, access,
+ HV_ASID, asid, __hv32, flags)
HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
HV_WRAP0(HV_Context, hv_inquire_context)
+HV_WRAP0(HV_Context, hv_inquire_virt_context)
+HV_WRAP0(HV_Context, hv_inquire_guest_context)
HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f3d26f4..2ce69a5 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -353,7 +353,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -806,7 +806,7 @@ handle_interrupt:
STD_ENTRY(interrupt_return)
/* If we're resuming to kernel space, don't check thread flags. */
{
- bnz r30, .Lrestore_all /* NMIs don't special-case user-space */
+ bnz r30, restore_all /* NMIs don't special-case user-space */
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
lw r29, r29
@@ -845,11 +845,11 @@ STD_ENTRY(interrupt_return)
seq r27, r27, r28
}
{
- bbns r27, .Lrestore_all
+ bbns r27, restore_all
addi r28, r28, 8
}
sw r29, r28
- j .Lrestore_all
+ j restore_all

.Lresume_userspace:
FEEDBACK_REENTER(interrupt_return)
@@ -887,7 +887,7 @@ STD_ENTRY(interrupt_return)
auli r1, r1, ha16(_TIF_ALLWORK_MASK)
}
and r1, r29, r1
- bzt r1, .Lrestore_all
+ bzt r1, restore_all

/*
* Make sure we have all the registers saved for signal
@@ -926,7 +926,9 @@ STD_ENTRY(interrupt_return)
* profile interrupt will actually disable interrupts in both SPRs
* before returning, which is OK.)
*/
-.Lrestore_all:
+ .global restore_all
+ .type restore_all, @function
+restore_all:
PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
{
lw r0, r0
@@ -1890,8 +1892,8 @@ int_unalign:
push_extra_callee_saves r0
j do_trap

-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"

#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 30d2d02..54ae76b 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -29,11 +29,25 @@
#include <arch/abi.h>
#include <arch/interrupts.h>
#include <arch/spr_def.h>
+#include <arch/opcode.h>
+#ifdef CONFIG_KVM
+#include <asm/kvm_host.h>
+#endif

#define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)

#define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)

+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set). Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif

.macro push_reg reg, ptr=sp, delta=-8
{
@@ -302,7 +316,7 @@ intvec_\vecname:
mtspr SPR_SYSTEM_SAVE_K_1, r0
mfspr r0, SPR_EX_CONTEXT_K_1

- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r0, r0)

.ifc \vecnum, INT_DOUBLE_FAULT
/*
@@ -340,10 +354,6 @@ intvec_\vecname:
*
* Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for
* any path that turns into a downcall to one of our TLB handlers.
- *
- * FIXME: if we end up never using this path, perhaps we should
- * prevent the hypervisor from generating downcalls in this case.
- * The advantage of getting a downcall is we can panic in Linux.
*/
mfspr r0, SPR_SYSTEM_SAVE_K_2
{
@@ -483,6 +493,10 @@ intvec_\vecname:
mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
.else
+ .ifc \c_routine, kvm_vpgtable_miss
+ mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
+ mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
+ .else
.ifc \vecnum, INT_ILL_TRANS
mfspr r2, ILL_VA_PC
.else
@@ -505,6 +519,7 @@ intvec_\vecname:
.endif
.endif
.endif
+ .endif
/* Put function pointer in r0 */
moveli r0, hw2_last(\c_routine)
shl16insli r0, r0, hw1(\c_routine)
@@ -518,7 +533,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -634,24 +649,25 @@ intvec_\vecname:
/*
* If we will be returning to the kernel, we will need to
* reset the interrupt masks to the state they had before.
- * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+ * Set DISABLE_IRQ in flags iff we came from kernel pl with
+ * irqs disabled.
*/
- mfspr r32, SPR_EX_CONTEXT_K_1
+ mfspr r22, SPR_EX_CONTEXT_K_1
{
- andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r22, r22)
PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
}
- beqzt r32, 1f /* zero if from user space */
- IRQS_DISABLED(r32) /* zero if irqs enabled */
+ beqzt r22, 1f /* zero if from user space */
+ IRQS_DISABLED(r22) /* zero if irqs enabled */
#if PT_FLAGS_DISABLE_IRQ != 1
# error Value of IRQS_DISABLED used to set PT_FLAGS_DISABLE_IRQ; fix
#endif
1:
.ifnc \function,handle_syscall
/* Record the fact that we saved the caller-save registers above. */
- ori r32, r32, PT_FLAGS_CALLER_SAVES
+ ori r22, r22, PT_FLAGS_CALLER_SAVES
.endif
- st r21, r32
+ st r21, r22

/*
* we've captured enough state to the stack (including in
@@ -691,12 +707,29 @@ intvec_\vecname:
move tp, zero
#endif

+ /*
+ * Prepare the first 256 stack bytes to be rapidly accessible
+ * without having to fetch the background data.
+ */
+ addi r52, sp, -64
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ wh64 r52
+
#ifdef __COLLECT_LINKER_FEEDBACK__
/*
* Notify the feedback routines that we were in the
- * appropriate fixed interrupt vector area. Note that we
- * still have ICS set at this point, so we can't invoke any
- * atomic operations or we will panic. The feedback
+ * appropriate fixed interrupt vector area. The feedback
* routines internally preserve r0..r10 and r30 up.
*/
.ifnc \function,handle_syscall
@@ -715,23 +748,15 @@ intvec_\vecname:
#endif

/*
- * Prepare the first 256 stack bytes to be rapidly accessible
- * without having to fetch the background data.
+ * Stash any interrupt state in r30..r33 for now.
+ * This makes it easier to call C code in the code that follows.
+ * We don't need to on the syscall path since we reload
+ * them from the stack instead.
*/
- addi r52, sp, -64
- {
- wh64 r52
- addi r52, r52, -64
- }
- {
- wh64 r52
- addi r52, r52, -64
- }
- {
- wh64 r52
- addi r52, r52, -64
- }
- wh64 r52
+ .ifnc \function,handle_syscall
+ { move r30, r0; move r31, r1 }
+ { move r32, r2; move r33, r3 }
+ .endif

#ifdef CONFIG_TRACE_IRQFLAGS
.ifnc \function,handle_nmi
@@ -742,17 +767,8 @@ intvec_\vecname:
* For syscalls, we already have the register state saved away
* on the stack, so we don't bother to do any register saves here,
* and later we pop the registers back off the kernel stack.
- * For interrupt handlers, save r0-r3 in callee-saved registers.
*/
- .ifnc \function,handle_syscall
- { move r30, r0; move r31, r1 }
- { move r32, r2; move r33, r3 }
- .endif
TRACE_IRQS_OFF
- .ifnc \function,handle_syscall
- { move r0, r30; move r1, r31 }
- { move r2, r32; move r3, r33 }
- .endif
.endif
#endif

@@ -801,11 +817,11 @@ handle_interrupt:
STD_ENTRY(interrupt_return)
/* If we're resuming to kernel space, don't check thread flags. */
{
- bnez r30, .Lrestore_all /* NMIs don't special-case user-space */
+ bnez r30, restore_all /* NMIs don't special-case user-space */
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
ld r29, r29
- andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r29, r29)
{
beqzt r29, .Lresume_userspace
move r29, sp
@@ -817,14 +833,25 @@ STD_ENTRY(interrupt_return)
addli r28, r29, THREAD_INFO_FLAGS_OFFSET
{
ld r28, r28
- addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
+ addli r26, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
}
{
- andi r28, r28, _TIF_NEED_RESCHED
- ld4s r29, r29
+ andi r27, r28, _TIF_NEED_RESCHED
+ ld4s r26, r26
}
- beqzt r28, 1f
- bnez r29, 1f
+ beqzt r27, 1f
+ bnez r26, 1f
+#ifdef CONFIG_KVM
+ addli r27, r29, THREAD_INFO_VCPU_OFFSET
+ ld r27, r27
+ {
+ beqzt r27, 0f
+ movei r1, KVM_EXIT_AGAIN
+ }
+ push_extra_callee_saves r0
+ j kvm_trigger_vmexit
+0:
+#endif
jal preempt_schedule_irq
FEEDBACK_REENTER(interrupt_return)
1:
@@ -846,11 +873,11 @@ STD_ENTRY(interrupt_return)
cmpeq r27, r27, r28
}
{
- blbc r27, .Lrestore_all
+ blbc r27, restore_all
addi r28, r28, 8
}
st r29, r28
- j .Lrestore_all
+ j restore_all

.Lresume_userspace:
FEEDBACK_REENTER(interrupt_return)
@@ -890,7 +917,7 @@ STD_ENTRY(interrupt_return)
shl16insli r1, r1, hw0(_TIF_ALLWORK_MASK)
}
and r1, r29, r1
- beqzt r1, .Lrestore_all
+ beqzt r1, restore_all

/*
* Make sure we have all the registers saved for signal
@@ -922,14 +949,16 @@ STD_ENTRY(interrupt_return)
* ICS can only be used in very tight chunks of code to avoid
* tripping over various assertions that it is off.
*/
-.Lrestore_all:
+ .global restore_all
+ .type restore_all, @function
+restore_all:
PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
{
ld r0, r0
PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
}
{
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+ IS_KERNEL_EX1(r0, r0)
ld r32, r32
}
bnez r0, 1f
@@ -1000,7 +1029,7 @@ STD_ENTRY(interrupt_return)
pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
{
mtspr SPR_EX_CONTEXT_K_1, lr
- andi lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(lr, lr)
}
{
mtspr SPR_EX_CONTEXT_K_0, r21
@@ -1450,6 +1479,26 @@ int_unalign:
j do_unaligned
ENDPROC(hand_unalign_slow)

+#ifdef CONFIG_KVM
+/*
+ * Any call path that may lead to a vmexit needs to save the full
+ * callee-save register state, since if we vmexit we don't unwind
+ * the callee-saves from the C function stack frames, and instead
+ * just save away the register state from the interrupt handler as-is
+ * and later reload it directly and call back into the guest.
+ */
+	/*
+	 * Emit the entry point kvm_<func>, which runs
+	 * push_extra_callee_saves (expected to spill the remaining
+	 * callee-saved registers, per the comment above) and then
+	 * tail-calls the handler kvm_do_<func>.
+	 *
+	 * ENDPROC must name the label actually defined here (kvm_<func>),
+	 * otherwise the symbol type/size annotation is attached to an
+	 * unrelated, undefined symbol.
+	 */
+	.macro save_callee_saves_and_tailcall func
+kvm_\func:
+	push_extra_callee_saves r0
+	j kvm_do_\func
+	ENDPROC(kvm_\func)
+	.endm
+
+ save_callee_saves_and_tailcall hypervisor_call
+ save_callee_saves_and_tailcall vpgtable_miss
+ save_callee_saves_and_tailcall vguest_fatal
+#endif
+
/* Fill the return address stack with nonzero entries. */
STD_ENTRY(fill_ra_stack)
{
@@ -1462,13 +1511,57 @@ STD_ENTRY(fill_ra_stack)
4: jrp r0
STD_ENDPROC(fill_ra_stack)

+#ifdef CONFIG_KVM
+/*
+ * Handle the downcall dispatch service. On entry, the client's
+ * system save register 3 holds the original contents of
+ * TREG_SYSCALL_NR_NAME, which we need to restore before we iret to
+ * the correct interrupt vector.
+ * Note that we only support the INT_MESSAGE_RCV_DWNCL interrupt
+ * here, since this is the only interrupt handled this way on GX.
+ */
+handle_downcall_dispatch:
+ /*
+ * If we were called from PL0, jump back to slow path.
+ * We check just the low bit to make sure it's set, since we
+ * can only be called from PL0 or PL1.
+ *
+ * TREG_SYSCALL_NR_NAME is used as the only scratch register
+ * throughout this routine.
+ * NOTE(review): on the PL0 path the register now holds
+ * EX_CONTEXT_K_1 rather than its original value when we branch
+ * to intvec_SWINT_0 -- confirm the slow path does not depend on it.
+ */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_EX_CONTEXT_K_1
+ blbc TREG_SYSCALL_NR_NAME, intvec_SWINT_0
+
+ /* Set the PC to the downcall interrupt vector, and PL to guest. */
+ /* Vector entries are 256 bytes apart, hence the "<< 8" below. */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_INTERRUPT_VECTOR_BASE_1
+ addli TREG_SYSCALL_NR_NAME, TREG_SYSCALL_NR_NAME, \
+ INT_MESSAGE_RCV_DWNCL << 8
+ {
+ mtspr SPR_EX_CONTEXT_K_0, TREG_SYSCALL_NR_NAME
+ movei TREG_SYSCALL_NR_NAME, GUEST_PL | SPR_EX_CONTEXT_1_1__ICS_MASK
+ }
+ mtspr SPR_EX_CONTEXT_K_1, TREG_SYSCALL_NR_NAME
+
+ /* Restore TREG_SYSCALL_NR_NAME and return to the new vector. */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_SYSTEM_SAVE_1_3
+ iret
+
+ /*
+ * Variant of int_hand for the hypercall vector: a zero value in
+ * TREG_SYSCALL_NR_NAME is diverted to handle_downcall_dispatch
+ * before the regular __int_hand entry code is emitted.
+ */
+ .macro int_hand_kvm_hcall vecnum, vecname, c_routine, \
+ processing=handle_interrupt
+ .org (\vecnum << 8)
+ /* Need special code for downcall dispatch syscall. */
+ beqz TREG_SYSCALL_NR_NAME, handle_downcall_dispatch
+ __int_hand \vecnum, \vecname, \c_routine, \processing
+ .endm
+
+#endif /* CONFIG_KVM */
+
.macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt
.org (\vecnum << 8)
__int_hand \vecnum, \vecname, \c_routine, \processing
.endm

-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
+ .global intrpt_start
+intrpt_start:

#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
@@ -1477,6 +1570,11 @@ STD_ENTRY(fill_ra_stack)
#define do_hardwall_trap bad_intr
#endif

+#ifndef CONFIG_KVM
+#define kvm_vpgtable_miss bad_intr
+#define kvm_vguest_fatal bad_intr
+#endif
+
int_hand INT_MEM_ERROR, MEM_ERROR, do_trap
int_hand INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr
#if CONFIG_KERNEL_PL == 2
@@ -1497,14 +1595,24 @@ STD_ENTRY(fill_ra_stack)
int_hand INT_SWINT_3, SWINT_3, do_trap
int_hand INT_SWINT_2, SWINT_2, do_trap
int_hand INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
+#ifdef CONFIG_KVM
+ int_hand_kvm_hcall INT_SWINT_0, SWINT_0, kvm_hypervisor_call
+#else
int_hand INT_SWINT_0, SWINT_0, do_trap
+#endif
int_hand INT_ILL_TRANS, ILL_TRANS, do_trap
int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
int_hand INT_DTLB_MISS, DTLB_MISS, do_page_fault
int_hand INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
int_hand INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
int_hand INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap
+#ifndef CONFIG_KVM_GUEST
int_hand INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt
+ int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, bad_intr
+#else
+ int_hand INT_TILE_TIMER, TILE_TIMER, bad_intr
+ int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, do_timer_interrupt
+#endif
int_hand INT_IDN_TIMER, IDN_TIMER, bad_intr
int_hand INT_UDN_TIMER, UDN_TIMER, bad_intr
int_hand INT_IDN_AVAIL, IDN_AVAIL, bad_intr
@@ -1534,8 +1642,10 @@ STD_ENTRY(fill_ra_stack)
int_hand INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
hv_message_intr
int_hand INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, bad_intr
- int_hand INT_I_ASID, I_ASID, bad_intr
- int_hand INT_D_ASID, D_ASID, bad_intr
+ int_hand INT_VPGTABLE_MISS_DWNCL, VPGTABLE_MISS_DWNCL, \
+ kvm_vpgtable_miss
+ int_hand INT_VGUEST_FATAL_DWNCL, VGUEST_FATAL_DWNCL, \
+ kvm_vguest_fatal
int_hand INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap

/* Synthetic interrupt delivered only by the simulator */
diff --git a/arch/tile/kernel/kvm_virtio.c b/arch/tile/kernel/kvm_virtio.c
new file mode 100644
index 0000000..c6b6c6a
--- /dev/null
+++ b/arch/tile/kernel/kvm_virtio.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+/* Based on the lguest and s390 implementations. */
+/*
+ * kvm_virtio.c - virtio for kvm on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): Christian Borntraeger <borntraeger@xxxxxxxxxx>
+ */
+
+#include <linux/bootmem.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+
+static void *kvm_devices;
+
+/*
+ * TODO: We do not actually use PCI virtio here. We use this value
+ * because qemu's virtqueue_init() uses VIRTIO_PCI_VRING_ALIGN.
+ * Maybe we should change them to generic definitions in both qemu & Linux.
+ * Also, we should check later whether the alignment value (4096, i.e. the
+ * default x86 page size) affects performance.
+ */
+#define KVM_TILE_VIRTIO_RING_ALIGN VIRTIO_PCI_VRING_ALIGN
+#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev)
+
+/*
+ * memory layout: (Total: PAGE_SIZE)
+ * <device 0>
+ * - kvm device descriptor
+ * struct kvm_device_desc
+ * - vqueue configuration (totally desc->num_vq)
+ * struct kvm_vqconfig
+ * ......
+ * struct kvm_vqconfig
+ * - feature bits (size: desc->feature_len * 2)
+ * - config space (size: desc->config_len)
+ * <device 1>
+ * ......
+ */
+/* Return the first vqueue config entry, located directly after @desc. */
+static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc)
+{
+	const struct kvm_device_desc *past_desc = desc + 1;
+
+	return (struct kvm_vqconfig *)past_desc;
+}
+
+/* Return the feature bitmaps, which follow the desc->num_vq vqueue configs. */
+static u8 *kvm_vq_features(const struct kvm_device_desc *desc)
+{
+	struct kvm_vqconfig *vq_end = &kvm_vq_config(desc)[desc->num_vq];
+
+	return (u8 *)vq_end;
+}
+
+/* Return the config space, which follows the two feature bitmaps. */
+static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc)
+{
+	u8 *features = kvm_vq_features(desc);
+
+	return &features[desc->feature_len * 2];
+}
+
+/*
+ * The total size of the config page used by this device (incl. desc)
+ */
+static unsigned desc_size(const struct kvm_device_desc *desc)
+{
+	/* Descriptor header first, then each variable-sized trailer. */
+	unsigned total = sizeof(*desc);
+
+	total += desc->num_vq * sizeof(struct kvm_vqconfig);
+	total += desc->feature_len * 2;
+	total += desc->config_len;
+
+	return total;
+}
+
+/* This gets the device's feature bits. */
+/*
+ * Collect the device's offered feature bits into a u32.
+ *
+ * Bit i of the result is set when bit (i % 8) of byte (i / 8) of the
+ * first feature bitmap is set. At most the first 32 bits (or
+ * desc->feature_len * 8, if smaller) are examined.
+ */
+static u32 kvm_get_features(struct virtio_device *vdev)
+{
+	unsigned int i;
+	u32 features = 0;
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+	u8 *in_features = kvm_vq_features(desc);
+
+	for (i = 0; i < min(desc->feature_len * 8, 32); i++) {
+		/* Use an unsigned constant: "1 << 31" overflows a signed int. */
+		if (in_features[i / 8] & (1 << (i % 8)))
+			features |= (1U << i);
+	}
+	return features;
+}
+
+/*
+ * Publish the features the guest accepts by writing them into the second
+ * feature bitmap of the device descriptor.
+ */
+static void kvm_finalize_features(struct virtio_device *vdev)
+{
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+	/* The accepted-features bitmap is the second half of the pair. */
+	u8 *accepted = kvm_vq_features(desc) + desc->feature_len;
+	unsigned int nbits, bit;
+
+	/* Let virtio_ring adjust the transport feature bits first. */
+	vring_transport_features(vdev);
+
+	memset(accepted, 0, desc->feature_len);
+	nbits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
+	for (bit = 0; bit < nbits; bit++) {
+		if (!test_bit(bit, vdev->features))
+			continue;
+		accepted[bit / 8] |= (1 << (bit % 8));
+	}
+}
+
+/*
+ * Reading and writing elements in config space
+ */
+/* Copy @len bytes at @offset out of the device's config space into @buf. */
+static void kvm_get(struct virtio_device *vdev, unsigned int offset,
+		    void *buf, unsigned len)
+{
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+	const u8 *src;
+
+	BUG_ON(offset + len > desc->config_len);
+
+	src = kvm_vq_configspace(desc) + offset;
+	memcpy(buf, src, len);
+}
+
+/* Copy @len bytes from @buf into the device's config space at @offset. */
+static void kvm_set(struct virtio_device *vdev, unsigned int offset,
+		    const void *buf, unsigned len)
+{
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+	u8 *dst;
+
+	BUG_ON(offset + len > desc->config_len);
+
+	dst = kvm_vq_configspace(desc) + offset;
+	memcpy(dst, buf, len);
+}
+
+/*
+ * The operations to get and set the status word just access
+ * the status field of the device descriptor. set_status will also
+ * make a hypercall to the host, to tell about status changes
+ */
+/* Read the status byte straight from the device descriptor. */
+static u8 kvm_get_status(struct virtio_device *vdev)
+{
+	struct kvm_device *kdev = to_kvmdev(vdev);
+
+	return kdev->desc->status;
+}
+
+/* Store a non-zero status in the descriptor and notify the host. */
+static void kvm_set_status(struct virtio_device *vdev, u8 status)
+{
+	struct kvm_device *kdev = to_kvmdev(vdev);
+
+	BUG_ON(!status);
+	kdev->desc->status = status;
+	hcall_virtio(KVM_VIRTIO_SET_STATUS, kdev->desc_pa);
+}
+
+/*
+ * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the
+ * descriptor address. The Host will zero the status and all the
+ * features.
+ */
+static void kvm_reset(struct virtio_device *vdev)
+{
+	struct kvm_device *kdev = to_kvmdev(vdev);
+
+	/* The device is identified by its descriptor's physical address. */
+	hcall_virtio(KVM_VIRTIO_RESET, kdev->desc_pa);
+}
+
+/*
+ * When the virtio_ring code wants to notify the Host, it calls us here and we
+ * make a hypercall. We hand the address of the virtqueue so the Host
+ * knows which virtqueue we're talking about.
+ */
+static void kvm_notify(struct virtqueue *vq)
+{
+	struct kvm_vqinfo *info = vq->priv;
+	struct kvm_vqconfig *cfg = info->config;
+
+	/* The ring's physical address tells the host which queue is meant. */
+	hcall_virtio(KVM_VIRTIO_NOTIFY, cfg->pa);
+}
+
+/*
+ * Must set some caching mode to keep set_pte() happy.
+ * It doesn't matter what we choose, because the PFN
+ * is illegal, so we're going to take a page fault anyway.
+ */
+/* Kernel page protection with the caching mode forced to "uncached". */
+static inline pgprot_t io_prot(void)
+{
+	pgprot_t prot = hv_pte_set_mode(PAGE_KERNEL, HV_PTE_MODE_UNCACHED);
+
+	return prot;
+}
+
+/*
+ * This routine finds the virtqueue with the given index in the
+ * configuration of this device and sets it up.
+ */
+/*
+ * @vdev:     device owning the queue
+ * @index:    position of the queue in the descriptor's vqueue config array
+ * @callback: passed through to vring_new_virtqueue()
+ * @name:     passed through to vring_new_virtqueue()
+ *
+ * Maps the ring, creates the vring virtqueue, and allocates and requests
+ * an irq whose number is written to config->irq.
+ *
+ * Returns the new virtqueue, or an ERR_PTR(): -ENOENT for a bad index,
+ * -ENOMEM for allocation/mapping failures, -ENXIO for irq failures.
+ * Everything acquired here, including the vqinfo, is released on failure.
+ */
+static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
+				     unsigned index,
+				     void (*callback)(struct virtqueue *vq),
+				     const char *name)
+{
+	struct kvm_device *kdev = to_kvmdev(vdev);
+	struct kvm_vqinfo *vqi;
+	struct kvm_vqconfig *config;
+	struct virtqueue *vq;
+	long irq;
+	int err = -EINVAL;
+
+	if (index >= kdev->desc->num_vq)
+		return ERR_PTR(-ENOENT);
+
+	vqi = kzalloc(sizeof(*vqi), GFP_KERNEL);
+	if (!vqi)
+		return ERR_PTR(-ENOMEM);
+
+	config = kvm_vq_config(kdev->desc)+index;
+
+	vqi->config = config;
+	vqi->pages = generic_remap_prot(config->pa,
+				vring_size(config->num,
+					   KVM_TILE_VIRTIO_RING_ALIGN),
+				0, io_prot());
+	if (!vqi->pages) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	vq = vring_new_virtqueue(index, config->num, KVM_TILE_VIRTIO_RING_ALIGN,
+				 vdev, 0, vqi->pages,
+				 kvm_notify, callback, name);
+	if (!vq) {
+		err = -ENOMEM;
+		goto unmap;
+	}
+
+	/*
+	 * Trigger the IPI interrupt in SW way.
+	 * TODO: We do not need to create one irq for each vq. A bit wasteful.
+	 */
+	irq = create_irq();
+	if (irq < 0) {
+		err = -ENXIO;
+		goto del_virtqueue;
+	}
+
+	tile_irq_activate(irq, TILE_IRQ_SW_CLEAR);
+
+	if (request_irq(irq, vring_interrupt, 0, dev_name(&vdev->dev), vq)) {
+		err = -ENXIO;
+		destroy_irq(irq);
+		goto del_virtqueue;
+	}
+
+	config->irq = irq;
+
+	vq->priv = vqi;
+	return vq;
+
+del_virtqueue:
+	vring_del_virtqueue(vq);
+unmap:
+	vunmap(vqi->pages);
+out:
+	/* The vqinfo is only handed to the vq on success; free it here. */
+	kfree(vqi);
+	return ERR_PTR(err);
+}
+
+/*
+ * Tear down one virtqueue created by kvm_find_vq(): release its irq,
+ * delete the vring, unmap the ring pages and free the vqinfo.
+ */
+static void kvm_del_vq(struct virtqueue *vq)
+{
+	struct kvm_vqinfo *vqi = vq->priv;
+	unsigned int irq = vqi->config->irq;
+
+	/*
+	 * kvm_find_vq() registered vring_interrupt with dev_id == vq, so
+	 * the handler must be removed before the virtqueue is freed;
+	 * otherwise a late interrupt would dereference freed memory and
+	 * the irq would be leaked.
+	 */
+	free_irq(irq, vq);
+	destroy_irq(irq);
+
+	vring_del_virtqueue(vq);
+	vunmap(vqi->pages);
+	kfree(vqi);
+}
+
+/* Delete every virtqueue currently linked on @vdev's vqs list. */
+static void kvm_del_vqs(struct virtio_device *vdev)
+{
+	struct virtqueue *cur, *next;
+
+	/* The _safe iterator is needed because each entry is deleted. */
+	list_for_each_entry_safe(cur, next, &vdev->vqs, list) {
+		kvm_del_vq(cur);
+	}
+}
+
+/*
+ * Set up the first @nvqs virtqueues of @vdev, storing them in @vqs.
+ * Returns 0 on success. If any queue fails, all queues on the device
+ * are deleted again and that queue's error code is returned.
+ */
+static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+			struct virtqueue *vqs[],
+			vq_callback_t *callbacks[],
+			const char *names[])
+{
+	struct kvm_device *kdev = to_kvmdev(vdev);
+	int i;
+
+	/* The device must describe at least as many queues as requested. */
+	if (kdev->desc->num_vq < nvqs)
+		return -ENOENT;
+
+	for (i = 0; i < nvqs; ++i) {
+		struct virtqueue *vq;
+
+		vq = kvm_find_vq(vdev, i, callbacks[i], names[i]);
+		vqs[i] = vq;
+		if (IS_ERR(vq)) {
+			kvm_del_vqs(vdev);
+			return PTR_ERR(vq);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * The config ops structure as defined by virtio config
+ */
+/*
+ * Read-only after build: the table is only ever referenced through
+ * vdev.config and never written, so keep it const.
+ */
+static const struct virtio_config_ops kvm_vq_config_ops = {
+	.get_features = kvm_get_features,
+	.finalize_features = kvm_finalize_features,
+	.get = kvm_get,
+	.set = kvm_set,
+	.get_status = kvm_get_status,
+	.set_status = kvm_set_status,
+	.reset = kvm_reset,
+	.find_vqs = kvm_find_vqs,
+	.del_vqs = kvm_del_vqs,
+};
+
+/*
+ * The root device for the kvm virtio devices.
+ * This makes them appear as /sys/devices/kvm_tile/0,1,2 not /sys/devices/0,1,2.
+ */
+static struct device *kvm_root;
+
+/*
+ * adds a new device and register it with virtio
+ * appropriate drivers are loaded by the device model
+ */
+/*
+ * @d:      descriptor of the device inside the mapped device page
+ * @offset: byte offset of @d within that page
+ *
+ * Failures are logged and otherwise ignored; nothing is returned.
+ */
+static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset)
+{
+	struct kvm_device *kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
+
+	if (!kdev) {
+		pr_emerg("Cannot allocate kvm dev %u type %u\n",
+			 offset, d->type);
+		return;
+	}
+
+	/* The device page sits at PFN_PHYS(max_pfn); add the offset into it. */
+	kdev->desc = d;
+	kdev->desc_pa = PFN_PHYS(max_pfn) + offset;
+
+	kdev->vdev.config = &kvm_vq_config_ops;
+	kdev->vdev.id.device = d->type;
+	kdev->vdev.dev.parent = kvm_root;
+
+	if (register_virtio_device(&kdev->vdev) == 0)
+		return;
+
+	pr_err("Failed to register kvm device %u type %u\n",
+	       offset, d->type);
+	kfree(kdev);
+}
+
+/*
+ * scan_devices() simply iterates through the device page.
+ * The type 0 is reserved to mean "end of devices".
+ */
+static void scan_devices(void)
+{
+	unsigned int off = 0;
+
+	/* Descriptors are packed back to back; step by each one's size. */
+	while (off < PAGE_SIZE) {
+		struct kvm_device_desc *d = kvm_devices + off;
+
+		/* A zero type terminates the list. */
+		if (d->type == 0)
+			break;
+
+		add_kvm_device(d, off);
+		off += desc_size(d);
+	}
+}
+
+/*
+ * Init function for virtio.
+ * devices are in a single page above the top of "normal" mem.
+ */
+/*
+ * Registers the "kvm_tile" root device, maps the device page located at
+ * PFN_PHYS(max_pfn), and registers every device described in it.
+ *
+ * Returns 0 on success, the root_device_register() error if that fails,
+ * or -ENOMEM if the device page cannot be mapped.
+ */
+static int __init kvm_devices_init(void)
+{
+	int rc;
+
+	kvm_root = root_device_register("kvm_tile");
+	if (IS_ERR(kvm_root)) {
+		rc = PTR_ERR(kvm_root);
+		/* Terminate the message so it is not merged with the next log line. */
+		pr_err("Could not register kvm_tile root device\n");
+		return rc;
+	}
+
+	kvm_devices = generic_remap_prot(PFN_PHYS(max_pfn), PAGE_SIZE,
+					 0, io_prot());
+	if (!kvm_devices) {
+		/* Undo the root device registration; kvm_devices is already NULL. */
+		root_device_unregister(kvm_root);
+		return -ENOMEM;
+	}
+
+	scan_devices();
+	return 0;
+}
+
+/* code for early console output with virtio_console */
+/*
+ * Copy up to sizeof(scratch) - 1 bytes of @buf into a NUL-terminated
+ * on-stack buffer and pass its physical address to the host.
+ * Returns the number of bytes taken from @buf.
+ */
+static __init int early_put_chars(u32 vtermno, const char *buf, int len)
+{
+	char scratch[512];
+
+	/* Leave room for the terminating NUL. */
+	if (len > sizeof(scratch) - 1)
+		len = sizeof(scratch) - 1;
+
+	memcpy(scratch, buf, len);
+	scratch[len] = '\0';
+
+	hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(scratch));
+
+	return len;
+}
+
+/* Hook early_put_chars() up as the early virtio console output routine. */
+static int __init tile_virtio_console_init(void)
+{
+	int rc = virtio_cons_early_init(early_put_chars);
+
+	return rc;
+}
+console_initcall(tile_virtio_console_init);
+
+/*
+ * We do this after core stuff, but before the drivers.
+ */
+postcore_initcall(kvm_devices_init);
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 44cdc4a..2629ff1 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -27,6 +27,7 @@
#include <linux/kernel.h>
#include <linux/tracehook.h>
#include <linux/signal.h>
+#include <linux/kvm_host.h>
#include <asm/stack.h>
#include <asm/switch_to.h>
#include <asm/homecache.h>
@@ -247,11 +248,13 @@ struct task_struct *validate_current(void)
/* Take and return the pointer to the previous task, for schedule_tail(). */
struct task_struct *sim_notify_fork(struct task_struct *prev)
{
+#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
struct task_struct *tsk = current;
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK_PARENT |
(tsk->thread.creator_pid << _SIM_CONTROL_OPERATOR_BITS));
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK |
(tsk->pid << _SIM_CONTROL_OPERATOR_BITS));
+#endif
return prev;
}

@@ -450,6 +453,11 @@ void _prepare_arch_switch(struct task_struct *next)
struct task_struct *__sched _switch_to(struct task_struct *prev,
struct task_struct *next)
{
+#ifdef CONFIG_KVM
+ /* vmexit is needed before context switch. */
+ BUG_ON(task_thread_info(prev)->vcpu);
+#endif
+
/* DMA state is already saved; save off other arch state. */
save_arch_state(&prev->thread);

@@ -519,6 +527,29 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
/* Enable interrupts; they are disabled again on return to caller. */
local_irq_enable();

+#ifdef CONFIG_KVM
+ /*
+ * Some work requires us to exit the VM first. Typically this
+ * allows the process running the VM to respond to the work
+ * (e.g. a signal), or allows the VM mechanism to latch
+ * modified host state (e.g. a "hypervisor" message sent to a
+ * different vcpu). It also means that if we are considering
+ * calling schedule(), we exit the VM first, so we never have
+ * to worry about context-switching into a VM.
+ */
+ if (current_thread_info()->vcpu) {
+ u32 do_exit = thread_info_flags &
+ (_TIF_NEED_RESCHED|_TIF_SIGPENDING|_TIF_VIRT_EXIT);
+
+ if (thread_info_flags & _TIF_VIRT_EXIT)
+ clear_thread_flag(TIF_VIRT_EXIT);
+ if (do_exit) {
+ kvm_trigger_vmexit(regs, KVM_EXIT_AGAIN);
+ /*NORETURN*/
+ }
+ }
+#endif
+
if (thread_info_flags & _TIF_NEED_RESCHED) {
schedule();
return 1;
@@ -538,11 +569,12 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
tracehook_notify_resume(regs);
return 1;
}
- if (thread_info_flags & _TIF_SINGLESTEP) {
+
+ /* Handle a few flags here that stay set. */
+ if (thread_info_flags & _TIF_SINGLESTEP)
single_step_once(regs);
- return 0;
- }
- panic("work_pending: bad flags %#x\n", thread_info_flags);
+
+ return 0;
}

unsigned long get_wchan(struct task_struct *p)
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
index 1c09a4f..02bc446 100644
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
addi sp, sp, -8
/* we now have a stack (whether we need one or not) */

+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r40, hw2_last(hv_console_putc)
shl16insli r40, r40, hw1(hv_console_putc)
shl16insli r40, r40, hw0(hv_console_putc)

-#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, 'r'
jalr r40

@@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel)

/* we should not get here */

+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, '?'
jalr r40
moveli r0, '\n'
jalr r40
+#endif

j .Lhalt

@@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel)
j .Lloop


-.Lerr: moveli r0, 'e'
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+ moveli r0, 'e'
jalr r40
moveli r0, 'r'
jalr r40
@@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel)
jalr r40
moveli r0, '\n'
jalr r40
+#endif
.Lhalt:
moveli r41, hw2_last(hv_halt)
shl16insli r41, r41, hw1(hv_halt)
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 774e819..2352a81 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -268,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
/*
* Determine for each controller where its lowmem is mapped and how much of
* it is mapped there. On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
* start our data mappings higher up, but for now we don't bother, to avoid
* additional confusion.
*
@@ -1074,7 +1074,20 @@ void __cpuinit setup_cpu(int boot)
* SPRs, as well as the interrupt mask.
*/
__insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
+
+#ifdef CONFIG_KVM
+ /*
+ * If we launch a guest kernel, it will need some interrupts
+ * that otherwise are not used by the host or by userspace.
+ * Set them to MPL 1 now and leave them alone going forward;
+ * they are masked in the host so will never fire there anyway,
+ * and we mask them at PL1 as we exit the guest.
+ */
__insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_SINGLE_STEP_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_AUX_TILE_TIMER_SET_1, 1);
+ __insn_mtspr(SPR_MPL_IPI_1_SET_1, 1);
+#endif

/* Initialize IRQ support for this cpu. */
setup_irq_regs();
@@ -1242,7 +1255,7 @@ static void __init validate_va(void)
#ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */
/*
* Similarly, make sure we're only using allowed VAs.
- * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+ * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
* and 0 .. KERNEL_HIGH_VADDR.
* In addition, make sure we CAN'T use the end of memory, since
* we use the last chunk of each pgd for the pgd_list.
@@ -1257,7 +1270,7 @@ static void __init validate_va(void)
if (range.size == 0)
break;
if (range.start <= MEM_USER_INTRPT &&
- range.start + range.size >= MEM_HV_INTRPT)
+ range.start + range.size >= MEM_HV_START)
user_kernel_ok = 1;
if (range.start == 0)
max_va = range.size;
@@ -1693,7 +1706,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
static int __init request_standard_resources(void)
{
int i;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };

#if defined(CONFIG_PCI) && !defined(__tilegx__)
insert_non_bus_resource();
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index 0ae1c59..62b3ba9 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -223,30 +223,34 @@ void __init ipi_init(void)

#if CHIP_HAS_IPI()

-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
{
- WARN_ON(cpu_is_offline(cpu));
-
/*
* We just want to do an MMIO store. The traditional writeq()
* functions aren't really correct here, since they're always
* directed at the PCI shim. For now, just do a raw store,
- * casting away the __iomem attribute.
+ * casting away the __iomem attribute. We do the store as a
+ * single asm() instruction to ensure that we can force a step
+ * over it in the KVM case, if we are not binding vcpus to cpus,
+ * rather than require it to be possible to issue validly.
*/
- ((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE] = 0;
+ unsigned long *addr =
+ &((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE];
+ asm volatile("st %0, zero" :: "r" (addr));
}

#else

-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
{
- HV_Coord coord;
-
- WARN_ON(cpu_is_offline(cpu));
-
- coord.y = cpu_y(cpu);
- coord.x = cpu_x(cpu);
+ HV_Coord coord = { .y = cpu_y(cpu), .x = cpu_x(cpu) };
hv_trigger_ipi(coord, IRQ_RESCHEDULE);
}

#endif /* CHIP_HAS_IPI() */
+
+void smp_send_reschedule(int cpu)
+{
+ WARN_ON(cpu_is_offline(cpu));
+ __smp_send_reschedule(cpu);
+}
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index 24fd223..362284a 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
p->sp >= sp) {
if (kbt->verbose)
pr_err(" <%s while in kernel mode>\n", fault);
- } else if (EX1_PL(p->ex1) == USER_PL &&
+ } else if (user_mode(p) &&
p->sp < PAGE_OFFSET && p->sp != 0) {
if (kbt->verbose)
pr_err(" <%s while in user mode>\n", fault);
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index e25b0a8..024b978 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -69,7 +69,11 @@ static ssize_t type_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
+#ifdef CONFIG_KVM_GUEST
+ return sprintf(page, "KVM\n");
+#else
return sprintf(page, "tilera\n");
+#endif
}
static DEVICE_ATTR(type, 0444, type_show, NULL);

diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index 3c2dc87..b0b7264 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -117,9 +117,9 @@ void __init time_init(void)

/*
* Define the tile timer clock event device. The timer is driven by
- * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
+ * the TILE_[AUX_]TIMER_CONTROL register, which consists of a 31-bit down
* counter, plus bit 31, which signifies that the counter has wrapped
- * from zero to (2**31) - 1. The INT_TILE_TIMER interrupt will be
+ * from zero to (2**31) - 1. The INT_[AUX_]TILE_TIMER interrupt will be
* raised as long as bit 31 is set.
*/

@@ -129,8 +129,8 @@ static int tile_timer_set_next_event(unsigned long ticks,
struct clock_event_device *evt)
{
BUG_ON(ticks > MAX_TICK);
- __insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks);
- arch_local_irq_unmask_now(INT_TILE_TIMER);
+ __insn_mtspr(SPR_LINUX_TIMER_CONTROL, ticks);
+ arch_local_irq_unmask_now(INT_LINUX_TIMER);
return 0;
}

@@ -141,7 +141,7 @@ static int tile_timer_set_next_event(unsigned long ticks,
static void tile_timer_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
- arch_local_irq_mask_now(INT_TILE_TIMER);
+ arch_local_irq_mask_now(INT_LINUX_TIMER);
}

static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
@@ -161,7 +161,7 @@ void __cpuinit setup_tile_timer(void)
evt->cpumask = cpumask_of(smp_processor_id());

/* Start out with timer not firing. */
- arch_local_irq_mask_now(INT_TILE_TIMER);
+ arch_local_irq_mask_now(INT_LINUX_TIMER);

/*
* Register tile timer. Set min_delta to 1 microsecond, since
@@ -181,7 +181,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num)
* Mask the timer interrupt here, since we are a oneshot timer
* and there are now by definition no events pending.
*/
- arch_local_irq_mask(INT_TILE_TIMER);
+ arch_local_irq_mask(INT_LINUX_TIMER);

/* Track time spent here in an interrupt context */
irq_enter();
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index f110785..19d465c 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -30,7 +30,7 @@

void __init trap_init(void)
{
- /* Nothing needed here since we link code at .intrpt1 */
+ /* Nothing needed here since we link code at .intrpt */
}

int unaligned_fixup = 1;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index c7ae53d..8b20163 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
#include <hv/hypervisor.h>

/* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START

OUTPUT_ARCH(tile)
ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;

PHDRS
{
- intrpt1 PT_LOAD ;
+ intrpt PT_LOAD ;
text PT_LOAD ;
data PT_LOAD ;
}
@@ -24,11 +24,11 @@ SECTIONS
#define LOAD_OFFSET TEXT_OFFSET

/* Interrupt vectors */
- .intrpt1 (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
+ .intrpt (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
{
_text = .;
- *(.intrpt1)
- } :intrpt1 =0
+ *(.intrpt)
+ } :intrpt =0

/* Hypervisor call vectors */
. = ALIGN(0x10000);
diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
index 2298cb1..65f7f9d 100644
--- a/arch/tile/kvm/Kconfig
+++ b/arch/tile/kvm/Kconfig
@@ -27,9 +27,6 @@ config KVM
This module provides access to the hardware capabilities through
a character device node named /dev/kvm.

- To compile this as a module, choose M here: the module
- will be called kvm.
-
If unsure, say N.

source drivers/vhost/Kconfig
diff --git a/arch/tile/kvm/Makefile b/arch/tile/kvm/Makefile
new file mode 100644
index 0000000..2c3d206
--- /dev/null
+++ b/arch/tile/kvm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+ccflags-y := -Ivirt/kvm -Iarch/tile/kvm
+
+kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o)
+
+kvm-y += kvm-tile.o
+kvm-y += entry.o
+
+obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/tile/kvm/entry.S b/arch/tile/kvm/entry.S
new file mode 100644
index 0000000..07aa3a6
--- /dev/null
+++ b/arch/tile/kvm/entry.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/switch_to.h>
+#include <asm/processor.h>
+#include <arch/spr_def.h>
+#include <arch/abi.h>
+
+#define FRAME_SIZE ((4 + CALLEE_SAVED_REGS_COUNT) * 8) /* bytes: callee-saved regs plus four more 8-byte slots */
+#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 } /* store r at [r12], then step r12 to the next slot */
+#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 } /* load r from [r12], then step r12 to the next slot */
+#define FOR_EACH_CALLEE_SAVED_REG(f) \
+ f(r30); f(r31); \
+ f(r32); f(r33); f(r34); f(r35); f(r36); f(r37); f(r38); f(r39); \
+ f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \
+ f(r48); f(r49); f(r50); f(r51); f(r52);
+
+/*
+ * Called with interrupts disabled from kvm_tile_run() and is responsible
+ * just for saving the callee-save registers and the stack pointer, then
+ * resetting ksp0 so subsequent interrupts don't wipe the kernel stack.
+ * It uses restore_all in intvec_64.S to jump back into the guest.
+ * The kvm_vmexit function below undoes the stack manipulation.
+ *
+ * Register use visible below: r0 is the base from which the new sp for
+ * restore_all is computed, and r1 is an address through which this
+ * frame's sp is stored.  (Presumably these are the two C arguments --
+ * confirm against the caller's prototype.)
+ */
+STD_ENTRY(kvm_vmresume)
+ /* Do function prolog and save callee-saves on stack. */
+ {
+ move r10, sp /* remember the caller's sp */
+ st sp, lr /* return address goes at the caller's [sp] */
+ }
+ {
+ addli r11, sp, -FRAME_SIZE + 8 /* r11 = second word of the new frame */
+ addli sp, sp, -FRAME_SIZE /* allocate the frame */
+ }
+ {
+ st r11, r10 /* caller's sp goes in the frame's second word */
+ addi r12, sp, 16 /* r12 = cursor for SAVE_REG, just past those two words */
+ }
+ FOR_EACH_CALLEE_SAVED_REG(SAVE_REG)
+ SAVE_REG(tp)
+ SAVE_REG(lr)
+
+ /*
+ * Save frame pointer in thread_info so we can get it back later.
+ * (The store goes through the pointer held in r1.)
+ */
+ st r1, sp
+
+ /* Set the ksp0 for this core to be below this frame. */
+ mfspr r10, SPR_SYSTEM_SAVE_K_0
+ bfins r10, sp, 0, CPU_SHIFT-1 /* replace only bits 0..CPU_SHIFT-1 with sp */
+ mtspr SPR_SYSTEM_SAVE_K_0, r10
+
+ /* sp points to ABI save area below pt_regs for restore_all. */
+ addli sp, r0, -C_ABI_SAVE_AREA_SIZE
+
+ /* Execute an "interrupt return" to the guest. */
+ {
+ movei r30, 0
+ j restore_all /* does not return here; kvm_vmexit unwinds this frame */
+ }
+ STD_ENDPROC(kvm_vmresume)
+
+/*
+ * Called with interrupts disabled from kvm_trigger_vmexit(); returns with
+ * interrupts still disabled to kvm_vmresume()'s caller, discarding all the
+ * stack contents below the kvm_vmresume() frame. kvm_vmresume()'s caller
+ * is responsible for resetting SPR_SYSTEM_SAVE_K_0 to its previous value.
+ *
+ * r0 must hold the sp value that kvm_vmresume() stored away, i.e. the
+ * base of the frame it built.
+ */
+STD_ENTRY(kvm_vmexit)
+ {
+ move sp, r0 /* switch back onto the kvm_vmresume() frame */
+ addi r12, r0, 16 /* r12 = cursor for LOAD_REG, same offset used when saving */
+ }
+ FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
+ LOAD_REG(tp)
+ LOAD_REG(lr)
+ {
+ addli sp, sp, FRAME_SIZE /* pop the frame */
+ jrp lr /* return through the lr that kvm_vmresume() saved */
+ }
+ STD_ENDPROC(kvm_vmexit)
diff --git a/arch/tile/kvm/kvm-tile.c b/arch/tile/kvm/kvm-tile.c
new file mode 100644
index 0000000..29b601a
--- /dev/null
+++ b/arch/tile/kvm/kvm-tile.c
@@ -0,0 +1,1585 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <asm/traps.h>
+#include <asm/pgalloc.h>
+#include <hv/hypervisor.h>
+#include <linux/rtc.h>
+#include <asm/atomic.h>
+#include <asm/tlbflush.h>
+#include <arch/spr_def.h>
+#include <arch/sim.h>
+#include <generated/utsrelease.h>
+
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { NULL } /* only entry: no statistics items are listed for this architecture */
+};
+
+/*
+ * Return the PTE slot for 'address' in this VM's private page table
+ * (kvm->arch.vpgd), allocating the top-level table on first use and any
+ * missing intermediate levels along the way.
+ *
+ * Returns NULL if any of the allocations fails.
+ */
+static pte_t *get_vpgd_pte(struct kvm *kvm, unsigned long address)
+{
+	struct mm_struct *mm = kvm->mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	if (kvm->arch.vpgd == NULL) {
+		kvm->arch.vpgd = pgd_alloc(mm);
+		/* pgd_alloc() can fail; never index off a NULL table. */
+		if (kvm->arch.vpgd == NULL)
+			return NULL;
+	}
+	pgd = kvm->arch.vpgd + pgd_index(address);
+	pud = pud_alloc(mm, pgd, address);
+	if (!pud)
+		return NULL;
+	pmd = pmd_alloc(mm, pud, address);
+	if (!pmd)
+		return NULL;
+	return pte_alloc_kernel(pmd, address);
+}
+
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS; /* unconditional: neither vcpu nor vmf is examined */
+}
+
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{ /* empty body: no arch-specific memslot state is released here */
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+ return 0; /* always succeeds; no arch-specific memslot state is set up here */
+}
+
+/*
+ * Make sure the VM's private page table has a PTE slot for every small
+ * page of the guest-physical range described by 'mem', allocating
+ * page-table levels as needed.
+ *
+ * Returns 0 on success, or -ENOMEM if a page-table allocation fails.
+ *
+ * FIXME: support huge pages.
+ */
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+				   struct kvm_memory_slot *memslot,
+				   struct kvm_userspace_memory_region *mem,
+				   enum kvm_mr_change change)
+{
+	unsigned long base = mem->guest_phys_addr;
+	unsigned long offset;
+
+	for (offset = 0; offset < mem->memory_size; offset += PAGE_SIZE) {
+		if (!get_vpgd_pte(kvm, base + offset))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill in the VM's private page table for the guest-physical range
+ * described by 'mem'.  For each small page the backing user page is
+ * pinned with get_user_pages_fast(), and the PTE that virt_to_pte()
+ * returns for that page's kernel address (__va of its physical address)
+ * is copied into the guest's PTE slot.
+ *
+ * A page that cannot be pinned is reported and left unmapped.
+ *
+ * NOTE(review): the reference taken by get_user_pages_fast() is not
+ * dropped in this function; presumably it is kept to pin guest memory
+ * for the life of the slot -- confirm where it is released.
+ */
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_memory_slot *old,
+				   enum kvm_mr_change change)
+{
+	unsigned long gpa, address, pfn, i;
+	struct page *page[1];
+	pte_t *ptep, *vptep;
+
+	gpa = mem->guest_phys_addr;
+	address = mem->userspace_addr;
+	for (i = 0; i < mem->memory_size;
+	     i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
+		vptep = get_vpgd_pte(kvm, gpa);
+		BUG_ON(vptep == NULL);
+
+		/*
+		 * page[0] is only valid when exactly one page was pinned;
+		 * on failure it is uninitialized and must not be used.
+		 */
+		if (get_user_pages_fast(address, 1, 1, page) != 1) {
+			pr_err("kvm: cannot pin user page %#lx for guest PA %#lx\n",
+			       address, gpa);
+			continue;
+		}
+		pfn = page_to_pfn(page[0]);
+		ptep = virt_to_pte(NULL, (unsigned long)__va(PFN_PHYS(pfn)));
+		*vptep = *ptep;
+	}
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{ /* empty body: nothing is flushed here */
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_arch_flush_shadow_all(kvm); /* 'slot' is ignored; defers to the whole-VM variant */
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ return 0; /* NOTE(review): yields 0 for every gfn, not gfn itself -- confirm no caller uses the result */
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ return 0; /* NOTE(review): reports success for every ioctl number, including unknown ones */
+}
+
+/*
+ * Post interrupt number 'irq' to 'vcpu': set the matching bit in
+ * vcpu->arch.ipi_events and kick the vcpu so that it notices.
+ *
+ * Returns 0 on success, or -EINVAL if 'irq' does not name a bit that
+ * lies inside ipi_events.
+ */
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, unsigned long irq)
+{
+	/*
+	 * 'irq' comes from userspace and is unsigned, so a "< 0" test can
+	 * never reject anything.  Bound it by the width of ipi_events so
+	 * that set_bit() cannot write outside that field.
+	 */
+	if (irq >= BITS_PER_BYTE * sizeof(vcpu->arch.ipi_events))
+		return -EINVAL;
+
+	set_bit(irq, &vcpu->arch.ipi_events);
+	kvm_vcpu_kick(vcpu);
+
+	return 0;
+}
+
+/*
+ * Per-vcpu arch ioctl dispatcher.
+ *
+ * KVM_INTERRUPT:      copy a struct kvm_interrupt from userspace and post
+ *                     that interrupt to the vcpu; -EFAULT if the copy
+ *                     fails, otherwise the result of posting it.
+ * KVM_TILE_RESET_SPR: reset the guest SPR shadow values; returns 0.
+ * anything else:      -EINVAL.
+ */
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *uarg = (void __user *)arg;
+	struct kvm_interrupt intr;
+
+	switch (ioctl) {
+	case KVM_INTERRUPT:
+		if (copy_from_user(&intr, uarg, sizeof(intr)))
+			return -EFAULT;
+		return kvm_vcpu_ioctl_interrupt(vcpu, intr.irq);
+
+	case KVM_TILE_RESET_SPR:
+		/* Initialize guest SPR values */
+		vcpu->arch.timer_control =
+			1UL << SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT;
+		vcpu->arch.vmexit_cycles = get_cycles();
+		vcpu->arch.INTERRUPT_MASK_1 = -1UL;
+		vcpu->arch.INTERRUPT_VECTOR_BASE_1 = 0xfd000000;
+		vcpu->arch.IPI_MASK_1 = -1UL;
+		return 0;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+ return 0; /* same answer for every 'ext': no extension is reported */
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log)
+{
+ return 0; /* stub: returns success without touching 'log' */
+}
+
+/* No VM-level arch ioctl is handled here: every request gets -EINVAL. */
+long kvm_arch_vm_ioctl(struct file *filp,
+		       unsigned int ioctl, unsigned long arg)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return 0; /* stub: returns success without filling in 'fpu' */
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return 0; /* stub: returns success without reading 'fpu' */
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long page_size;
+ unsigned long gva = tr->linear_address;
+ unsigned long gpgd_gpa, gpmd_gpa, gpte_gpa;
+ pud_t gpud;
+ pmd_t gpmd;
+ pte_t gpte;
+
+ /*
+ * Walk the guest's own page table, rooted at
+ * vcpu->arch.guest_context.page_table, to translate the guest
+ * virtual address in tr->linear_address.  Each level is fetched from
+ * guest memory with kvm_read_guest().  The outcome is reported in
+ * tr->valid (plus physical_address/writeable/usermode on success);
+ * the function's return value is always 0.
+ *
+ * Get guest pgd (aka pud for three-level tables).
+ */
+ gpgd_gpa = vcpu->arch.guest_context.page_table +
+ (sizeof(pgd_t) * pgd_index(gva));
+ if (kvm_read_guest(kvm, gpgd_gpa, &gpud, sizeof(pgd_t)) < 0)
+ goto fail;
+ if (!pud_present(gpud))
+ goto fail;
+
+ /* Get guest pmd. */
+ if (pud_huge_page(gpud)) {
+ /* FIXME: no super huge page support yet. */
+ if (pte_super(*(pte_t *)&gpud))
+ goto fail;
+ gpte = *(pte_t *)&gpud; /* the top-level entry itself is the leaf */
+ page_size = PGDIR_SIZE;
+ goto ok;
+ }
+ gpmd_gpa = (pud_ptfn(gpud) << HV_LOG2_PAGE_TABLE_ALIGN) +
+ (sizeof(pmd_t) * pmd_index(gva));
+ if (kvm_read_guest(kvm, gpmd_gpa, &gpmd, sizeof(pmd_t)) < 0)
+ goto fail;
+ if (!pmd_present(gpmd))
+ goto fail;
+
+ /* Get guest pte. */
+ if (pmd_huge_page(gpmd)) {
+ /* FIXME: no super huge page support yet. */
+ if (pte_super(*(pte_t *)&gpmd))
+ goto fail;
+ gpte = *(pte_t *)&gpmd; /* the pmd entry itself is the leaf */
+ page_size = PMD_SIZE;
+ goto ok;
+ }
+ gpte_gpa = (pmd_ptfn(gpmd) << HV_LOG2_PAGE_TABLE_ALIGN) +
+ (sizeof(pte_t) * pte_index(gva));
+ if (kvm_read_guest(kvm, gpte_gpa, &gpte, sizeof(pte_t)) < 0)
+ goto fail;
+ if (!pte_present(gpte))
+ goto fail;
+
+ page_size = PAGE_SIZE;
+
+ok:
+ tr->physical_address =
+ PFN_PHYS(pte_pfn(gpte)) + (gva & (page_size - 1)); /* page frame base plus offset within the page of that size */
+ tr->valid = 1;
+ tr->writeable = pte_write(gpte);
+ tr->usermode = pte_user(gpte);
+
+ return 0;
+
+fail:
+ tr->valid = 0; /* failure is signalled only here; callers must test tr->valid */
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ regs->regs = vcpu->arch.regs; /* whole-struct copy of the saved guest registers */
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu->arch.regs = regs->regs; /* whole-struct copy from the caller */
+ vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS; /* caller-supplied flags are overridden with this fixed value */
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ return 0; /* stub: returns success without filling in 'sregs' */
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ return 0; /* stub: returns success without reading 'sregs' */
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ return 0; /* stub: returns success without filling in 'mp_state' */
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ return 0; /* stub: returns success without reading 'mp_state' */
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ return 0; /* stub: returns success without reading 'dbg' */
+}
+
+/*
+ * panic_hv() will dump stack info of both guest os and host os, and set
+ * proper exit reason so that qemu can terminate the guest process.
+ *
+ * 'fmt' and the following arguments are a printf-style message, which is
+ * formatted into a 256-byte buffer (longer messages are truncated).
+ * Always returns 0 after setting run->exit_reason.
+ *
+ * FIXME: Probably KVM_EXIT_EXCEPTION? If using KVM_EXIT_EXCEPTION,
+ * current qemu process will "hang" (killable but Ctrl+C not working),
+ * so use KVM_EXIT_SHUTDOWN here temporarily.
+ */
+static int panic_hv(struct kvm_vcpu *vcpu, const char *fmt, ...)
+{
+	char panic_buf[256];
+	struct pt_regs *regs;
+	va_list ap;
+	int i;
+
+	va_start(ap, fmt);
+	vsnprintf(panic_buf, sizeof(panic_buf), fmt, ap);
+	va_end(ap);
+	pr_err("KVM guest panic (vcpu %d) - %s\n", vcpu->vcpu_id, panic_buf);
+
+	/*
+	 * Show guest os info.  The loop prints three columns per row:
+	 * r0-r16, r18-r34 and r36-r52.  That leaves r17 and r35, which
+	 * go on the following row together with tp.
+	 */
+	regs = &vcpu->arch.regs;
+	for (i = 0; i < 17; i++)
+		pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
+		       i, regs->regs[i], i+18, regs->regs[i+18],
+		       i+36, regs->regs[i+36]);
+	pr_err(" r17: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
+	       regs->regs[17], regs->regs[35], regs->tp);
+	pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
+	pr_err(" pc : "REGFMT" ex1: %ld faultnum: %ld\n",
+	       regs->pc, regs->ex1, regs->faultnum);
+
+	/* Show host os info */
+	pr_err("\nKVM stack in the host:\n");
+	dump_stack();
+
+	/* Shut down the guest os */
+	pr_err("Shutting down guest.\n");
+	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+	return 0;
+}
+
+/*
+ * Copied from virt/kvm/kvm_main.c
+ *
+ * Size of the next chunk to process: the remaining 'len', capped at the
+ * number of bytes left in the current page starting from 'offset'.
+ */
+static int next_segment(unsigned long len, int offset)
+{
+	unsigned long room = PAGE_SIZE - offset;
+
+	return len > room ? room : len;
+}
+
+/*
+ * Copy 'len' bytes starting at guest virtual address 'gva' into 'data'.
+ * The range is processed one small page at a time, translating each
+ * piece through the guest page table before reading it.
+ *
+ * Returns 0 on success, -EFAULT if a piece has no valid translation, or
+ * the negative value returned by kvm_read_guest_page().
+ */
+static int kvm_read_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+			     void *data, unsigned long len)
+{
+	int page_off = offset_in_page(gva);
+	int chunk, rc;
+
+	for (chunk = next_segment(len, page_off); chunk != 0;
+	     chunk = next_segment(len, page_off)) {
+		struct kvm_translation tr;
+
+		tr.linear_address = gva;
+		kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		if (!tr.valid)
+			return -EFAULT;
+
+		rc = kvm_read_guest_page(vcpu->kvm,
+					 PFN_DOWN(tr.physical_address),
+					 data, page_off, chunk);
+		if (rc < 0)
+			return rc;
+
+		/* Every piece after the first starts on a page boundary. */
+		page_off = 0;
+		len -= chunk;
+		data += chunk;
+		gva += chunk;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy 'len' bytes from 'data' to guest virtual address 'gva'.
+ * The range is processed one small page at a time, translating each
+ * piece through the guest page table before writing it.
+ *
+ * Returns 0 on success, -EFAULT if a piece has no valid translation, or
+ * the negative value returned by kvm_write_guest_page().
+ */
+static int kvm_write_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+			      const void *data, unsigned long len)
+{
+	int page_off = offset_in_page(gva);
+	int chunk, rc;
+
+	for (chunk = next_segment(len, page_off); chunk != 0;
+	     chunk = next_segment(len, page_off)) {
+		struct kvm_translation tr;
+
+		tr.linear_address = gva;
+		kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		if (!tr.valid)
+			return -EFAULT;
+
+		rc = kvm_write_guest_page(vcpu->kvm,
+					  PFN_DOWN(tr.physical_address),
+					  data, page_off, chunk);
+		if (rc < 0)
+			return rc;
+
+		/* Every piece after the first starts on a page boundary. */
+		page_off = 0;
+		len -= chunk;
+		data += chunk;
+		gva += chunk;
+	}
+
+	return 0;
+}
+
+/*
+ * Zero 'len' bytes of guest memory starting at guest virtual address
+ * 'gva'.  The range is processed one small page at a time, translating
+ * each piece through the guest page table before clearing it.
+ *
+ * Returns 0 on success, -EFAULT if a piece has no valid translation, or
+ * the negative value returned by kvm_clear_guest_page().
+ */
+static int kvm_clear_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+			      unsigned long len)
+{
+	int page_off = offset_in_page(gva);
+	int chunk, rc;
+
+	for (chunk = next_segment(len, page_off); chunk != 0;
+	     chunk = next_segment(len, page_off)) {
+		struct kvm_translation tr;
+
+		tr.linear_address = gva;
+		kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		if (!tr.valid)
+			return -EFAULT;
+
+		rc = kvm_clear_guest_page(vcpu->kvm,
+					  PFN_DOWN(tr.physical_address),
+					  page_off, chunk);
+		if (rc < 0)
+			return rc;
+
+		/* Every piece after the first starts on a page boundary. */
+		page_off = 0;
+		len -= chunk;
+		gva += chunk;
+	}
+
+	return 0;
+}
+
+/*
+ * The following functions are emulation functions for various
+ * hypervisor system calls (i.e. hv_*()). Return value:
+ * 1 if the host os can emulate it completely.
+ * < 0 if errors occur and then qemu will handle them.
+ * 0 if qemu emulation is needed.
+ * In both the < 0 and the == 0 cases, exit reason should
+ * be set for qemu handling.
+ */
+
+/*
+ * generic handler for hypercall which needs user (QEMU) to handle.
+ * Sets the exit reason to KVM_EXIT_HYPERCALL and returns 0.
+ */
+static int kvm_deliver_to_user(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ return 0;
+}
+
+/*
+ * handler for illegal hypercall: report the offending hypercall number
+ * (taken from guest r10) through panic_hv() and return its result.
+ */
+static int kvm_emulate_illegal(struct kvm_vcpu *vcpu)
+{
+	/* The number is unsigned; print it as such so large values read sanely. */
+	return panic_hv(vcpu, "Illegal kvm hypercall: %lu",
+			(unsigned long)vcpu->arch.regs.regs[10]);
+}
+
+/*
+ * Emulate hv_init(): check what the guest says it was built for (hv
+ * version, chip, chip revision and requested protection level, in guest
+ * r0-r3) against this host.  Any mismatch shuts the guest down through
+ * panic_hv(); otherwise returns 1.
+ */
+static int kvm_emulate_hv_init(struct kvm_vcpu *vcpu)
+{
+	const struct pt_regs *gregs = &vcpu->arch.regs;
+	int want_version = gregs->regs[0];
+	int want_chip = gregs->regs[1];
+	int want_chip_rev = gregs->regs[2];
+	int want_pl = gregs->regs[3];
+
+	/* The protection level is validated first, then the build ids. */
+	if (want_pl != 1) {
+		return panic_hv(vcpu, "Guest is requesting PL %d, but KVM"
+				" guests must request PL 1.\n"
+				"Reconfigure your guest with KVM_GUEST set.\n",
+				want_pl);
+	}
+
+	if (want_version != HV_VERSION) {
+		return panic_hv(vcpu, "Client built for hv version %d, but"
+				" this hv is version %d\n",
+				want_version, HV_VERSION);
+	}
+
+	if (want_chip != TILE_CHIP) {
+		return panic_hv(vcpu, "Client built for chip %d, but this"
+				" hardware is chip %d\n",
+				want_chip, TILE_CHIP);
+	}
+
+	if (want_chip_rev != TILE_CHIP_REV) {
+		return panic_hv(vcpu, "Client built for chip rev %d, but this"
+				" hardware is chip rev %d\n",
+				want_chip_rev, TILE_CHIP_REV);
+	}
+
+	return 1;
+}
+
+static int kvm_emulate_hv_sysconf(struct kvm_vcpu *vcpu)
+{
+ HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0]; /* query id arrives in guest r0 */
+ long rc;
+
+ switch (query) {
+ case HV_SYSCONF_PAGE_SIZE_SMALL:
+ rc = PAGE_SIZE; /* answered from the host kernel's own configuration */
+ break;
+
+ case HV_SYSCONF_PAGE_SIZE_LARGE:
+ rc = HPAGE_SIZE;
+ break;
+
+ case HV_SYSCONF_VALID_PAGE_SIZES:
+#if PAGE_SHIFT == 16
+ rc = HV_CTX_PG_SM_64K;
+#elif PAGE_SHIFT == 14
+ rc = HV_CTX_PG_SM_16K;
+#else
+# error Fix hv_sysconf emulation for new page size
+#endif
+ break;
+
+ case HV_SYSCONF_PAGE_SIZE_JUMBO:
+ rc = 0; /* FIXME add super page support */
+ break;
+
+ case HV_SYSCONF_CPU_SPEED:
+ case HV_SYSCONF_CPU_TEMP:
+ case HV_SYSCONF_BOARD_TEMP:
+ rc = hv_sysconf(query); /* forwarded unchanged to the real hypervisor */
+ break;
+
+ default:
+ rc = -EINVAL; /* NOTE(review): a Linux errno, while sibling handlers return HV_E* codes -- confirm intended */
+ break;
+ }
+
+ vcpu->arch.regs.regs[0] = rc; /* result goes back to the guest in r0 */
+ return 1;
+}
+
+/*
+ * Emulate hv_confstr(): guest r0 is the query, r1 the guest virtual
+ * address of the destination buffer and r2 its length.
+ *
+ * Hardware attributes are fetched from the real hypervisor; hypervisor
+ * version/config strings are synthesized.  At most 'buflen' bytes of the
+ * string are copied to the guest.  The value placed in guest r0 is the
+ * full string length including the NUL, or an HV_E* error code.
+ * Always returns 1.
+ *
+ * NOTE(review): 'query' is declared as HV_SysconfQuery although the case
+ * labels are HV_CONFSTR_* values -- confirm the intended type.
+ */
+static int kvm_emulate_hv_confstr(struct kvm_vcpu *vcpu)
+{
+	HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+	long buflen = vcpu->arch.regs.regs[2];
+	char hvbuf[256] = "";
+	const char *p;
+	long rc;
+
+	switch (query) {
+
+	/* For hardware attributes, just pass to the hypervisor. */
+	case HV_CONFSTR_BOARD_PART_NUM:
+	case HV_CONFSTR_BOARD_SERIAL_NUM:
+	case HV_CONFSTR_CHIP_SERIAL_NUM:
+	case HV_CONFSTR_BOARD_REV:
+	case HV_CONFSTR_CHIP_MODEL:
+	case HV_CONFSTR_BOARD_DESC:
+	case HV_CONFSTR_MEZZ_PART_NUM:
+	case HV_CONFSTR_MEZZ_SERIAL_NUM:
+	case HV_CONFSTR_MEZZ_REV:
+	case HV_CONFSTR_MEZZ_DESC:
+	case HV_CONFSTR_SWITCH_CONTROL:
+	case HV_CONFSTR_CHIP_REV:
+	case HV_CONFSTR_CPUMOD_PART_NUM:
+	case HV_CONFSTR_CPUMOD_SERIAL_NUM:
+	case HV_CONFSTR_CPUMOD_REV:
+	case HV_CONFSTR_CPUMOD_DESC:
+		rc = hv_confstr(query, (HV_VirtAddr)hvbuf, sizeof(hvbuf));
+		/* On error hvbuf holds no string; hand the code to the guest. */
+		if (rc < 0)
+			goto done;
+		if (rc > (long)sizeof(hvbuf)) {
+			/* Not the best answer, but very unlikely anyway. */
+			hvbuf[sizeof(hvbuf)-1] = '\0';
+		}
+		p = hvbuf;
+		break;
+
+	/* For hypervisor version info, just report the kernel version. */
+	case HV_CONFSTR_HV_SW_VER:
+		p = UTS_RELEASE;
+		break;
+	case HV_CONFSTR_HV_CONFIG:
+	case HV_CONFSTR_HV_CONFIG_VER:
+		p = "";
+		break;
+
+	default:
+		rc = HV_EINVAL;
+		goto done;
+	}
+
+	rc = strlen(p) + 1; /* include NUL */
+
+	/*
+	 * 'buflen' comes straight from a guest register.  A negative value
+	 * must not be passed on, since it would become a huge unsigned copy
+	 * length and read host memory far beyond the end of 'p'.
+	 */
+	if (buflen > 0 &&
+	    kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[1],
+			       p, min(rc, buflen)))
+		rc = HV_EFAULT;
+
+done:
+	vcpu->arch.regs.regs[0] = rc;
+	return 1;
+}
+
+static int kvm_emulate_hv_get_rtc(struct kvm_vcpu *vcpu)
+{
+ HV_RTCTime *hvtm = (HV_RTCTime *) &vcpu->arch.regs.regs[0]; /* result struct is built in place over the guest registers starting at r0 */
+ struct rtc_time tm;
+ struct timeval tv;
+
+ do_gettimeofday(&tv); /* the host's current time is what the guest sees */
+ rtc_time_to_tm(tv.tv_sec, &tm);
+ hvtm->tm_sec = tm.tm_sec;
+ hvtm->tm_min = tm.tm_min;
+ hvtm->tm_hour = tm.tm_hour;
+ hvtm->tm_mday = tm.tm_mday;
+ hvtm->tm_mon = tm.tm_mon;
+ hvtm->tm_year = tm.tm_year;
+ hvtm->flags = 0;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_set_rtc(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Do nothing here: the request is dropped with a warning in the host
+ * log, and no guest register is modified.
+ */
+ pr_warn("hv_set_rtc() will not work in kvm guest\n");
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_virtual(struct kvm_vcpu *vcpu)
+{
+ int idx = vcpu->arch.regs.regs[0]; /* read the index first: 'var' below overlays the same register */
+ HV_VirtAddrRange *var = (HV_VirtAddrRange *)&vcpu->arch.regs.regs[0];
+
+ switch (idx) {
+ case 0:
+ var->start = 0UL;
+ var->size = 0x20000000000UL;
+ break;
+ case 1:
+ var->start = 0xFFFFFFFF80000000UL;
+ var->size = 0x80000000UL;
+ break;
+ default:
+ var->start = 0UL; /* any other index reports an empty range */
+ var->size = 0UL;
+ break;
+ }
+
+ return 1;
+}
+
+/*
+ * Give all the ASIDs to the guest; we flush the whole TLB anyway.
+ * Index 0 reports the range min_asid..max_asid; any other index reports
+ * an empty range.  The result overlays the guest registers from r0.
+ */
+static int kvm_emulate_hv_inquire_asid(struct kvm_vcpu *vcpu)
+{
+ int idx = vcpu->arch.regs.regs[0]; /* read the index first: 'var' below overlays the same register */
+ HV_ASIDRange *var = (HV_ASIDRange *)&vcpu->arch.regs.regs[0];
+
+ if (idx == 0) {
+ var->start = min_asid;
+ var->size = max_asid - min_asid + 1;
+ } else {
+ var->start = 0;
+ var->size = 0;
+ }
+
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_topology(struct kvm_vcpu *vcpu)
+{
+ HV_Topology *tp;
+ int cpus;
+
+ /*
+ * Depends on the definition of struct HV_Topology: the result is
+ * written in place over the guest registers starting at r0.
+ * The vcpus are presented as a single row: width is the number of
+ * online vcpus, height is 1, and this vcpu sits at (vcpu_id, 0).
+ */
+ tp = (HV_Topology *)&vcpu->arch.regs.regs[0];
+
+ cpus = atomic_read(&vcpu->kvm->online_vcpus);
+ tp->coord.x = vcpu->vcpu_id;
+ tp->coord.y = 0;
+ tp->width = cpus;
+ tp->height = 1;
+
+ return 1;
+}
+
+/*
+ * Map a guest tile coordinate to a vcpu id.  Only row 0 exists and the
+ * column is the vcpu id, so (x, 0) maps to x when x is below the number
+ * of online vcpus.  Returns -1 for any other coordinate.
+ */
+static int xy_to_vcpu(struct kvm *kvm, int x, int y)
+{
+	bool in_range = y == 0 && x >= 0 &&
+			x < atomic_read(&kvm->online_vcpus);
+
+	return in_range ? x : -1;
+}
+
+/*
+ * The primary vcpu is the one that initially runs while the others
+ * all block. It is the only one that is allowed to call
+ * hv_start_all_tiles(). The other cpus are secondary.
+ * The primary is identified as vcpu_id 0.
+ */
+static bool is_secondary_vcpu(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_id != 0;
+}
+
+static int kvm_emulate_hv_start_all_tiles(struct kvm_vcpu *vcpu)
+{
+ struct completion *c = &vcpu->kvm->arch.smp_start;
+ if (is_secondary_vcpu(vcpu) || completion_done(c)) /* only vcpu 0 may call this, and only once */
+ return panic_hv(vcpu, "start_all_tiles() called again");
+ complete_all(c); /* wakes every waiter on smp_start */
+ return 1;
+}
+
+/*
+ * Emulate hv_physaddr_read64(): guest r0 is a guest physical address and
+ * r1 holds the access PTE.  The address is translated to a host physical
+ * address and read through the real hypervisor; the value read is
+ * returned to the guest in r0.
+ *
+ * Returns 1 on success; an untranslatable address shuts the guest down
+ * through panic_hv().
+ */
+static int kvm_emulate_hv_physaddr_read64(struct kvm_vcpu *vcpu)
+{
+	gpa_t gpa = vcpu->arch.regs.regs[0];
+	HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
+	gfn_t gfn;
+	pfn_t pfn;
+	hpa_t hpa;
+
+	gfn = gpa_to_gfn(gpa);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	/*
+	 * Also reject the "no memslot" pfn, which is_error_pfn() does not
+	 * cover; it must never be turned into a host physical address.
+	 */
+	if (is_error_noslot_pfn(pfn))
+		return panic_hv(vcpu, "bogus PA %llx in physaddr_read64()",
+				gpa);
+	hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+	vcpu->arch.regs.regs[0] = hv_physaddr_read64(hpa, *access);
+
+	/* gfn_to_pfn() took a reference on the page; drop it again. */
+	kvm_release_pfn_clean(pfn);
+
+	return 1;
+}
+
+/*
+ * Emulate hv_physaddr_write64(): guest r0 is a guest physical address,
+ * r1 holds the access PTE and r2 the value to store.  The address is
+ * translated to a host physical address and written through the real
+ * hypervisor.
+ *
+ * Returns 1 on success; an untranslatable address shuts the guest down
+ * through panic_hv().
+ */
+static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
+{
+	gpa_t gpa = vcpu->arch.regs.regs[0];
+	/*
+	 * The PTE is the *contents* of guest r1, so point at the saved
+	 * register.  The register value itself must never be treated as a
+	 * host pointer.
+	 */
+	HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
+	uint64_t val = vcpu->arch.regs.regs[2];
+	gfn_t gfn;
+	pfn_t pfn;
+	hpa_t hpa;
+
+	gfn = gpa_to_gfn(gpa);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	/*
+	 * Also reject the "no memslot" pfn, which is_error_pfn() does not
+	 * cover; it must never be turned into a host physical address.
+	 */
+	if (is_error_noslot_pfn(pfn))
+		return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+				gpa);
+	hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+	hv_physaddr_write64(hpa, *access, val);
+
+	/* gfn_to_pfn() took a reference on the page; it is now dirty. */
+	kvm_release_pfn_dirty(pfn);
+
+	return 1;
+}
+
+static int kvm_emulate_hv_register_message_state(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Do we care about the argument msgstate?
+ * For now it is ignored and HV_OK is always reported to the guest.
+ */
+ vcpu->arch.regs.regs[0] = HV_OK;
+
+ return 1;
+}
+
+/*
+ * Emulate hv_send_message(): guest r0 is the guest virtual address of an
+ * HV_Recipient array, r1 the number of entries, r2 the guest virtual
+ * address of the message and r3 its length.  The message must be a
+ * single int "tag"; delivery consists of setting that tag's bit in each
+ * target vcpu's pending_msgs and kicking the target.  The per-recipient
+ * state is written back to the guest array.
+ *
+ * The value placed in guest r0 is the number of array entries processed,
+ * or an HV_E* error code.  Always returns 1.
+ *
+ * NOTE: we may coalesce multiple messages with the same tag to the
+ * same recipient. Currently the only messages used by Linux are
+ * start/stop cpu (where coalescing is OK), and the smp_call_function()
+ * IPI message tag. In the latter case we rely on the generic
+ * smp_call_function code to properly handle this, and since it only
+ * uses the IPI as a way to wake up the generic list-walking code,
+ * it's OK if we coalesce several IPI deliveries before the recipient
+ * core takes action.
+ */
+static int kvm_emulate_hv_send_message(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu *vcpui;
+	HV_Recipient recip[NR_CPUS];
+	HV_Recipient *recips = (HV_Recipient *)vcpu->arch.regs.regs[0];
+	int nrecip = vcpu->arch.regs.regs[1];
+	int buflen = vcpu->arch.regs.regs[3];
+	int sent, vcpu_id, tag;
+
+	/*
+	 * NOTE: we only support the Linux usage of buflen == sizeof(int).
+	 *
+	 * 'nrecip' is guest-controlled.  It must be non-negative and fit
+	 * in recip[], otherwise the copy from the guest below would
+	 * overrun the on-stack array.
+	 */
+	if (unlikely(buflen != sizeof(int) ||
+		     nrecip < 0 || nrecip > ARRAY_SIZE(recip) ||
+		     nrecip >= atomic_read(&kvm->online_vcpus))) {
+		vcpu->arch.regs.regs[0] = HV_EINVAL;
+		return 1;
+	}
+
+	/* Get the buf info */
+	if (kvm_read_guest_va(vcpu, vcpu->arch.regs.regs[2],
+			      &tag, sizeof(tag))) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	/* Range-check the tag value. */
+	if (tag < 0 || tag >= MAX_MSG_TAG) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	/* Get all the recipients */
+	if (kvm_read_guest_va(vcpu, (unsigned long)recips, &recip,
+			      nrecip * sizeof(HV_Recipient))) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	for (sent = 0; sent < nrecip; sent++) {
+		if (recip[sent].state != HV_TO_BE_SENT)
+			continue;
+		vcpu_id = xy_to_vcpu(kvm, recip[sent].x, recip[sent].y);
+		/* A vcpu may not send to itself or to a nonexistent tile. */
+		if (unlikely(vcpu_id < 0 || vcpu_id == vcpu->vcpu_id)) {
+			recip[sent].state = HV_BAD_RECIP;
+			continue;
+		}
+		vcpui = kvm_get_vcpu(kvm, vcpu_id);
+		set_bit(tag, &vcpui->arch.pending_msgs);
+		kvm_vcpu_kick(vcpui);
+		recip[sent].state = HV_SENT;
+	}
+
+	if (kvm_write_guest_va(vcpu, (unsigned long)recips, &recip,
+			       nrecip * sizeof(HV_Recipient))) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	vcpu->arch.regs.regs[0] = sent;
+
+	return 1;
+}
+
+/*
+ * Emulate hv_receive_message(): guest r2 is the guest virtual address of
+ * the receive buffer and r3 its length.  If a message tag is pending for
+ * this vcpu, the lowest-numbered one is written to the buffer as an int
+ * and its pending bit is cleared.
+ *
+ * The HV_RcvMsgInfo result overlays the guest registers from r0:
+ * msglen is sizeof(int) on delivery, 0 if nothing is pending, or an
+ * HV_E* error code.  Always returns 1.
+ */
+static int kvm_emulate_hv_receive_message(struct kvm_vcpu *vcpu)
+{
+	HV_RcvMsgInfo *rmi = (HV_RcvMsgInfo *)&vcpu->arch.regs.regs[0];
+	int buflen = vcpu->arch.regs.regs[3];
+	int tag;
+
+	/* Currently we only support messages from other tiles. */
+	rmi->source = HV_MSG_TILE;
+
+	/*
+	 * The message is exactly one int, so a buffer of sizeof(int) bytes
+	 * is large enough.  Compare as signed so that a negative
+	 * guest-supplied length is rejected too.
+	 */
+	if (buflen < (int)sizeof(int)) {
+		rmi->msglen = HV_E2BIG;
+		return 1;
+	}
+
+	tag = find_first_bit(&vcpu->arch.pending_msgs, MAX_MSG_TAG);
+	if (tag >= MAX_MSG_TAG) {
+		/* No more messages */
+		rmi->msglen = 0;
+		return 1;
+	}
+
+	if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+			       &tag, sizeof(int))) {
+		rmi->msglen = HV_EFAULT;
+		return 1;
+	}
+
+	/*
+	 * This clear_bit could race with a set_bit as another core
+	 * delivers a new smp_function_call to this core. However,
+	 * the smp_function_call code will have set up the additional
+	 * smp_function_call data on the kernel's list prior to
+	 * raising the interrupt, so even if we lose the new
+	 * interrupt due to the race, we still haven't dispatched
+	 * to the original interrupt handler, and when we do, it
+	 * will find both smp_function_calls waiting for it, so the
+	 * race is harmless. This is consistent with the fact that
+	 * the generic code is trying to support pretty much
+	 * arbitrary architecture-dependent IPI semantics, so it
+	 * is very conservative about what it assumes.
+	 *
+	 * Also note that we only clear_bit on the core that owns
+	 * the mask, so there's no race condition caused by the
+	 * find_first_bit above and the clear_bit here, since once
+	 * a bit is found it will stay set until this point.
+	 */
+	clear_bit(tag, &vcpu->arch.pending_msgs);
+	rmi->msglen = sizeof(int);
+	return 1;
+}
+
+static int kvm_emulate_hv_inquire_context(struct kvm_vcpu *vcpu)
+{
+ HV_Context *ctx = (HV_Context *) &vcpu->arch.regs.regs[0]; /* result overlays the guest registers starting at r0 */
+
+ *ctx = hv_inquire_guest_context();
+
+ return 1;
+}
+
+/*
+ * Emulate hv_inquire_tiles(): guest r0 selects the tile set, r1 is the
+ * guest virtual address of the destination cpumask buffer and r2 its
+ * length in bytes.
+ *
+ * For the AVAIL, HFH_CACHE and LOTAR sets the mask has one bit per
+ * online vcpu; for SHARED it is empty.  Up to sizeof(struct cpumask)
+ * bytes are copied and the rest of the guest buffer is zeroed.
+ *
+ * The value placed in guest r0 is HV_OK, HV_EINVAL for an unknown set or
+ * a negative length, or HV_EFAULT if the guest buffer cannot be written.
+ * Always returns 1.
+ */
+static int kvm_emulate_hv_inquire_tiles(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	HV_InqTileSet set = vcpu->arch.regs.regs[0];
+	unsigned long gva = vcpu->arch.regs.regs[1];
+	int length = vcpu->arch.regs.regs[2];
+	struct cpumask mask = CPU_MASK_NONE;
+	int cpus, i, retval, bytes2copy, bytes2zero;
+
+	switch (set) {
+	case HV_INQ_TILES_AVAIL:
+	case HV_INQ_TILES_HFH_CACHE:
+	case HV_INQ_TILES_LOTAR:
+		cpus = atomic_read(&kvm->online_vcpus);
+		for (i = 0; i < cpus; ++i)
+			cpumask_set_cpu(i, &mask);
+		break;
+	case HV_INQ_TILES_SHARED:
+		break;
+	default:
+		retval = HV_EINVAL;
+		goto done;
+	}
+
+	/*
+	 * 'length' is guest-supplied.  A negative value would compare as a
+	 * huge unsigned number against sizeof(mask) and then make
+	 * bytes2zero negative, which wraps to an enormous clear length.
+	 */
+	if (length < 0) {
+		retval = HV_EINVAL;
+		goto done;
+	}
+
+	bytes2copy = (length > sizeof(mask)) ? sizeof(mask) : length;
+	bytes2zero = length - bytes2copy;
+
+	if (kvm_write_guest_va(vcpu, gva, &mask, bytes2copy)) {
+		retval = HV_EFAULT;
+		goto done;
+	}
+
+	if (kvm_clear_guest_va(vcpu, gva + bytes2copy, bytes2zero)) {
+		retval = HV_EFAULT;
+		goto done;
+	}
+
+	retval = HV_OK;
+done:
+	vcpu->arch.regs.regs[0] = retval;
+	return 1;
+}
+
+static int kvm_emulate_hv_get_ipi_pte(struct kvm_vcpu *vcpu)
+{
+ HV_Coord vtarget = *(HV_Coord *)&vcpu->arch.regs.regs[0]; /* target tile coordinate, passed by value in guest r0 */
+ int pl = (int) vcpu->arch.regs.regs[1];
+ struct kvm_vcpu *target_vcpu;
+ int vcpu_id;
+
+ vcpu_id = vtarget.x; /* the x coordinate is the vcpu id; only row 0 is accepted below */
+ if (pl != GUEST_PL || vtarget.y != 0 || vcpu_id < 0 ||
+ vcpu_id >= atomic_read(&vcpu->kvm->online_vcpus)) {
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+ return 1;
+ }
+
+ target_vcpu = kvm_get_vcpu(vcpu->kvm, vcpu_id);
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &target_vcpu->arch.ipi_gpte, sizeof(pte_t))) { /* copy the target's ipi_gpte to the guest address in r2 */
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ vcpu->arch.regs.regs[0] = HV_OK;
+
+ return 1;
+}
+
+struct kvm_vcpu *ipi_vcpu_lookup(struct kvm *kvm, unsigned long gpa)
+{
+ struct kvm_vcpu *vcpui;
+ unsigned long idx;
+
+ kvm_for_each_vcpu(idx, vcpui, kvm)
+ if (vcpui->arch.ipi_gpa == gpa) /* first vcpu whose ipi_gpa equals 'gpa' wins */
+ return vcpui;
+
+ return NULL; /* no vcpu has this address as its ipi_gpa */
+}
+
+/*
+ * Most page faults are downcalled from the hypervisor and handled
+ * directly by either the guest os or the host os. This function handles
+ * the remaining cases.
+ *
+ * The only case recognized here is a guest access whose physical address
+ * matches some vcpu's ipi_gpa: that vcpu gets IRQ_RESCHEDULE posted and
+ * is kicked.  Returns 1 if the fault was handled that way, 0 otherwise.
+ */
+static int handle_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_translation tr;
+ struct kvm_vcpu *ipi_vcpu;
+
+ tr.linear_address = (__u64) vcpu->arch.fault_addr;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return 0;
+
+ /* ipi PTE for rescheduling interrupt? */
+ ipi_vcpu = ipi_vcpu_lookup(kvm, tr.physical_address);
+ if (!ipi_vcpu)
+ return 0;
+
+ set_bit(IRQ_RESCHEDULE, &ipi_vcpu->arch.ipi_events);
+ kvm_vcpu_kick(ipi_vcpu);
+
+ /* Juke the PC past the store instruction. */
+ vcpu->arch.regs.pc += 8;
+ return 1;
+}
+
+static int kvm_emulate_hv_set_pte_super_shift(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We do not expect this call in guest so far. At least guest os
+ * should just follow host os instead of *set*. Besides,
+ * hv_set_pte_super_shift() will not be called in guest os with
+ * current guest os setting.
+ * So the request is always refused with HV_EINVAL.
+ */
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_set_speed(struct kvm_vcpu *vcpu)
+{
+ HV_SetSpeed *hvss = (HV_SetSpeed *) &vcpu->arch.regs.regs[0]; /* result overlays the guest registers starting at r0 */
+
+ hvss->new_speed = HV_EPERM; /* the request is always refused */
+ hvss->end_cycle = 0;
+ hvss->delta_ns = 0;
+
+ return 1;
+}
+
+static int (*hcall_handlers[KVM_NUM_HCALLS])(struct kvm_vcpu *vcpu) = {
+ HCALL_DEFS /* table contents come from this macro, which is defined outside this file section */
+};
+
+static int kvm_handle_exit(struct kvm_vcpu *vcpu)
+{
+ unsigned long hcall_idx;
+
+ switch (vcpu->run->exit_reason) { /* dispatch on the reason recorded for this guest exit */
+ case KVM_EXIT_HYPERCALL:
+ hcall_idx = vcpu->arch.regs.regs[10]; /* hypercall number is taken from guest r10 */
+ if (unlikely(hcall_idx >= KVM_NUM_HCALLS ||
+ hcall_handlers[hcall_idx] == NULL))
+ return kvm_emulate_illegal(vcpu);
+
+ /* Juke us past the swint0 when we return. */
+ vcpu->arch.regs.pc += 8;
+
+ return hcall_handlers[hcall_idx](vcpu);
+
+ case KVM_EXIT_MMIO:
+ if (handle_mmio(vcpu))
+ return 1;
+ return panic_hv(vcpu, "Out-of-bounds client memory access");
+
+ case KVM_EXIT_AGAIN:
+ return 1;
+
+ default:
+ return 0; /* any other exit reason is left untouched and 0 is returned */
+ }
+}
+
+/*
+ * Cross-cpu callback used by kvm_vcpu_kick(); 'info' is the vcpu being
+ * kicked.  If the task currently running on this cpu is that vcpu's
+ * thread, flag it so that it takes a vmexit.
+ */
+static void kvm_kick_func(void *info)
+{
+	struct kvm_vcpu *vcpu = info;
+
+	/*
+	 * If this is not the thread that we expect, just return.
+	 * Use task_pid(), which does not take a reference: only the
+	 * pointer is compared, and get_task_pid() would leak a struct pid
+	 * reference on every kick.
+	 */
+	if (unlikely(vcpu->pid != task_pid(current)))
+		return;
+
+	/* Setting this flag will cause a vmexit instead of a vmresume. */
+	set_thread_flag(TIF_VIRT_EXIT);
+}
+
+/*
+ * Note this function has been a standard kvm interface in latest Linux.
+ *
+ * Make 'vcpu' notice newly posted work: wake it if it is blocked, flag
+ * the current thread if it is that vcpu, and otherwise interrupt the cpu
+ * it was last recorded on.
+ */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+ int me, cpu;
+
+ /* If it is waiting in kvm_vcpu_block(), wake it up. */
+ if (waitqueue_active(&vcpu->wq))
+ wake_up_interruptible(&vcpu->wq);
+
+ /* If we are kicking our own vcpu, make sure we vmexit. */
+ if (vcpu == current_thread_info()->vcpu) {
+ set_thread_flag(TIF_VIRT_EXIT);
+ return;
+ }
+
+ /*
+ * If the vcpu is running the guest, interrupt its cpu,
+ * causing it to vmexit by setting TIF_VIRT_EXIT. Note we can
+ * race with a guest already doing a vmexit, but that is benign.
+ */
+ cpu = vcpu->cpu;
+ me = get_cpu();
+ if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) /* skip the IPI if a kick request is already flagged */
+ smp_call_function_single(cpu, kvm_kick_func, vcpu, 0); /* last argument 0: do not wait for the callback */
+ put_cpu();
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
+
+/*
+ * Any interrupt that would normally be handled by the host at PL2
+ * needs to be reassigned to the guest at PL1 as we enter.
+ *
+ * The TLB interrupts remain handled by the hypervisor and are downcalled
+ * to the appropriate host or guest as necessary.
+ *
+ * FIXME: We don't give the UDN interrupts for now; at some point we
+ * plan to allow an option to pin the vcpus and report the true
+ * geometry to the guest, at which point passing the UDN access would
+ * make sense.
+ *
+ * FIXME: For now we don't pass the profiling interrupts to the guest,
+ * and instead require profiling be run in the host; we should be able
+ * to support guest-level profiling pretty easily, but we need to
+ * think about whether there are vcpu migration issues there.
+ *
+ * The interrupts granted here are SWINT_1, ILL, GPV, ILL_TRANS and
+ * UNALIGN_DATA; kvm_ungrant_mpls() must cover exactly the same set.
+ */
+static void kvm_grant_mpls(void)
+{
+ __insn_mtspr(SPR_MPL_SWINT_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_ILL_SET_1, 1);
+ __insn_mtspr(SPR_MPL_GPV_SET_1, 1);
+ __insn_mtspr(SPR_MPL_ILL_TRANS_SET_1, 1);
+ __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_1, 1);
+}
+
+/*
+ * Counterpart of kvm_grant_mpls(), run after the guest has exited:
+ * writes the _SET_2 register of each of the five interrupts that
+ * kvm_grant_mpls() handed to the guest via their _SET_1 registers.
+ */
+static void kvm_ungrant_mpls(void)
+{
+ __insn_mtspr(SPR_MPL_SWINT_1_SET_2, 1);
+ __insn_mtspr(SPR_MPL_ILL_SET_2, 1);
+ __insn_mtspr(SPR_MPL_GPV_SET_2, 1);
+ __insn_mtspr(SPR_MPL_ILL_TRANS_SET_2, 1);
+ __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_2, 1);
+}
+
+/*
+ * There is lots of state that is (for the non-virtualized case) held
+ * permanently in SPRs, or that is in any case not context-switched.
+ * The next two routines switch in and out all the SPR state.
+ *
+ * We record the timer control value and the cycle count at vmexit so
+ * that, when the guest is restarted, the timer can be fixed up to fire
+ * at the correct wall-clock time even if we have been scheduled out
+ * for a little bit.  This may also mean we end up firing it immediately
+ * on return, and suffer a timer delay in the guest.
+ */
+static void kvm_save_sprs(struct kvm_vcpu *vcpu)
+{
+ /* Snapshot the aux timer together with "now" for kvm_restore_sprs(). */
+ vcpu->arch.timer_control = __insn_mfspr(SPR_AUX_TILE_TIMER_CONTROL);
+ vcpu->arch.vmexit_cycles = get_cycles();
+
+ /* Copy every SPR named by FOR_EACH_GUEST_SPR into vcpu->arch. */
+#define SAVE_SPR(x) vcpu->arch.x = __insn_mfspr(SPR_ ## x)
+ FOR_EACH_GUEST_SPR(SAVE_SPR);
+#undef SAVE_SPR
+}
+
+/*
+ * Reload the SPR state captured by kvm_save_sprs().
+ *
+ * Unless the saved timer was disabled, its count is reduced by the
+ * number of cycles elapsed since vmexit; if more cycles elapsed than
+ * were left on the timer, the underflow bit is set in the value that
+ * is written back.
+ */
+static void kvm_restore_sprs(struct kvm_vcpu *vcpu)
+{
+	unsigned long ctl = vcpu->arch.timer_control;
+
+	if (((ctl >> SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT) & 1) == 0) {
+		unsigned long fired =
+			(ctl >> SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT) & 1;
+		unsigned long elapsed =
+			get_cycles() - vcpu->arch.vmexit_cycles;
+		unsigned long left =
+			ctl & SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+
+		/* The timer would have expired while we were out. */
+		if (elapsed > left)
+			fired = 1;
+
+		left = (left - elapsed) &
+			SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+		ctl = left |
+			(fired << SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT);
+	}
+	__insn_mtspr(SPR_AUX_TILE_TIMER_CONTROL, ctl);
+
+#define RESTORE_SPR(x) __insn_mtspr(SPR_ ## x, vcpu->arch.x)
+	FOR_EACH_GUEST_SPR(RESTORE_SPR);
+#undef RESTORE_SPR
+}
+
+/*
+ * When entering the guest, we need to eliminate any PL0 translations
+ * that were in use by qemu, since the guest's PL0 translations will
+ * be different.  We also flush PL1 translations in case there have
+ * been changes to the virtualization page table, etc.
+ *
+ * Installs the VM's virtualization page table first, then the vcpu's
+ * saved guest context, and finally flushes the whole TLB.  Hypervisor
+ * failures are only warned about (once); the function still proceeds.
+ *
+ * FIXME: Add a way to just flush PL0/PL1, or just flush below
+ * the host PAGE_OFFSET, or add vpid support, etc.
+ */
+static void kvm_guest_context_enter(struct kvm_vcpu *vcpu)
+{
+ HV_Context *ctx;
+ pgd_t *vpgdir;
+ pte_t *ptep;
+ int rc;
+
+ /* Install virtualization context */
+ vpgdir = vcpu->kvm->arch.vpgd;
+ BUG_ON(vpgdir == NULL);
+ /*
+  * NOTE(review): ptep is dereferenced without a NULL check;
+  * presumably virt_to_pte() cannot fail for this address - confirm.
+  */
+ ptep = virt_to_pte(NULL, (unsigned long)vpgdir);
+ rc = hv_install_virt_context(__pa(vpgdir), *ptep, 0, 0);
+ WARN_ON_ONCE(rc < 0);
+
+ /* Install guest context */
+ ctx = &vcpu->arch.guest_context;
+ rc = hv_install_guest_context(ctx->page_table, ctx->access,
+ ctx->asid, ctx->flags);
+ WARN_ONCE(rc < 0, "install_guest_context(%#llx,%#llx,%#x,%#x): %d\n",
+ ctx->page_table, ctx->access.val,
+ ctx->asid, ctx->flags, rc);
+
+ hv_flush_all(0);
+}
+
+/*
+ * De-install the virtualization context so we take faults below the
+ * host Linux PL in the normal manner going forward.
+ *
+ * We flush all the TLB mappings as we exit the guest, since the
+ * guest has been using the ASIDs as it pleases, and may have installed
+ * incompatible mappings for qemu's process as well.  Note that we don't
+ * worry about host-PL interrupts that occur while the guest is running,
+ * on the assumption that such interrupts can't touch userspace
+ * addresses legally anyway.
+ *
+ * NOTE: we may want to add a hypervisor call to just flush mappings
+ * below PL2 and use that here instead.
+ */
+static void kvm_guest_context_exit(struct kvm_vcpu *vcpu)
+{
+ int rc;
+
+ /*
+  * Remember guest context, so that kvm_guest_context_enter() can
+  * reinstall it on the next entry.
+  */
+ vcpu->arch.guest_context = hv_inquire_guest_context();
+
+ /* Disable virtualization context */
+ rc = hv_install_virt_context(HV_CTX_NONE, hv_pte(0), 0, 0);
+ WARN_ON_ONCE(rc < 0);
+
+ /* Flush everything in the TLB. */
+ hv_flush_all(0);
+}
+
+/*
+ * Prepare the guest's interrupt state before entry: fold any posted
+ * ipi_events into the saved IPI_EVENT_1 value and set the saved
+ * INTCTRL_1_STATUS according to whether pending_msgs is non-zero.
+ * Only vcpu->arch fields are written here; presumably they reach the
+ * hardware through kvm_restore_sprs() - confirm that both names are
+ * covered by FOR_EACH_GUEST_SPR.
+ */
+static void kvm_inject_interrupts(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Capture current set of ipi_events. We might race with
+ * another thread adding an event, but if so we'll just miss
+ * it on this go-around and see it next time.
+ */
+ vcpu->arch.IPI_EVENT_1 |= __insn_exch(&vcpu->arch.ipi_events, 0);
+
+ /*
+ * Note: We could set PC and EX1 for the guest os to jump
+ * directly to the INT_MESSAGE_RCV_DWNCL handler if the interrupt
+ * is unmasked and the guest is not at PL1 with ICS set.
+ * But in fact it's about as fast to just set INTCTRL_1_STATUS
+ * here and then run the short INTCTRL_1 handler in the guest.
+ */
+ vcpu->arch.INTCTRL_1_STATUS = (vcpu->arch.pending_msgs != 0);
+}
+
+/*
+ * Enter the guest once and return after the next vmexit.
+ *
+ * Interrupts are disabled for the whole setup and teardown sequence.
+ * The host's SPR_SYSTEM_SAVE_K_0 value is read before entry and written
+ * back as soon as control returns from the guest.
+ */
+static void kvm_tile_run(struct kvm_vcpu *vcpu)
+{
+ struct thread_info *ti = current_thread_info();
+ unsigned long prev_k_0 = __insn_mfspr(SPR_SYSTEM_SAVE_K_0);
+
+ /*
+ * Disable interrupts while we set up the guest state.
+ * This way, if we race with another core trying to tell us
+ * to fix up our guest state, we will take the kick only as
+ * we actually try to enter the guest, at which point we will
+ * vmexit instead and end up retrying.
+ */
+ local_irq_disable();
+ kvm_guest_context_enter(vcpu);
+ /* From here on kvm_vcpu_kick() may send us an IPI again. */
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ ti->vcpu = vcpu;
+ /* get_cpu() is paired with the put_cpu() after the vmexit below. */
+ vcpu->cpu = get_cpu();
+ kvm_inject_interrupts(vcpu);
+ kvm_grant_mpls();
+ kvm_restore_sprs(vcpu);
+
+ /* Calling this function irets into the guest. */
+ kvm_vmresume(&vcpu->arch.regs, &vcpu->arch.host_sp);
+
+ /* We resume here due to a call to kvm_vmexit. */
+ __insn_mtspr(SPR_SYSTEM_SAVE_K_0, prev_k_0);
+
+ vcpu->cpu = -1;
+ put_cpu();
+ ti->vcpu = NULL;
+ /* Not in the guest any more: suppress kick IPIs until next entry. */
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
+ vcpu->run->ready_for_interrupt_injection = 1;
+ kvm_ungrant_mpls();
+ kvm_save_sprs(vcpu);
+ /*
+  * Written after the guest value has been saved above; presumably
+  * all-ones masks every PL1 interrupt while in the host - confirm.
+  */
+ __insn_mtspr(SPR_INTERRUPT_MASK_1, -1UL);
+ kvm_guest_context_exit(vcpu);
+ local_irq_enable();
+}
+
+/*
+ * Keep running the guest for as long as its exits can be handled here.
+ *
+ * Returns the first non-positive result of kvm_handle_exit(), or
+ * -EINTR with exit_reason set to KVM_EXIT_INTR when a signal is pending
+ * or (under CONFIG_HOMECACHE) the task's homecache cpu no longer
+ * matches the cpu it is running on.
+ */
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int rc;
+
+	for (;;) {
+		kvm_guest_enter();
+		kvm_tile_run(vcpu);
+		kvm_guest_exit();
+
+		/*
+		 * <0: error for userspace.
+		 * =0: QEMU to handle.
+		 * >0: host os can handle it fully.
+		 */
+		rc = kvm_handle_exit(vcpu);
+		if (rc <= 0)
+			return rc;
+
+		if (signal_pending(current))
+			break;
+
+#ifdef CONFIG_HOMECACHE
+		/* Do homecache migration when returning to qemu. */
+		if (current_thread_info()->homecache_cpu !=
+		    smp_processor_id())
+			break;
+#endif
+
+		kvm_resched(vcpu);
+	}
+
+	/* Both early-out conditions above report an interrupted run. */
+	vcpu->run->exit_reason = KVM_EXIT_INTR;
+	return -EINTR;
+}
+
+/*
+ * Arch handler for running a vcpu.
+ *
+ * A suspended vcpu first waits (interruptibly) on the VM's smp_start
+ * completion and returns -EINTR if that wait is interrupted.  If the
+ * vcpu has a signal mask configured, it is applied around the run and
+ * the previous mask is restored afterwards.  Otherwise returns the
+ * result of __vcpu_run().
+ */
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	sigset_t saved_mask;
+	int rc;
+
+	/* Secondary cpus must wait until they are told they can start. */
+	if (vcpu->arch.suspended) {
+		if (wait_for_completion_interruptible(
+				&vcpu->kvm->arch.smp_start) != 0)
+			return -EINTR;
+		vcpu->arch.suspended = 0;
+	}
+
+	if (!vcpu->sigset_active)
+		return __vcpu_run(vcpu, kvm_run);
+
+	sigprocmask(SIG_SETMASK, &vcpu->sigset, &saved_mask);
+	rc = __vcpu_run(vcpu, kvm_run);
+	sigprocmask(SIG_SETMASK, &saved_mask, NULL);
+
+	return rc;
+}
+
+/* Arch-wide init hook: nothing to set up here, so always returns 0. */
+int kvm_arch_init(void *opaque)
+{
+ return 0;
+}
+
+/* Arch-wide teardown hook: kvm_arch_init() set nothing up, so no-op. */
+void kvm_arch_exit(void)
+{
+}
+
+/*
+ * Arch part of vcpu initialization; always returns 0.
+ *
+ * While the VM's resv_gpa_start is still zero, it is set to the guest
+ * physical address just past the highest-ending populated user memslot.
+ * The vcpu is then given the page at index vcpu_id within that reserved
+ * range (ipi_gpa / ipi_gpte), starts with no guest page table, and is
+ * marked suspended unless it is the boot vcpu.
+ */
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	if (kvm->arch.resv_gpa_start == 0) {
+		unsigned long top_gfn = 0;
+		int slot;
+
+		/* Find where the highest populated memslot ends. */
+		for (slot = 0; slot < KVM_USER_MEM_SLOTS; slot++) {
+			struct kvm_memory_slot *ms =
+				&kvm->memslots->memslots[slot];
+
+			if (ms->npages != 0 &&
+			    (ms->base_gfn + ms->npages) > top_gfn)
+				top_gfn = ms->base_gfn + ms->npages;
+		}
+
+		kvm->arch.resv_gpa_start = PFN_PHYS(top_gfn);
+	}
+
+	/* Initialize to enter fake PA=VA mode in hypervisor. */
+	vcpu->arch.guest_context.page_table = HV_CTX_NONE;
+
+	/* One page per vcpu, selected by vcpu_id, in the reserved range. */
+	vcpu->arch.ipi_gpa = kvm->arch.resv_gpa_start +
+			     (vcpu->vcpu_id * PAGE_SIZE);
+	vcpu->arch.ipi_gpte = pfn_pte(PFN_DOWN(vcpu->arch.ipi_gpa),
+				      PAGE_KERNEL);
+
+	/* Mark the core suspended if it is not the boot cpu. */
+	vcpu->arch.suspended = is_secondary_vcpu(vcpu);
+
+	return 0;
+}
+
+/*
+ * Counterpart of kvm_arch_vcpu_init(); that routine only assigns
+ * fields, so there is nothing to release here.
+ */
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
+/*
+ * Arch hook for loading a vcpu onto a cpu.  The only work done is the
+ * simulator notification below; the "cpu" argument is unused.
+ */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /* Notify simulator that this task handles this vcpu. */
+ sim_set_vcpu(vcpu->vcpu_id);
+}
+
+/* Counterpart of kvm_arch_vcpu_load(): clear the simulator's vcpu note. */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ sim_clear_vcpu();
+}
+
+/*
+ * Allocate a zeroed struct kvm_vcpu and run kvm_vcpu_init() on it.
+ *
+ * Returns the new vcpu on success, ERR_PTR(-ENOMEM) if the allocation
+ * fails, or ERR_PTR() of the kvm_vcpu_init() error code (in which case
+ * the allocation is freed again).
+ */
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvm_vcpu *vcpu;
+	int err;
+
+	/* FIXME: some archs set up a cache for these structs? */
+	vcpu = kzalloc(sizeof(*vcpu), GFP_KERNEL);
+	if (vcpu == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err == 0)
+		return vcpu;
+
+	kfree(vcpu);
+	return ERR_PTR(err);
+}
+
+/* Start the vcpu with an all-zero guest register set; always returns 0. */
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct pt_regs *guest_regs = &vcpu->arch.regs;
+
+	memset(guest_regs, 0, sizeof(*guest_regs));
+	return 0;
+}
+
+/* No arch work is needed after vcpu creation; always returns 0. */
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+/*
+ * Undo kvm_arch_vcpu_create(): run kvm_vcpu_uninit() and then free the
+ * memory that was obtained there with kzalloc().
+ */
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_vcpu_uninit(vcpu);
+ kfree(vcpu);
+}
+
+/*
+ * Free a vcpu; simply forwards to kvm_arch_vcpu_destroy().
+ *
+ * The call is a plain statement rather than "return f();": a return
+ * statement with an expression is not allowed in a function returning
+ * void in ISO C.
+ */
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	kvm_arch_vcpu_destroy(vcpu);
+}
+
+/* No hardware enable step is performed; the argument is ignored. */
+int kvm_arch_hardware_enable(void *garbage)
+{
+ return 0;
+}
+
+/* Nothing was enabled by kvm_arch_hardware_enable(), so nothing to undo. */
+void kvm_arch_hardware_disable(void *garbage)
+{
+}
+
+/* No one-time hardware setup is performed; always returns 0. */
+int kvm_arch_hardware_setup(void)
+{
+ return 0;
+}
+
+/* Counterpart of kvm_arch_hardware_setup(); intentionally empty. */
+void kvm_arch_hardware_unsetup(void)
+{
+}
+
+/*
+ * No compatibility check is performed and *rtn is left untouched.
+ * NOTE(review): presumably the caller pre-initializes the result to
+ * "compatible" - confirm against the generic KVM caller.
+ */
+void kvm_arch_check_processor_compat(void *rtn)
+{
+}
+
+/*
+ * Stub: always returns 0.
+ * NOTE(review): presumably generic KVM uses this to decide whether a
+ * blocked vcpu has work to do - confirm that 0 is intended here.
+ */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+/*
+ * Arch part of VM creation.  Only type 0 is accepted; any other value
+ * is rejected with -EINVAL.  On success the smp_start completion, which
+ * suspended vcpus wait on in kvm_arch_vcpu_ioctl_run(), is initialized
+ * and 0 is returned.
+ */
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+	if (type == 0) {
+		init_completion(&kvm->arch.smp_start);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Tear down the arch state of a VM: free every vcpu, clear the vcpu
+ * table and online count under kvm->lock, and free the virtualization
+ * page directory if one was allocated.
+ */
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ /* Seems to be unnecessary? */
+ /* The vcpus were freed above; drop the now-dangling pointers. */
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
+
+ /* FIXME: release all the pmds and ptes as well! */
+ if (kvm->arch.vpgd)
+ pgd_free(kvm->mm, kvm->arch.vpgd);
+}
+
+/* Stub: there are no arch events to synchronize. */
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
+/* Stub: never reports a pending timer (always returns 0). */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+/*
+ * Called from guest hv glue via swint0 traps.
+ *
+ * A trap taken at a non-zero PL becomes a KVM_EXIT_HYPERCALL vmexit and
+ * does not return here; a trap taken at PL 0 is passed to do_trap().
+ */
+void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num)
+{
+ /* Hypercalls are only valid from PL1. */
+ if (EX1_PL(regs->ex1) != 0) {
+ kvm_trigger_vmexit(regs, KVM_EXIT_HYPERCALL);
+ /*NORETURN*/
+ }
+ do_trap(regs, fault_num, 0);
+}
+
+/*
+ * Handle a miss in the virtualization page table: record the faulting
+ * address in the vcpu and vmexit with KVM_EXIT_MMIO, so that
+ * kvm_handle_exit() can try handle_mmio().  Must be called with a vcpu
+ * attached to the current thread.  fault_num and write are not used.
+ */
+void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+ unsigned long fault_addr, unsigned long write)
+{
+ struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+ BUG_ON(vcpu == NULL);
+ vcpu->arch.fault_addr = fault_addr;
+ kvm_trigger_vmexit(regs, KVM_EXIT_MMIO);
+ /*NORETURN*/
+}
+
+/*
+ * Fatal guest fault: vmexit with KVM_EXIT_SHUTDOWN.  kvm_handle_exit()
+ * has no case for that reason, so it returns 0 and the exit goes to
+ * userspace.  fault_num is not used.
+ */
+void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num)
+{
+ kvm_trigger_vmexit(regs, KVM_EXIT_SHUTDOWN);
+ /*NORETURN*/
+}
+
+/*
+ * Leave the guest: store exit_reason in the shared run structure, copy
+ * the trapped register state into vcpu->arch.regs (with the flags
+ * forced to CALLER_SAVES | RESTORE_REGS), and switch back to the saved
+ * host stack pointer via kvm_vmexit().  Does not return to the caller.
+ *
+ * NOTE(review): current_thread_info()->vcpu is dereferenced without a
+ * NULL check, unlike in kvm_do_vpgtable_miss() - callers are presumably
+ * only reached while a vcpu is attached; confirm.
+ */
+void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason)
+{
+ struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->arch.regs = *regs;
+ vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+ kvm_vmexit(vcpu->arch.host_sp);
+ /*NORETURN*/
+}
+
+/*
+ * Module init: call kvm_init() with no opaque data, passing the size
+ * and alignment of struct kvm_vcpu, and return its result.
+ */
+static int __init kvm_tile_init(void)
+{
+ return kvm_init(NULL, sizeof(struct kvm_vcpu),
+ __alignof__(struct kvm_vcpu), THIS_MODULE);
+}
+
+/* Module exit: undo kvm_tile_init() by calling kvm_exit(). */
+static void __exit kvm_tile_exit(void)
+{
+ kvm_exit();
+}
+
+module_init(kvm_tile_init);
+module_exit(kvm_tile_exit);
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 82733c8..1590282 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -50,18 +50,26 @@ EXPORT_SYMBOL(__copy_in_user_inatomic);

/* hypervisor glue */
#include <hv/hypervisor.h>
+EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_dev_close);
EXPORT_SYMBOL(hv_dev_open);
+EXPORT_SYMBOL(hv_dev_poll);
+EXPORT_SYMBOL(hv_dev_poll_cancel);
EXPORT_SYMBOL(hv_dev_pread);
-EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_preada);
+EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_pwritea);
-EXPORT_SYMBOL(hv_dev_poll);
-EXPORT_SYMBOL(hv_dev_poll_cancel);
-EXPORT_SYMBOL(hv_dev_close);
-EXPORT_SYMBOL(hv_sysconf);
-EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_flush_all);
EXPORT_SYMBOL(hv_get_rtc);
+#ifdef __tilegx__
+EXPORT_SYMBOL(hv_inquire_guest_context);
+EXPORT_SYMBOL(hv_install_guest_context);
+EXPORT_SYMBOL(hv_install_virt_context);
+#endif
+EXPORT_SYMBOL(hv_physaddr_read64);
+EXPORT_SYMBOL(hv_physaddr_write64);
EXPORT_SYMBOL(hv_set_rtc);
+EXPORT_SYMBOL(hv_sysconf);

/* libgcc.a */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 23f044e..86cff48 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -42,7 +42,9 @@ static int notify_exec(struct mm_struct *mm)
char *buf, *path;
struct vm_area_struct *vma;

+#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
if (!sim_is_simulator())
+#endif
return 1;

if (mm->exe_file == NULL)
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 64eec3f..39c48cb 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0));

- is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+ is_kernel_mode = !user_mode(regs);

tsk = validate_current();

@@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
}

#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
- if (EX1_PL(regs->ex1) != USER_PL) {
+ if (!user_mode(regs)) {
struct async_tlb *async;
switch (fault_num) {
#if CHIP_HAS_TILE_DMA()
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 3bfa127..c6d2160 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -234,7 +234,7 @@ static pgprot_t __init init_pgprot(ulong address)
{
int cpu;
unsigned long page;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };

#if CHIP_HAS_CBOX_HOME_MAP()
/* For kdata=huge, everything is just hash-for-home. */
@@ -538,7 +538,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
}
}

- address = MEM_SV_INTRPT;
+ address = MEM_SV_START;
pmd = get_pmd(pgtables, address);
pfn = 0; /* code starts at PA 0 */
if (ktext_small) {
@@ -1021,7 +1021,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)

void free_initmem(void)
{
- const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+ const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;

/*
* Evict the dirty initdata on the boot cpu, evict the w1data
@@ -1040,7 +1040,7 @@ void free_initmem(void)

/*
* Free the pages mapped from 0xc0000000 that correspond to code
- * pages from MEM_SV_INTRPT that we won't use again after init.
+ * pages from MEM_SV_START that we won't use again after init.
*/
free_init_pages("unused kernel text",
(unsigned long)_sinittext - text_delta,
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 3004433..d6948d4 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -486,25 +486,18 @@ void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)

#if CHIP_HAS_MMIO()

-/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
-void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
- pgprot_t home)
+void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags, pgprot_t prot)
{
void *addr;
struct vm_struct *area;
unsigned long offset, last_addr;
- pgprot_t pgprot;

/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;

- /* Create a read/write, MMIO VA mapping homed at the requested shim. */
- pgprot = PAGE_KERNEL;
- pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
- pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
-
/*
* Mappings have to be page-aligned
*/
@@ -515,17 +508,35 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
+ area = get_vm_area(size, flags);
if (!area)
return NULL;
area->phys_addr = phys_addr;
addr = area->addr;
if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
- phys_addr, pgprot)) {
+ phys_addr, prot)) {
free_vm_area(area);
return NULL;
}
- return (__force void __iomem *) (offset + (char *)addr);
+ return (void *) (offset + (char *)addr);
+}
+EXPORT_SYMBOL(generic_remap_prot);
+
+/*
+ * Map an arbitrary MMIO address into VA space, homed according to the
+ * lotar field of "home".  Builds a PAGE_KERNEL-based MMIO-mode pgprot
+ * and delegates the actual mapping to generic_remap_prot() with
+ * VM_IOREMAP; returns whatever that helper returns.
+ */
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+ pgprot_t home)
+{
+ pgprot_t pgprot;
+ unsigned long flags;
+
+ /* Create a read/write, MMIO VA mapping homed at the requested shim. */
+ pgprot = PAGE_KERNEL;
+ pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
+ pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
+ flags = VM_IOREMAP; /* | other flags? */
+
+ return (__force void __iomem *) generic_remap_prot(phys_addr,
+ size, flags, pgprot);
}
EXPORT_SYMBOL(ioremap_prot);

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index acccd08..d3879c5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -171,6 +171,7 @@ struct kvm_pit_config {
#define KVM_EXIT_WATCHDOG 21
#define KVM_EXIT_S390_TSCH 22
#define KVM_EXIT_EPR 23
+#define KVM_EXIT_AGAIN 24

/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -1012,6 +1013,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_KVMCLOCK_CTRL _IO(KVMIO, 0xad)
#define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init)
#define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
+/* Reset some SPR registers for tilegx */
+#define KVM_TILE_RESET_SPR _IO(KVMIO, 0xa8)

#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4..1b8a1f1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1691,7 +1691,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
finish_wait(&vcpu->wq, &wait);
}

-#ifndef CONFIG_S390
+#if !defined(CONFIG_S390) && !defined(CONFIG_TILE)
/*
* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
*/
@@ -1714,7 +1714,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
-#endif /* !CONFIG_S390 */
+#endif

void kvm_resched(struct kvm_vcpu *vcpu)
{
@@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (vcpu->kvm->mm != current->mm)
return -EIO;

-#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
+#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
+ defined(CONFIG_TILEGX)
/*
* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
* so vcpu_load() would break it.
--
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/