author     Linus Torvalds <torvalds@linux-foundation.org>  2009-01-02 14:41:11 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-01-02 14:41:11 -0500
commit     597b0d21626da4e6f09f132442caf0cc2b0eb47c (patch)
tree       13c0074bb20f7b05a471e78d4ff52c665a10266a
parent     2640c9a90fa596871e142f42052608864335f102 (diff)
parent     87917239204d67a316cb89751750f86c9ed3640b (diff)
Merge branch 'kvm-updates/2.6.29' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates/2.6.29' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (140 commits)
  KVM: MMU: handle large host sptes on invlpg/resync
  KVM: Add locking to virtual i8259 interrupt controller
  KVM: MMU: Don't treat a global pte as such if cr4.pge is cleared
  MAINTAINERS: Maintainership changes for kvm/ia64
  KVM: ia64: Fix kvm_arch_vcpu_ioctl_[gs]et_regs()
  KVM: x86: Rework user space NMI injection as KVM_CAP_USER_NMI
  KVM: VMX: Fix pending NMI-vs.-IRQ race for user space irqchip
  KVM: fix handling of ACK from shared guest IRQ
  KVM: MMU: check for present pdptr shadow page in walk_shadow
  KVM: Consolidate userspace memory capability reporting into common code
  KVM: Advertise the bug in memory region destruction as fixed
  KVM: use cpumask_var_t for cpus_hardware_enabled
  KVM: use modern cpumask primitives, no cpumask_t on stack
  KVM: Extract core of kvm_flush_remote_tlbs/kvm_reload_remote_mmus
  KVM: set owner of cpu and vm file operations
  anon_inodes: use fops->owner for module refcount
  x86: KVM guest: kvm_get_tsc_khz: return khz, not lpj
  KVM: MMU: prepopulate the shadow on invlpg
  KVM: MMU: skip global pgtables on sync due to cr3 switch
  KVM: MMU: collapse remote TLB flushes on root sync
  ...
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  arch/ia64/include/asm/kvm.h | 6
-rw-r--r--  arch/ia64/include/asm/kvm_host.h | 196
-rw-r--r--  arch/ia64/kvm/Makefile | 2
-rw-r--r--  arch/ia64/kvm/asm-offsets.c | 11
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 107
-rw-r--r--  arch/ia64/kvm/kvm_lib.c | 15
-rw-r--r--  arch/ia64/kvm/kvm_minstate.h | 4
-rw-r--r--  arch/ia64/kvm/misc.h | 3
-rw-r--r--  arch/ia64/kvm/mmio.c | 38
-rw-r--r--  arch/ia64/kvm/process.c | 29
-rw-r--r--  arch/ia64/kvm/vcpu.c | 76
-rw-r--r--  arch/ia64/kvm/vcpu.h | 5
-rw-r--r--  arch/ia64/kvm/vmm.c | 29
-rw-r--r--  arch/ia64/kvm/vmm_ivt.S | 1469
-rw-r--r--  arch/ia64/kvm/vtlb.c | 4
-rw-r--r--  arch/powerpc/include/asm/disassemble.h | 80
-rw-r--r--  arch/powerpc/include/asm/kvm_44x.h | 61
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 116
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 83
-rw-r--r--  arch/powerpc/include/asm/mmu-44x.h | 1
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 21
-rw-r--r--  arch/powerpc/kvm/44x.c | 228
-rw-r--r--  arch/powerpc/kvm/44x_emulate.c | 371
-rw-r--r--  arch/powerpc/kvm/44x_tlb.c | 463
-rw-r--r--  arch/powerpc/kvm/44x_tlb.h | 26
-rw-r--r--  arch/powerpc/kvm/Kconfig | 28
-rw-r--r--  arch/powerpc/kvm/Makefile | 12
-rw-r--r--  arch/powerpc/kvm/booke.c (renamed from arch/powerpc/kvm/booke_guest.c) | 418
-rw-r--r--  arch/powerpc/kvm/booke.h | 60
-rw-r--r--  arch/powerpc/kvm/booke_host.c | 83
-rw-r--r--  arch/powerpc/kvm/booke_interrupts.S | 72
-rw-r--r--  arch/powerpc/kvm/emulate.c | 447
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 130
-rw-r--r--  arch/powerpc/kvm/timing.c | 239
-rw-r--r--  arch/powerpc/kvm/timing.h | 102
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 41
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 45
-rw-r--r--  arch/x86/include/asm/kvm_x86_emulate.h | 11
-rw-r--r--  arch/x86/include/asm/mtrr.h | 25
-rw-r--r--  arch/x86/include/asm/svm.h (renamed from arch/x86/kvm/svm.h) | 0
-rw-r--r--  arch/x86/include/asm/virtext.h | 132
-rw-r--r--  arch/x86/include/asm/vmx.h (renamed from arch/x86/kvm/vmx.h) | 27
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 12
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 18
-rw-r--r--  arch/x86/kernel/crash.c | 18
-rw-r--r--  arch/x86/kernel/kvmclock.c | 10
-rw-r--r--  arch/x86/kernel/reboot.c | 62
-rw-r--r--  arch/x86/kvm/i8254.c | 19
-rw-r--r--  arch/x86/kvm/i8259.c | 52
-rw-r--r--  arch/x86/kvm/irq.h | 6
-rw-r--r--  arch/x86/kvm/kvm_svm.h | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 58
-rw-r--r--  arch/x86/kvm/mmu.c | 444
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 44
-rw-r--r--  arch/x86/kvm/svm.c | 48
-rw-r--r--  arch/x86/kvm/vmx.c | 350
-rw-r--r--  arch/x86/kvm/x86.c | 117
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 297
-rw-r--r--  fs/anon_inodes.c | 7
-rw-r--r--  include/linux/kvm.h | 18
-rw-r--r--  include/linux/kvm_host.h | 12
-rw-r--r--  virt/kvm/ioapic.c | 8
-rw-r--r--  virt/kvm/ioapic.h | 2
-rw-r--r--  virt/kvm/irq_comm.c | 19
-rw-r--r--  virt/kvm/kvm_main.c | 420
-rw-r--r--  virt/kvm/kvm_trace.c | 1
68 files changed, 4760 insertions, 2606 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 88f43e09aeb3..befacf07729f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2542,8 +2542,6 @@ W: http://kvm.qumranet.com
2542S: Supported 2542S: Supported
2543 2543
2544KERNEL VIRTUAL MACHINE For Itanium (KVM/IA64) 2544KERNEL VIRTUAL MACHINE For Itanium (KVM/IA64)
2545P: Anthony Xu
2546M: anthony.xu@intel.com
2547P: Xiantao Zhang 2545P: Xiantao Zhang
2548M: xiantao.zhang@intel.com 2546M: xiantao.zhang@intel.com
2549L: kvm-ia64@vger.kernel.org 2547L: kvm-ia64@vger.kernel.org
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index f38472ac2267..68aa6da807c1 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -166,8 +166,6 @@ struct saved_vpd {
166}; 166};
167 167
168struct kvm_regs { 168struct kvm_regs {
169 char *saved_guest;
170 char *saved_stack;
171 struct saved_vpd vpd; 169 struct saved_vpd vpd;
172 /*Arch-regs*/ 170 /*Arch-regs*/
173 int mp_state; 171 int mp_state;
@@ -200,6 +198,10 @@ struct kvm_regs {
200 unsigned long fp_psr; /*used for lazy float register */ 198 unsigned long fp_psr; /*used for lazy float register */
201 unsigned long saved_gp; 199 unsigned long saved_gp;
202 /*for phycial emulation */ 200 /*for phycial emulation */
201
202 union context saved_guest;
203
204 unsigned long reserved[64]; /* for future use */
203}; 205};
204 206
205struct kvm_sregs { 207struct kvm_sregs {
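
The kvm_regs change above replaces the old saved_guest/saved_stack user-space pointers with a context embedded directly in the structure, plus a reserved tail for future ABI growth; this is what later lets kvm_arch_vcpu_ioctl_[gs]et_regs() use memcpy() instead of copy_to_user()/copy_from_user(). A minimal user-space sketch of the resulting save/restore round trip (illustrative only, assuming a vcpu fd already obtained through the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence):

/* Illustrative sketch only, not part of the patch. */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int checkpoint_vcpu(int vcpu_fd, struct kvm_regs *snapshot)
{
	/* One ioctl now moves the whole context, saved_guest included. */
	return ioctl(vcpu_fd, KVM_GET_REGS, snapshot);
}

static int restore_vcpu(int vcpu_fd, const struct kvm_regs *snapshot)
{
	return ioctl(vcpu_fd, KVM_SET_REGS, snapshot);
}
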
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index c60d324da540..0560f3fae538 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -23,17 +23,6 @@
23#ifndef __ASM_KVM_HOST_H 23#ifndef __ASM_KVM_HOST_H
24#define __ASM_KVM_HOST_H 24#define __ASM_KVM_HOST_H
25 25
26
27#include <linux/types.h>
28#include <linux/mm.h>
29#include <linux/kvm.h>
30#include <linux/kvm_para.h>
31#include <linux/kvm_types.h>
32
33#include <asm/pal.h>
34#include <asm/sal.h>
35
36#define KVM_MAX_VCPUS 4
37#define KVM_MEMORY_SLOTS 32 26#define KVM_MEMORY_SLOTS 32
38/* memory slots that does not exposed to userspace */ 27/* memory slots that does not exposed to userspace */
39#define KVM_PRIVATE_MEM_SLOTS 4 28#define KVM_PRIVATE_MEM_SLOTS 4
@@ -50,70 +39,132 @@
50#define EXIT_REASON_EXTERNAL_INTERRUPT 6 39#define EXIT_REASON_EXTERNAL_INTERRUPT 6
51#define EXIT_REASON_IPI 7 40#define EXIT_REASON_IPI 7
52#define EXIT_REASON_PTC_G 8 41#define EXIT_REASON_PTC_G 8
42#define EXIT_REASON_DEBUG 20
53 43
54/*Define vmm address space and vm data space.*/ 44/*Define vmm address space and vm data space.*/
55#define KVM_VMM_SIZE (16UL<<20) 45#define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20)
56#define KVM_VMM_SHIFT 24 46#define KVM_VMM_SHIFT 24
57#define KVM_VMM_BASE 0xD000000000000000UL 47#define KVM_VMM_BASE 0xD000000000000000
58#define VMM_SIZE (8UL<<20) 48#define VMM_SIZE (__IA64_UL_CONST(8)<<20)
59 49
60/* 50/*
61 * Define vm_buffer, used by PAL Services, base address. 51 * Define vm_buffer, used by PAL Services, base address.
62 * Note: vmbuffer is in the VMM-BLOCK, the size must be < 8M 52 * Note: vm_buffer is in the VMM-BLOCK, the size must be < 8M
63 */ 53 */
64#define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE) 54#define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE)
65#define KVM_VM_BUFFER_SIZE (8UL<<20) 55#define KVM_VM_BUFFER_SIZE (__IA64_UL_CONST(8)<<20)
66 56
67/*Define Virtual machine data layout.*/ 57/*
68#define KVM_VM_DATA_SHIFT 24 58 * kvm guest's data area looks as follow:
69#define KVM_VM_DATA_SIZE (1UL << KVM_VM_DATA_SHIFT) 59 *
70#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VMM_SIZE) 60 * +----------------------+ ------- KVM_VM_DATA_SIZE
71 61 * | vcpu[n]'s data | | ___________________KVM_STK_OFFSET
72 62 * | | | / |
73#define KVM_P2M_BASE KVM_VM_DATA_BASE 63 * | .......... | | /vcpu's struct&stack |
74#define KVM_P2M_OFS 0 64 * | .......... | | /---------------------|---- 0
75#define KVM_P2M_SIZE (8UL << 20) 65 * | vcpu[5]'s data | | / vpd |
76 66 * | vcpu[4]'s data | |/-----------------------|
77#define KVM_VHPT_BASE (KVM_P2M_BASE + KVM_P2M_SIZE) 67 * | vcpu[3]'s data | / vtlb |
78#define KVM_VHPT_OFS KVM_P2M_SIZE 68 * | vcpu[2]'s data | /|------------------------|
79#define KVM_VHPT_BLOCK_SIZE (2UL << 20) 69 * | vcpu[1]'s data |/ | vhpt |
80#define VHPT_SHIFT 18 70 * | vcpu[0]'s data |____________________________|
81#define VHPT_SIZE (1UL << VHPT_SHIFT) 71 * +----------------------+ |
82#define VHPT_NUM_ENTRIES (1<<(VHPT_SHIFT-5)) 72 * | memory dirty log | |
83 73 * +----------------------+ |
84#define KVM_VTLB_BASE (KVM_VHPT_BASE+KVM_VHPT_BLOCK_SIZE) 74 * | vm's data struct | |
85#define KVM_VTLB_OFS (KVM_VHPT_OFS+KVM_VHPT_BLOCK_SIZE) 75 * +----------------------+ |
86#define KVM_VTLB_BLOCK_SIZE (1UL<<20) 76 * | | |
87#define VTLB_SHIFT 17 77 * | | |
88#define VTLB_SIZE (1UL<<VTLB_SHIFT) 78 * | | |
89#define VTLB_NUM_ENTRIES (1<<(VTLB_SHIFT-5)) 79 * | | |
90 80 * | | |
91#define KVM_VPD_BASE (KVM_VTLB_BASE+KVM_VTLB_BLOCK_SIZE) 81 * | | |
92#define KVM_VPD_OFS (KVM_VTLB_OFS+KVM_VTLB_BLOCK_SIZE) 82 * | | |
93#define KVM_VPD_BLOCK_SIZE (2UL<<20) 83 * | vm's p2m table | |
94#define VPD_SHIFT 16 84 * | | |
95#define VPD_SIZE (1UL<<VPD_SHIFT) 85 * | | |
96 86 * | | | |
97#define KVM_VCPU_BASE (KVM_VPD_BASE+KVM_VPD_BLOCK_SIZE) 87 * vm's data->| | | |
98#define KVM_VCPU_OFS (KVM_VPD_OFS+KVM_VPD_BLOCK_SIZE) 88 * +----------------------+ ------- 0
99#define KVM_VCPU_BLOCK_SIZE (2UL<<20) 89 * To support large memory, needs to increase the size of p2m.
100#define VCPU_SHIFT 18 90 * To support more vcpus, needs to ensure it has enough space to
101#define VCPU_SIZE (1UL<<VCPU_SHIFT) 91 * hold vcpus' data.
102#define MAX_VCPU_NUM KVM_VCPU_BLOCK_SIZE/VCPU_SIZE 92 */
103 93
104#define KVM_VM_BASE (KVM_VCPU_BASE+KVM_VCPU_BLOCK_SIZE) 94#define KVM_VM_DATA_SHIFT 26
105#define KVM_VM_OFS (KVM_VCPU_OFS+KVM_VCPU_BLOCK_SIZE) 95#define KVM_VM_DATA_SIZE (__IA64_UL_CONST(1) << KVM_VM_DATA_SHIFT)
106#define KVM_VM_BLOCK_SIZE (1UL<<19) 96#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VM_DATA_SIZE)
107 97
108#define KVM_MEM_DIRTY_LOG_BASE (KVM_VM_BASE+KVM_VM_BLOCK_SIZE) 98#define KVM_P2M_BASE KVM_VM_DATA_BASE
109#define KVM_MEM_DIRTY_LOG_OFS (KVM_VM_OFS+KVM_VM_BLOCK_SIZE) 99#define KVM_P2M_SIZE (__IA64_UL_CONST(24) << 20)
110#define KVM_MEM_DIRTY_LOG_SIZE (1UL<<19) 100
111 101#define VHPT_SHIFT 16
112/* Get vpd, vhpt, tlb, vcpu, base*/ 102#define VHPT_SIZE (__IA64_UL_CONST(1) << VHPT_SHIFT)
113#define VPD_ADDR(n) (KVM_VPD_BASE+n*VPD_SIZE) 103#define VHPT_NUM_ENTRIES (__IA64_UL_CONST(1) << (VHPT_SHIFT-5))
114#define VHPT_ADDR(n) (KVM_VHPT_BASE+n*VHPT_SIZE) 104
115#define VTLB_ADDR(n) (KVM_VTLB_BASE+n*VTLB_SIZE) 105#define VTLB_SHIFT 16
116#define VCPU_ADDR(n) (KVM_VCPU_BASE+n*VCPU_SIZE) 106#define VTLB_SIZE (__IA64_UL_CONST(1) << VTLB_SHIFT)
107#define VTLB_NUM_ENTRIES (1UL << (VHPT_SHIFT-5))
108
109#define VPD_SHIFT 16
110#define VPD_SIZE (__IA64_UL_CONST(1) << VPD_SHIFT)
111
112#define VCPU_STRUCT_SHIFT 16
113#define VCPU_STRUCT_SIZE (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT)
114
115#define KVM_STK_OFFSET VCPU_STRUCT_SIZE
116
117#define KVM_VM_STRUCT_SHIFT 19
118#define KVM_VM_STRUCT_SIZE (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT)
119
120#define KVM_MEM_DIRY_LOG_SHIFT 19
121#define KVM_MEM_DIRTY_LOG_SIZE (__IA64_UL_CONST(1) << KVM_MEM_DIRY_LOG_SHIFT)
122
123#ifndef __ASSEMBLY__
124
125/*Define the max vcpus and memory for Guests.*/
126#define KVM_MAX_VCPUS (KVM_VM_DATA_SIZE - KVM_P2M_SIZE - KVM_VM_STRUCT_SIZE -\
127 KVM_MEM_DIRTY_LOG_SIZE) / sizeof(struct kvm_vcpu_data)
128#define KVM_MAX_MEM_SIZE (KVM_P2M_SIZE >> 3 << PAGE_SHIFT)
129
130#define VMM_LOG_LEN 256
131
132#include <linux/types.h>
133#include <linux/mm.h>
134#include <linux/kvm.h>
135#include <linux/kvm_para.h>
136#include <linux/kvm_types.h>
137
138#include <asm/pal.h>
139#include <asm/sal.h>
140#include <asm/page.h>
141
142struct kvm_vcpu_data {
143 char vcpu_vhpt[VHPT_SIZE];
144 char vcpu_vtlb[VTLB_SIZE];
145 char vcpu_vpd[VPD_SIZE];
146 char vcpu_struct[VCPU_STRUCT_SIZE];
147};
148
149struct kvm_vm_data {
150 char kvm_p2m[KVM_P2M_SIZE];
151 char kvm_vm_struct[KVM_VM_STRUCT_SIZE];
152 char kvm_mem_dirty_log[KVM_MEM_DIRTY_LOG_SIZE];
153 struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS];
154};
155
156#define VCPU_BASE(n) KVM_VM_DATA_BASE + \
157 offsetof(struct kvm_vm_data, vcpu_data[n])
158#define VM_BASE KVM_VM_DATA_BASE + \
159 offsetof(struct kvm_vm_data, kvm_vm_struct)
160#define KVM_MEM_DIRTY_LOG_BASE KVM_VM_DATA_BASE + \
161 offsetof(struct kvm_vm_data, kvm_mem_dirty_log)
162
163#define VHPT_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vhpt))
164#define VTLB_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vtlb))
165#define VPD_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vpd))
166#define VCPU_STRUCT_BASE(n) (VCPU_BASE(n) + \
167 offsetof(struct kvm_vcpu_data, vcpu_struct))
117 168
118/*IO section definitions*/ 169/*IO section definitions*/
119#define IOREQ_READ 1 170#define IOREQ_READ 1
@@ -389,6 +440,7 @@ struct kvm_vcpu_arch {
389 440
390 unsigned long opcode; 441 unsigned long opcode;
391 unsigned long cause; 442 unsigned long cause;
443 char log_buf[VMM_LOG_LEN];
392 union context host; 444 union context host;
393 union context guest; 445 union context guest;
394}; 446};
@@ -403,14 +455,13 @@ struct kvm_sal_data {
403}; 455};
404 456
405struct kvm_arch { 457struct kvm_arch {
458 spinlock_t dirty_log_lock;
459
406 unsigned long vm_base; 460 unsigned long vm_base;
407 unsigned long metaphysical_rr0; 461 unsigned long metaphysical_rr0;
408 unsigned long metaphysical_rr4; 462 unsigned long metaphysical_rr4;
409 unsigned long vmm_init_rr; 463 unsigned long vmm_init_rr;
410 unsigned long vhpt_base; 464
411 unsigned long vtlb_base;
412 unsigned long vpd_base;
413 spinlock_t dirty_log_lock;
414 struct kvm_ioapic *vioapic; 465 struct kvm_ioapic *vioapic;
415 struct kvm_vm_stat stat; 466 struct kvm_vm_stat stat;
416 struct kvm_sal_data rdv_sal_data; 467 struct kvm_sal_data rdv_sal_data;
@@ -512,7 +563,7 @@ struct kvm_pt_regs {
512 563
513static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v) 564static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v)
514{ 565{
515 return (struct kvm_pt_regs *) ((unsigned long) v + IA64_STK_OFFSET) - 1; 566 return (struct kvm_pt_regs *) ((unsigned long) v + KVM_STK_OFFSET) - 1;
516} 567}
517 568
518typedef int kvm_vmm_entry(void); 569typedef int kvm_vmm_entry(void);
@@ -531,5 +582,6 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
531void kvm_sal_emul(struct kvm_vcpu *vcpu); 582void kvm_sal_emul(struct kvm_vcpu *vcpu);
532 583
533static inline void kvm_inject_nmi(struct kvm_vcpu *vcpu) {} 584static inline void kvm_inject_nmi(struct kvm_vcpu *vcpu) {}
585#endif /* __ASSEMBLY__*/
534 586
535#endif 587#endif
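
With the offsetof()-based layout above, every per-vcpu region is just an offset into the single per-VM allocation rather than a separately tracked base. A standalone toy sketch of that idea follows; the toy_ names and sizes are invented here, only the structure mirrors kvm_vm_data/kvm_vcpu_data:

/* Toy sketch, not kernel code: sizes are made up, structure mirrors kvm_vm_data. */
#include <stdio.h>
#include <stddef.h>

struct toy_vcpu_data {
	char vcpu_vhpt[0x100];
	char vcpu_vtlb[0x100];
	char vcpu_vpd[0x80];
	char vcpu_struct[0x200];
};

struct toy_vm_data {
	char kvm_p2m[0x400];
	char kvm_vm_struct[0x200];
	char kvm_mem_dirty_log[0x100];
	struct toy_vcpu_data vcpu_data[4];
};

/* Same idea as VCPU_BASE(n)/VHPT_BASE(n), written with constant offsetof only. */
#define TOY_VCPU_BASE(base, n) ((base) + offsetof(struct toy_vm_data, vcpu_data) + \
				(n) * sizeof(struct toy_vcpu_data))
#define TOY_VHPT_BASE(base, n) (TOY_VCPU_BASE(base, n) + \
				offsetof(struct toy_vcpu_data, vcpu_vhpt))

int main(void)
{
	unsigned long base = 0x10000000UL;	/* stands in for KVM_VM_DATA_BASE */
	int n;

	for (n = 0; n < 4; n++)
		printf("vcpu%d: base=%#lx vhpt=%#lx\n", n,
		       TOY_VCPU_BASE(base, n), TOY_VHPT_BASE(base, n));
	return 0;
}
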
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 92cef66ca268..76464dc312e6 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -60,7 +60,7 @@ obj-$(CONFIG_KVM) += kvm.o
60 60
61CFLAGS_vcpu.o += -mfixed-range=f2-f5,f12-f127 61CFLAGS_vcpu.o += -mfixed-range=f2-f5,f12-f127
62kvm-intel-objs = vmm.o vmm_ivt.o trampoline.o vcpu.o optvfault.o mmio.o \ 62kvm-intel-objs = vmm.o vmm_ivt.o trampoline.o vcpu.o optvfault.o mmio.o \
63 vtlb.o process.o 63 vtlb.o process.o kvm_lib.o
64#Add link memcpy and memset to avoid possible structure assignment error 64#Add link memcpy and memset to avoid possible structure assignment error
65kvm-intel-objs += memcpy.o memset.o 65kvm-intel-objs += memcpy.o memset.o
66obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 66obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/ia64/kvm/asm-offsets.c b/arch/ia64/kvm/asm-offsets.c
index 4e3dc13a619c..0c3564a7a033 100644
--- a/arch/ia64/kvm/asm-offsets.c
+++ b/arch/ia64/kvm/asm-offsets.c
@@ -24,19 +24,10 @@
24 24
25#include <linux/autoconf.h> 25#include <linux/autoconf.h>
26#include <linux/kvm_host.h> 26#include <linux/kvm_host.h>
27#include <linux/kbuild.h>
27 28
28#include "vcpu.h" 29#include "vcpu.h"
29 30
30#define task_struct kvm_vcpu
31
32#define DEFINE(sym, val) \
33 asm volatile("\n->" #sym " (%0) " #val : : "i" (val))
34
35#define BLANK() asm volatile("\n->" : :)
36
37#define OFFSET(_sym, _str, _mem) \
38 DEFINE(_sym, offsetof(_str, _mem));
39
40void foo(void) 31void foo(void)
41{ 32{
42 DEFINE(VMM_TASK_SIZE, sizeof(struct kvm_vcpu)); 33 DEFINE(VMM_TASK_SIZE, sizeof(struct kvm_vcpu));
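
The asm-offsets.c hunk above drops the file's private DEFINE/OFFSET/BLANK macros in favour of the common <linux/kbuild.h> copies; the underlying trick is unchanged. A standalone sketch of that trick, with toy names, meant to be compiled with "gcc -S" only (the "->" markers are deliberately not valid assembly and are turned into #defines by the kernel's sed step):

/* Toy sketch of the kbuild asm-offsets trick; not kernel code. */
#include <stddef.h>

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " (%0) " #val : : "i" (val))

struct toy_vcpu { long flags; long regs[32]; };

void foo(void)
{
	/* "gcc -S" leaves lines like "-> TOY_REGS_OFFSET (8) ..." in the .s output. */
	DEFINE(TOY_VCPU_SIZE, sizeof(struct toy_vcpu));
	DEFINE(TOY_REGS_OFFSET, offsetof(struct toy_vcpu, regs));
}
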
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index af1464f7a6ad..0f5ebd948437 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -180,7 +180,6 @@ int kvm_dev_ioctl_check_extension(long ext)
180 180
181 switch (ext) { 181 switch (ext) {
182 case KVM_CAP_IRQCHIP: 182 case KVM_CAP_IRQCHIP:
183 case KVM_CAP_USER_MEMORY:
184 case KVM_CAP_MP_STATE: 183 case KVM_CAP_MP_STATE:
185 184
186 r = 1; 185 r = 1;
@@ -439,7 +438,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
439 expires = div64_u64(itc_diff, cyc_per_usec); 438 expires = div64_u64(itc_diff, cyc_per_usec);
440 kt = ktime_set(0, 1000 * expires); 439 kt = ktime_set(0, 1000 * expires);
441 440
442 down_read(&vcpu->kvm->slots_lock);
443 vcpu->arch.ht_active = 1; 441 vcpu->arch.ht_active = 1;
444 hrtimer_start(p_ht, kt, HRTIMER_MODE_ABS); 442 hrtimer_start(p_ht, kt, HRTIMER_MODE_ABS);
445 443
@@ -452,7 +450,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
452 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 450 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
453 vcpu->arch.mp_state = 451 vcpu->arch.mp_state =
454 KVM_MP_STATE_RUNNABLE; 452 KVM_MP_STATE_RUNNABLE;
455 up_read(&vcpu->kvm->slots_lock);
456 453
457 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 454 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
458 return -EINTR; 455 return -EINTR;
@@ -476,6 +473,13 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
476 return 1; 473 return 1;
477} 474}
478 475
476static int handle_vcpu_debug(struct kvm_vcpu *vcpu,
477 struct kvm_run *kvm_run)
478{
479 printk("VMM: %s", vcpu->arch.log_buf);
480 return 1;
481}
482
479static int (*kvm_vti_exit_handlers[])(struct kvm_vcpu *vcpu, 483static int (*kvm_vti_exit_handlers[])(struct kvm_vcpu *vcpu,
480 struct kvm_run *kvm_run) = { 484 struct kvm_run *kvm_run) = {
481 [EXIT_REASON_VM_PANIC] = handle_vm_error, 485 [EXIT_REASON_VM_PANIC] = handle_vm_error,
@@ -487,6 +491,7 @@ static int (*kvm_vti_exit_handlers[])(struct kvm_vcpu *vcpu,
487 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 491 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
488 [EXIT_REASON_IPI] = handle_ipi, 492 [EXIT_REASON_IPI] = handle_ipi,
489 [EXIT_REASON_PTC_G] = handle_global_purge, 493 [EXIT_REASON_PTC_G] = handle_global_purge,
494 [EXIT_REASON_DEBUG] = handle_vcpu_debug,
490 495
491}; 496};
492 497
@@ -698,27 +703,24 @@ out:
698 return r; 703 return r;
699} 704}
700 705
701/*
702 * Allocate 16M memory for every vm to hold its specific data.
703 * Its memory map is defined in kvm_host.h.
704 */
705static struct kvm *kvm_alloc_kvm(void) 706static struct kvm *kvm_alloc_kvm(void)
706{ 707{
707 708
708 struct kvm *kvm; 709 struct kvm *kvm;
709 uint64_t vm_base; 710 uint64_t vm_base;
710 711
712 BUG_ON(sizeof(struct kvm) > KVM_VM_STRUCT_SIZE);
713
711 vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE)); 714 vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE));
712 715
713 if (!vm_base) 716 if (!vm_base)
714 return ERR_PTR(-ENOMEM); 717 return ERR_PTR(-ENOMEM);
715 printk(KERN_DEBUG"kvm: VM data's base Address:0x%lx\n", vm_base);
716 718
717 /* Zero all pages before use! */
718 memset((void *)vm_base, 0, KVM_VM_DATA_SIZE); 719 memset((void *)vm_base, 0, KVM_VM_DATA_SIZE);
719 720 kvm = (struct kvm *)(vm_base +
720 kvm = (struct kvm *)(vm_base + KVM_VM_OFS); 721 offsetof(struct kvm_vm_data, kvm_vm_struct));
721 kvm->arch.vm_base = vm_base; 722 kvm->arch.vm_base = vm_base;
723 printk(KERN_DEBUG"kvm: vm's data area:0x%lx\n", vm_base);
722 724
723 return kvm; 725 return kvm;
724} 726}
@@ -760,21 +762,12 @@ static void kvm_build_io_pmt(struct kvm *kvm)
760 762
761static void kvm_init_vm(struct kvm *kvm) 763static void kvm_init_vm(struct kvm *kvm)
762{ 764{
763 long vm_base;
764
765 BUG_ON(!kvm); 765 BUG_ON(!kvm);
766 766
767 kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0; 767 kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
768 kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4; 768 kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
769 kvm->arch.vmm_init_rr = VMM_INIT_RR; 769 kvm->arch.vmm_init_rr = VMM_INIT_RR;
770 770
771 vm_base = kvm->arch.vm_base;
772 if (vm_base) {
773 kvm->arch.vhpt_base = vm_base + KVM_VHPT_OFS;
774 kvm->arch.vtlb_base = vm_base + KVM_VTLB_OFS;
775 kvm->arch.vpd_base = vm_base + KVM_VPD_OFS;
776 }
777
778 /* 771 /*
779 *Fill P2M entries for MMIO/IO ranges 772 *Fill P2M entries for MMIO/IO ranges
780 */ 773 */
@@ -838,9 +831,8 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
838 831
839int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 832int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
840{ 833{
841 int i;
842 struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); 834 struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
843 int r; 835 int i;
844 836
845 vcpu_load(vcpu); 837 vcpu_load(vcpu);
846 838
@@ -857,18 +849,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
857 849
858 vpd->vpr = regs->vpd.vpr; 850 vpd->vpr = regs->vpd.vpr;
859 851
860 r = -EFAULT; 852 memcpy(&vcpu->arch.guest, &regs->saved_guest, sizeof(union context));
861 r = copy_from_user(&vcpu->arch.guest, regs->saved_guest,
862 sizeof(union context));
863 if (r)
864 goto out;
865 r = copy_from_user(vcpu + 1, regs->saved_stack +
866 sizeof(struct kvm_vcpu),
867 IA64_STK_OFFSET - sizeof(struct kvm_vcpu));
868 if (r)
869 goto out;
870 vcpu->arch.exit_data =
871 ((struct kvm_vcpu *)(regs->saved_stack))->arch.exit_data;
872 853
873 RESTORE_REGS(mp_state); 854 RESTORE_REGS(mp_state);
874 RESTORE_REGS(vmm_rr); 855 RESTORE_REGS(vmm_rr);
@@ -902,9 +883,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
902 set_bit(KVM_REQ_RESUME, &vcpu->requests); 883 set_bit(KVM_REQ_RESUME, &vcpu->requests);
903 884
904 vcpu_put(vcpu); 885 vcpu_put(vcpu);
905 r = 0; 886
906out: 887 return 0;
907 return r;
908} 888}
909 889
910long kvm_arch_vm_ioctl(struct file *filp, 890long kvm_arch_vm_ioctl(struct file *filp,
@@ -1166,10 +1146,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
1166 /*Set entry address for first run.*/ 1146 /*Set entry address for first run.*/
1167 regs->cr_iip = PALE_RESET_ENTRY; 1147 regs->cr_iip = PALE_RESET_ENTRY;
1168 1148
1169 /*Initilize itc offset for vcpus*/ 1149 /*Initialize itc offset for vcpus*/
1170 itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC); 1150 itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC);
1171 for (i = 0; i < MAX_VCPU_NUM; i++) { 1151 for (i = 0; i < KVM_MAX_VCPUS; i++) {
1172 v = (struct kvm_vcpu *)((char *)vcpu + VCPU_SIZE * i); 1152 v = (struct kvm_vcpu *)((char *)vcpu +
1153 sizeof(struct kvm_vcpu_data) * i);
1173 v->arch.itc_offset = itc_offset; 1154 v->arch.itc_offset = itc_offset;
1174 v->arch.last_itc = 0; 1155 v->arch.last_itc = 0;
1175 } 1156 }
@@ -1183,7 +1164,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
1183 vcpu->arch.apic->vcpu = vcpu; 1164 vcpu->arch.apic->vcpu = vcpu;
1184 1165
1185 p_ctx->gr[1] = 0; 1166 p_ctx->gr[1] = 0;
1186 p_ctx->gr[12] = (unsigned long)((char *)vmm_vcpu + IA64_STK_OFFSET); 1167 p_ctx->gr[12] = (unsigned long)((char *)vmm_vcpu + KVM_STK_OFFSET);
1187 p_ctx->gr[13] = (unsigned long)vmm_vcpu; 1168 p_ctx->gr[13] = (unsigned long)vmm_vcpu;
1188 p_ctx->psr = 0x1008522000UL; 1169 p_ctx->psr = 0x1008522000UL;
1189 p_ctx->ar[40] = FPSR_DEFAULT; /*fpsr*/ 1170 p_ctx->ar[40] = FPSR_DEFAULT; /*fpsr*/
@@ -1218,12 +1199,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
1218 vcpu->arch.hlt_timer.function = hlt_timer_fn; 1199 vcpu->arch.hlt_timer.function = hlt_timer_fn;
1219 1200
1220 vcpu->arch.last_run_cpu = -1; 1201 vcpu->arch.last_run_cpu = -1;
1221 vcpu->arch.vpd = (struct vpd *)VPD_ADDR(vcpu->vcpu_id); 1202 vcpu->arch.vpd = (struct vpd *)VPD_BASE(vcpu->vcpu_id);
1222 vcpu->arch.vsa_base = kvm_vsa_base; 1203 vcpu->arch.vsa_base = kvm_vsa_base;
1223 vcpu->arch.__gp = kvm_vmm_gp; 1204 vcpu->arch.__gp = kvm_vmm_gp;
1224 vcpu->arch.dirty_log_lock_pa = __pa(&kvm->arch.dirty_log_lock); 1205 vcpu->arch.dirty_log_lock_pa = __pa(&kvm->arch.dirty_log_lock);
1225 vcpu->arch.vhpt.hash = (struct thash_data *)VHPT_ADDR(vcpu->vcpu_id); 1206 vcpu->arch.vhpt.hash = (struct thash_data *)VHPT_BASE(vcpu->vcpu_id);
1226 vcpu->arch.vtlb.hash = (struct thash_data *)VTLB_ADDR(vcpu->vcpu_id); 1207 vcpu->arch.vtlb.hash = (struct thash_data *)VTLB_BASE(vcpu->vcpu_id);
1227 init_ptce_info(vcpu); 1208 init_ptce_info(vcpu);
1228 1209
1229 r = 0; 1210 r = 0;
@@ -1273,12 +1254,22 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
1273 int r; 1254 int r;
1274 int cpu; 1255 int cpu;
1275 1256
1257 BUG_ON(sizeof(struct kvm_vcpu) > VCPU_STRUCT_SIZE/2);
1258
1259 r = -EINVAL;
1260 if (id >= KVM_MAX_VCPUS) {
1261 printk(KERN_ERR"kvm: Can't configure vcpus > %ld",
1262 KVM_MAX_VCPUS);
1263 goto fail;
1264 }
1265
1276 r = -ENOMEM; 1266 r = -ENOMEM;
1277 if (!vm_base) { 1267 if (!vm_base) {
1278 printk(KERN_ERR"kvm: Create vcpu[%d] error!\n", id); 1268 printk(KERN_ERR"kvm: Create vcpu[%d] error!\n", id);
1279 goto fail; 1269 goto fail;
1280 } 1270 }
1281 vcpu = (struct kvm_vcpu *)(vm_base + KVM_VCPU_OFS + VCPU_SIZE * id); 1271 vcpu = (struct kvm_vcpu *)(vm_base + offsetof(struct kvm_vm_data,
1272 vcpu_data[id].vcpu_struct));
1282 vcpu->kvm = kvm; 1273 vcpu->kvm = kvm;
1283 1274
1284 cpu = get_cpu(); 1275 cpu = get_cpu();
@@ -1374,9 +1365,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1374 1365
1375int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 1366int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1376{ 1367{
1377 int i;
1378 int r;
1379 struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); 1368 struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
1369 int i;
1370
1380 vcpu_load(vcpu); 1371 vcpu_load(vcpu);
1381 1372
1382 for (i = 0; i < 16; i++) { 1373 for (i = 0; i < 16; i++) {
@@ -1391,14 +1382,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1391 regs->vpd.vpsr = vpd->vpsr; 1382 regs->vpd.vpsr = vpd->vpsr;
1392 regs->vpd.vpr = vpd->vpr; 1383 regs->vpd.vpr = vpd->vpr;
1393 1384
1394 r = -EFAULT; 1385 memcpy(&regs->saved_guest, &vcpu->arch.guest, sizeof(union context));
1395 r = copy_to_user(regs->saved_guest, &vcpu->arch.guest, 1386
1396 sizeof(union context));
1397 if (r)
1398 goto out;
1399 r = copy_to_user(regs->saved_stack, (void *)vcpu, IA64_STK_OFFSET);
1400 if (r)
1401 goto out;
1402 SAVE_REGS(mp_state); 1387 SAVE_REGS(mp_state);
1403 SAVE_REGS(vmm_rr); 1388 SAVE_REGS(vmm_rr);
1404 memcpy(regs->itrs, vcpu->arch.itrs, sizeof(struct thash_data) * NITRS); 1389 memcpy(regs->itrs, vcpu->arch.itrs, sizeof(struct thash_data) * NITRS);
@@ -1426,10 +1411,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1426 SAVE_REGS(metaphysical_saved_rr4); 1411 SAVE_REGS(metaphysical_saved_rr4);
1427 SAVE_REGS(fp_psr); 1412 SAVE_REGS(fp_psr);
1428 SAVE_REGS(saved_gp); 1413 SAVE_REGS(saved_gp);
1414
1429 vcpu_put(vcpu); 1415 vcpu_put(vcpu);
1430 r = 0; 1416 return 0;
1431out:
1432 return r;
1433} 1417}
1434 1418
1435void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 1419void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
@@ -1457,6 +1441,9 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
1457 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; 1441 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
1458 unsigned long base_gfn = memslot->base_gfn; 1442 unsigned long base_gfn = memslot->base_gfn;
1459 1443
1444 if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT))
1445 return -ENOMEM;
1446
1460 for (i = 0; i < npages; i++) { 1447 for (i = 0; i < npages; i++) {
1461 pfn = gfn_to_pfn(kvm, base_gfn + i); 1448 pfn = gfn_to_pfn(kvm, base_gfn + i);
1462 if (!kvm_is_mmio_pfn(pfn)) { 1449 if (!kvm_is_mmio_pfn(pfn)) {
@@ -1631,8 +1618,8 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
1631 struct kvm_memory_slot *memslot; 1618 struct kvm_memory_slot *memslot;
1632 int r, i; 1619 int r, i;
1633 long n, base; 1620 long n, base;
1634 unsigned long *dirty_bitmap = (unsigned long *)((void *)kvm - KVM_VM_OFS 1621 unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
1635 + KVM_MEM_DIRTY_LOG_OFS); 1622 offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
1636 1623
1637 r = -EINVAL; 1624 r = -EINVAL;
1638 if (log->slot >= KVM_MEMORY_SLOTS) 1625 if (log->slot >= KVM_MEMORY_SLOTS)
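
handle_vcpu_debug() above is the host half of the VMM logging path added elsewhere in this merge: the VMM-side printk() in vmm.c fills vcpu->arch.log_buf and raises EXIT_REASON_DEBUG, and the host prints it through the same exit-reason handler table as the other exits. A toy model of that dispatch pattern (all toy_ names are invented for the sketch, not kernel code):

/* Toy model of the exit-reason dispatch table; not kernel code. */
#include <stdio.h>

enum { TOY_EXIT_MMIO, TOY_EXIT_DEBUG, TOY_NR_EXIT_REASONS };

struct toy_vcpu { char log_buf[256]; };

static int toy_handle_mmio(struct toy_vcpu *v)  { (void)v; return 1; }
static int toy_handle_debug(struct toy_vcpu *v) { printf("VMM: %s", v->log_buf); return 1; }

static int (*toy_exit_handlers[TOY_NR_EXIT_REASONS])(struct toy_vcpu *) = {
	[TOY_EXIT_MMIO]  = toy_handle_mmio,
	[TOY_EXIT_DEBUG] = toy_handle_debug,
};

int main(void)
{
	struct toy_vcpu vcpu = { .log_buf = "hello from the vmm\n" };
	unsigned int reason = TOY_EXIT_DEBUG;

	/* Unknown or unhandled reasons fall through instead of indexing past the table. */
	if (reason < TOY_NR_EXIT_REASONS && toy_exit_handlers[reason])
		toy_exit_handlers[reason](&vcpu);
	return 0;
}
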
diff --git a/arch/ia64/kvm/kvm_lib.c b/arch/ia64/kvm/kvm_lib.c
new file mode 100644
index 000000000000..a85cb611ecd7
--- /dev/null
+++ b/arch/ia64/kvm/kvm_lib.c
@@ -0,0 +1,15 @@
1/*
2 * kvm_lib.c: Compile some libraries for kvm-intel module.
3 *
4 * Just include kernel's library, and disable symbols export.
5 * Copyright (C) 2008, Intel Corporation.
6 * Xiantao Zhang (xiantao.zhang@intel.com)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 */
13#undef CONFIG_MODULES
14#include "../../../lib/vsprintf.c"
15#include "../../../lib/ctype.c"
diff --git a/arch/ia64/kvm/kvm_minstate.h b/arch/ia64/kvm/kvm_minstate.h
index 2cc41d17cf99..b2bcaa2787aa 100644
--- a/arch/ia64/kvm/kvm_minstate.h
+++ b/arch/ia64/kvm/kvm_minstate.h
@@ -24,6 +24,8 @@
24#include <asm/asmmacro.h> 24#include <asm/asmmacro.h>
25#include <asm/types.h> 25#include <asm/types.h>
26#include <asm/kregs.h> 26#include <asm/kregs.h>
27#include <asm/kvm_host.h>
28
27#include "asm-offsets.h" 29#include "asm-offsets.h"
28 30
29#define KVM_MINSTATE_START_SAVE_MIN \ 31#define KVM_MINSTATE_START_SAVE_MIN \
@@ -33,7 +35,7 @@
33 addl r22 = VMM_RBS_OFFSET,r1; /* compute base of RBS */ \ 35 addl r22 = VMM_RBS_OFFSET,r1; /* compute base of RBS */ \
34 ;; \ 36 ;; \
35 lfetch.fault.excl.nt1 [r22]; \ 37 lfetch.fault.excl.nt1 [r22]; \
36 addl r1 = IA64_STK_OFFSET-VMM_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ 38 addl r1 = KVM_STK_OFFSET-VMM_PT_REGS_SIZE, r1; \
37 mov r23 = ar.bspstore; /* save ar.bspstore */ \ 39 mov r23 = ar.bspstore; /* save ar.bspstore */ \
38 ;; \ 40 ;; \
39 mov ar.bspstore = r22; /* switch to kernel RBS */\ 41 mov ar.bspstore = r22; /* switch to kernel RBS */\
diff --git a/arch/ia64/kvm/misc.h b/arch/ia64/kvm/misc.h
index e585c4607344..dd979e00b574 100644
--- a/arch/ia64/kvm/misc.h
+++ b/arch/ia64/kvm/misc.h
@@ -27,7 +27,8 @@
27 */ 27 */
28static inline uint64_t *kvm_host_get_pmt(struct kvm *kvm) 28static inline uint64_t *kvm_host_get_pmt(struct kvm *kvm)
29{ 29{
30 return (uint64_t *)(kvm->arch.vm_base + KVM_P2M_OFS); 30 return (uint64_t *)(kvm->arch.vm_base +
31 offsetof(struct kvm_vm_data, kvm_p2m));
31} 32}
32 33
33static inline void kvm_set_pmt_entry(struct kvm *kvm, gfn_t gfn, 34static inline void kvm_set_pmt_entry(struct kvm *kvm, gfn_t gfn,
diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c
index 7f1a858bc69f..21f63fffc379 100644
--- a/arch/ia64/kvm/mmio.c
+++ b/arch/ia64/kvm/mmio.c
@@ -66,31 +66,25 @@ void lsapic_write(struct kvm_vcpu *v, unsigned long addr,
66 66
67 switch (addr) { 67 switch (addr) {
68 case PIB_OFST_INTA: 68 case PIB_OFST_INTA:
69 /*panic_domain(NULL, "Undefined write on PIB INTA\n");*/ 69 panic_vm(v, "Undefined write on PIB INTA\n");
70 panic_vm(v);
71 break; 70 break;
72 case PIB_OFST_XTP: 71 case PIB_OFST_XTP:
73 if (length == 1) { 72 if (length == 1) {
74 vlsapic_write_xtp(v, val); 73 vlsapic_write_xtp(v, val);
75 } else { 74 } else {
76 /*panic_domain(NULL, 75 panic_vm(v, "Undefined write on PIB XTP\n");
77 "Undefined write on PIB XTP\n");*/
78 panic_vm(v);
79 } 76 }
80 break; 77 break;
81 default: 78 default:
82 if (PIB_LOW_HALF(addr)) { 79 if (PIB_LOW_HALF(addr)) {
83 /*lower half */ 80 /*Lower half */
84 if (length != 8) 81 if (length != 8)
85 /*panic_domain(NULL, 82 panic_vm(v, "Can't LHF write with size %ld!\n",
86 "Can't LHF write with size %ld!\n", 83 length);
87 length);*/
88 panic_vm(v);
89 else 84 else
90 vlsapic_write_ipi(v, addr, val); 85 vlsapic_write_ipi(v, addr, val);
91 } else { /* upper half 86 } else { /*Upper half */
92 printk("IPI-UHF write %lx\n",addr);*/ 87 panic_vm(v, "IPI-UHF write %lx\n", addr);
93 panic_vm(v);
94 } 88 }
95 break; 89 break;
96 } 90 }
@@ -108,22 +102,18 @@ unsigned long lsapic_read(struct kvm_vcpu *v, unsigned long addr,
108 if (length == 1) /* 1 byte load */ 102 if (length == 1) /* 1 byte load */
109 ; /* There is no i8259, there is no INTA access*/ 103 ; /* There is no i8259, there is no INTA access*/
110 else 104 else
111 /*panic_domain(NULL,"Undefined read on PIB INTA\n"); */ 105 panic_vm(v, "Undefined read on PIB INTA\n");
112 panic_vm(v);
113 106
114 break; 107 break;
115 case PIB_OFST_XTP: 108 case PIB_OFST_XTP:
116 if (length == 1) { 109 if (length == 1) {
117 result = VLSAPIC_XTP(v); 110 result = VLSAPIC_XTP(v);
118 /* printk("read xtp %lx\n", result); */
119 } else { 111 } else {
120 /*panic_domain(NULL, 112 panic_vm(v, "Undefined read on PIB XTP\n");
121 "Undefined read on PIB XTP\n");*/
122 panic_vm(v);
123 } 113 }
124 break; 114 break;
125 default: 115 default:
126 panic_vm(v); 116 panic_vm(v, "Undefined addr access for lsapic!\n");
127 break; 117 break;
128 } 118 }
129 return result; 119 return result;
@@ -162,7 +152,7 @@ static void mmio_access(struct kvm_vcpu *vcpu, u64 src_pa, u64 *dest,
162 /* it's necessary to ensure zero extending */ 152 /* it's necessary to ensure zero extending */
163 *dest = p->u.ioreq.data & (~0UL >> (64-(s*8))); 153 *dest = p->u.ioreq.data & (~0UL >> (64-(s*8)));
164 } else 154 } else
165 panic_vm(vcpu); 155 panic_vm(vcpu, "Unhandled mmio access returned!\n");
166out: 156out:
167 local_irq_restore(psr); 157 local_irq_restore(psr);
168 return ; 158 return ;
@@ -324,7 +314,9 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
324 return; 314 return;
325 } else { 315 } else {
326 inst_type = -1; 316 inst_type = -1;
327 panic_vm(vcpu); 317 panic_vm(vcpu, "Unsupported MMIO access instruction! \
318 Bunld[0]=0x%lx, Bundle[1]=0x%lx\n",
319 bundle.i64[0], bundle.i64[1]);
328 } 320 }
329 321
330 size = 1 << size; 322 size = 1 << size;
@@ -335,7 +327,7 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
335 if (inst_type == SL_INTEGER) 327 if (inst_type == SL_INTEGER)
336 vcpu_set_gr(vcpu, inst.M1.r1, data, 0); 328 vcpu_set_gr(vcpu, inst.M1.r1, data, 0);
337 else 329 else
338 panic_vm(vcpu); 330 panic_vm(vcpu, "Unsupported instruction type!\n");
339 331
340 } 332 }
341 vcpu_increment_iip(vcpu); 333 vcpu_increment_iip(vcpu);
diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index 800817307b7b..552d07724207 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -527,7 +527,8 @@ void reflect_interruption(u64 ifa, u64 isr, u64 iim,
527 vector = vec2off[vec]; 527 vector = vec2off[vec];
528 528
529 if (!(vpsr & IA64_PSR_IC) && (vector != IA64_DATA_NESTED_TLB_VECTOR)) { 529 if (!(vpsr & IA64_PSR_IC) && (vector != IA64_DATA_NESTED_TLB_VECTOR)) {
530 panic_vm(vcpu); 530 panic_vm(vcpu, "Interruption with vector :0x%lx occurs "
531 "with psr.ic = 0\n", vector);
531 return; 532 return;
532 } 533 }
533 534
@@ -586,7 +587,7 @@ static void set_pal_call_result(struct kvm_vcpu *vcpu)
586 vcpu_set_gr(vcpu, 10, p->u.pal_data.ret.v1, 0); 587 vcpu_set_gr(vcpu, 10, p->u.pal_data.ret.v1, 0);
587 vcpu_set_gr(vcpu, 11, p->u.pal_data.ret.v2, 0); 588 vcpu_set_gr(vcpu, 11, p->u.pal_data.ret.v2, 0);
588 } else 589 } else
589 panic_vm(vcpu); 590 panic_vm(vcpu, "Mis-set for exit reason!\n");
590} 591}
591 592
592static void set_sal_call_data(struct kvm_vcpu *vcpu) 593static void set_sal_call_data(struct kvm_vcpu *vcpu)
@@ -614,7 +615,7 @@ static void set_sal_call_result(struct kvm_vcpu *vcpu)
614 vcpu_set_gr(vcpu, 10, p->u.sal_data.ret.r10, 0); 615 vcpu_set_gr(vcpu, 10, p->u.sal_data.ret.r10, 0);
615 vcpu_set_gr(vcpu, 11, p->u.sal_data.ret.r11, 0); 616 vcpu_set_gr(vcpu, 11, p->u.sal_data.ret.r11, 0);
616 } else 617 } else
617 panic_vm(vcpu); 618 panic_vm(vcpu, "Mis-set for exit reason!\n");
618} 619}
619 620
620void kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs, 621void kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs,
@@ -680,7 +681,7 @@ static void generate_exirq(struct kvm_vcpu *vcpu)
680 vpsr = VCPU(vcpu, vpsr); 681 vpsr = VCPU(vcpu, vpsr);
681 isr = vpsr & IA64_PSR_RI; 682 isr = vpsr & IA64_PSR_RI;
682 if (!(vpsr & IA64_PSR_IC)) 683 if (!(vpsr & IA64_PSR_IC))
683 panic_vm(vcpu); 684 panic_vm(vcpu, "Trying to inject one IRQ with psr.ic=0\n");
684 reflect_interruption(0, isr, 0, 12, regs); /* EXT IRQ */ 685 reflect_interruption(0, isr, 0, 12, regs); /* EXT IRQ */
685} 686}
686 687
@@ -941,8 +942,20 @@ static void vcpu_do_resume(struct kvm_vcpu *vcpu)
941 ia64_set_pta(vcpu->arch.vhpt.pta.val); 942 ia64_set_pta(vcpu->arch.vhpt.pta.val);
942} 943}
943 944
945static void vmm_sanity_check(struct kvm_vcpu *vcpu)
946{
947 struct exit_ctl_data *p = &vcpu->arch.exit_data;
948
949 if (!vmm_sanity && p->exit_reason != EXIT_REASON_DEBUG) {
950 panic_vm(vcpu, "Failed to do vmm sanity check,"
951 "it maybe caused by crashed vmm!!\n\n");
952 }
953}
954
944static void kvm_do_resume_op(struct kvm_vcpu *vcpu) 955static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
945{ 956{
957 vmm_sanity_check(vcpu); /*Guarantee vcpu runing on healthy vmm!*/
958
946 if (test_and_clear_bit(KVM_REQ_RESUME, &vcpu->requests)) { 959 if (test_and_clear_bit(KVM_REQ_RESUME, &vcpu->requests)) {
947 vcpu_do_resume(vcpu); 960 vcpu_do_resume(vcpu);
948 return; 961 return;
@@ -968,3 +981,11 @@ void vmm_transition(struct kvm_vcpu *vcpu)
968 1, 0, 0, 0, 0, 0); 981 1, 0, 0, 0, 0, 0);
969 kvm_do_resume_op(vcpu); 982 kvm_do_resume_op(vcpu);
970} 983}
984
985void vmm_panic_handler(u64 vec)
986{
987 struct kvm_vcpu *vcpu = current_vcpu;
988 vmm_sanity = 0;
989 panic_vm(vcpu, "Unexpected interruption occurs in VMM, vector:0x%lx\n",
990 vec2off[vec]);
991}
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index e44027ce5667..ecd526b55323 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -816,8 +816,9 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
816 unsigned long vitv = VCPU(vcpu, itv); 816 unsigned long vitv = VCPU(vcpu, itv);
817 817
818 if (vcpu->vcpu_id == 0) { 818 if (vcpu->vcpu_id == 0) {
819 for (i = 0; i < MAX_VCPU_NUM; i++) { 819 for (i = 0; i < KVM_MAX_VCPUS; i++) {
820 v = (struct kvm_vcpu *)((char *)vcpu + VCPU_SIZE * i); 820 v = (struct kvm_vcpu *)((char *)vcpu +
821 sizeof(struct kvm_vcpu_data) * i);
821 VMX(v, itc_offset) = itc_offset; 822 VMX(v, itc_offset) = itc_offset;
822 VMX(v, last_itc) = 0; 823 VMX(v, last_itc) = 0;
823 } 824 }
@@ -1650,7 +1651,8 @@ void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val)
1650 * Otherwise panic 1651 * Otherwise panic
1651 */ 1652 */
1652 if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM)) 1653 if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM))
1653 panic_vm(vcpu); 1654 panic_vm(vcpu, "Only support guests with vpsr.pk =0 \
1655 & vpsr.is=0\n");
1654 1656
1655 /* 1657 /*
1656 * For those IA64_PSR bits: id/da/dd/ss/ed/ia 1658 * For those IA64_PSR bits: id/da/dd/ss/ed/ia
@@ -2103,7 +2105,7 @@ void kvm_init_all_rr(struct kvm_vcpu *vcpu)
2103 2105
2104 if (is_physical_mode(vcpu)) { 2106 if (is_physical_mode(vcpu)) {
2105 if (vcpu->arch.mode_flags & GUEST_PHY_EMUL) 2107 if (vcpu->arch.mode_flags & GUEST_PHY_EMUL)
2106 panic_vm(vcpu); 2108 panic_vm(vcpu, "Machine Status conflicts!\n");
2107 2109
2108 ia64_set_rr((VRN0 << VRN_SHIFT), vcpu->arch.metaphysical_rr0); 2110 ia64_set_rr((VRN0 << VRN_SHIFT), vcpu->arch.metaphysical_rr0);
2109 ia64_dv_serialize_data(); 2111 ia64_dv_serialize_data();
@@ -2152,10 +2154,70 @@ int vmm_entry(void)
2152 return 0; 2154 return 0;
2153} 2155}
2154 2156
2155void panic_vm(struct kvm_vcpu *v) 2157static void kvm_show_registers(struct kvm_pt_regs *regs)
2156{ 2158{
2159 unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
2160
2161 struct kvm_vcpu *vcpu = current_vcpu;
2162 if (vcpu != NULL)
2163 printk("vcpu 0x%p vcpu %d\n",
2164 vcpu, vcpu->vcpu_id);
2165
2166 printk("psr : %016lx ifs : %016lx ip : [<%016lx>]\n",
2167 regs->cr_ipsr, regs->cr_ifs, ip);
2168
2169 printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
2170 regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
2171 printk("rnat: %016lx bspstore: %016lx pr : %016lx\n",
2172 regs->ar_rnat, regs->ar_bspstore, regs->pr);
2173 printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n",
2174 regs->loadrs, regs->ar_ccv, regs->ar_fpsr);
2175 printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd);
2176 printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0,
2177 regs->b6, regs->b7);
2178 printk("f6 : %05lx%016lx f7 : %05lx%016lx\n",
2179 regs->f6.u.bits[1], regs->f6.u.bits[0],
2180 regs->f7.u.bits[1], regs->f7.u.bits[0]);
2181 printk("f8 : %05lx%016lx f9 : %05lx%016lx\n",
2182 regs->f8.u.bits[1], regs->f8.u.bits[0],
2183 regs->f9.u.bits[1], regs->f9.u.bits[0]);
2184 printk("f10 : %05lx%016lx f11 : %05lx%016lx\n",
2185 regs->f10.u.bits[1], regs->f10.u.bits[0],
2186 regs->f11.u.bits[1], regs->f11.u.bits[0]);
2187
2188 printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1,
2189 regs->r2, regs->r3);
2190 printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8,
2191 regs->r9, regs->r10);
2192 printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11,
2193 regs->r12, regs->r13);
2194 printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14,
2195 regs->r15, regs->r16);
2196 printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17,
2197 regs->r18, regs->r19);
2198 printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20,
2199 regs->r21, regs->r22);
2200 printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23,
2201 regs->r24, regs->r25);
2202 printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26,
2203 regs->r27, regs->r28);
2204 printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29,
2205 regs->r30, regs->r31);
2206
2207}
2208
2209void panic_vm(struct kvm_vcpu *v, const char *fmt, ...)
2210{
2211 va_list args;
2212 char buf[256];
2213
2214 struct kvm_pt_regs *regs = vcpu_regs(v);
2157 struct exit_ctl_data *p = &v->arch.exit_data; 2215 struct exit_ctl_data *p = &v->arch.exit_data;
2158 2216 va_start(args, fmt);
2217 vsnprintf(buf, sizeof(buf), fmt, args);
2218 va_end(args);
2219 printk(buf);
2220 kvm_show_registers(regs);
2159 p->exit_reason = EXIT_REASON_VM_PANIC; 2221 p->exit_reason = EXIT_REASON_VM_PANIC;
2160 vmm_transition(v); 2222 vmm_transition(v);
2161 /*Never to return*/ 2223 /*Never to return*/
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
index e9b2a4e121c0..b2f12a562bdf 100644
--- a/arch/ia64/kvm/vcpu.h
+++ b/arch/ia64/kvm/vcpu.h
@@ -737,9 +737,12 @@ void kvm_init_vtlb(struct kvm_vcpu *v);
737void kvm_init_vhpt(struct kvm_vcpu *v); 737void kvm_init_vhpt(struct kvm_vcpu *v);
738void thash_init(struct thash_cb *hcb, u64 sz); 738void thash_init(struct thash_cb *hcb, u64 sz);
739 739
740void panic_vm(struct kvm_vcpu *v); 740void panic_vm(struct kvm_vcpu *v, const char *fmt, ...);
741 741
742extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3, 742extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3,
743 u64 arg4, u64 arg5, u64 arg6, u64 arg7); 743 u64 arg4, u64 arg5, u64 arg6, u64 arg7);
744
745extern long vmm_sanity;
746
744#endif 747#endif
745#endif /* __VCPU_H__ */ 748#endif /* __VCPU_H__ */
diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index 2275bf4e681a..9eee5c04bacc 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22 22
23#include<linux/kernel.h>
23#include<linux/module.h> 24#include<linux/module.h>
24#include<asm/fpswa.h> 25#include<asm/fpswa.h>
25 26
@@ -31,6 +32,8 @@ MODULE_LICENSE("GPL");
31extern char kvm_ia64_ivt; 32extern char kvm_ia64_ivt;
32extern fpswa_interface_t *vmm_fpswa_interface; 33extern fpswa_interface_t *vmm_fpswa_interface;
33 34
35long vmm_sanity = 1;
36
34struct kvm_vmm_info vmm_info = { 37struct kvm_vmm_info vmm_info = {
35 .module = THIS_MODULE, 38 .module = THIS_MODULE,
36 .vmm_entry = vmm_entry, 39 .vmm_entry = vmm_entry,
@@ -62,5 +65,31 @@ void vmm_spin_unlock(spinlock_t *lock)
62{ 65{
63 _vmm_raw_spin_unlock(lock); 66 _vmm_raw_spin_unlock(lock);
64} 67}
68
69static void vcpu_debug_exit(struct kvm_vcpu *vcpu)
70{
71 struct exit_ctl_data *p = &vcpu->arch.exit_data;
72 long psr;
73
74 local_irq_save(psr);
75 p->exit_reason = EXIT_REASON_DEBUG;
76 vmm_transition(vcpu);
77 local_irq_restore(psr);
78}
79
80asmlinkage int printk(const char *fmt, ...)
81{
82 struct kvm_vcpu *vcpu = current_vcpu;
83 va_list args;
84 int r;
85
86 memset(vcpu->arch.log_buf, 0, VMM_LOG_LEN);
87 va_start(args, fmt);
88 r = vsnprintf(vcpu->arch.log_buf, VMM_LOG_LEN, fmt, args);
89 va_end(args);
90 vcpu_debug_exit(vcpu);
91 return r;
92}
93
65module_init(kvm_vmm_init) 94module_init(kvm_vmm_init)
66module_exit(kvm_vmm_exit) 95module_exit(kvm_vmm_exit)
diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
index c1d7251a1480..3ef1a017a318 100644
--- a/arch/ia64/kvm/vmm_ivt.S
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -1,5 +1,5 @@
1/* 1/*
2 * /ia64/kvm_ivt.S 2 * arch/ia64/kvm/vmm_ivt.S
3 * 3 *
4 * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co 4 * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
5 * Stephane Eranian <eranian@hpl.hp.com> 5 * Stephane Eranian <eranian@hpl.hp.com>
@@ -70,32 +70,39 @@
70# define PSR_DEFAULT_BITS 0 70# define PSR_DEFAULT_BITS 0
71#endif 71#endif
72 72
73
74#define KVM_FAULT(n) \ 73#define KVM_FAULT(n) \
75 kvm_fault_##n:; \ 74 kvm_fault_##n:; \
76 mov r19=n;; \ 75 mov r19=n;; \
77 br.sptk.many kvm_fault_##n; \ 76 br.sptk.many kvm_vmm_panic; \
78 ;; \ 77 ;; \
79
80 78
81#define KVM_REFLECT(n) \ 79#define KVM_REFLECT(n) \
82 mov r31=pr; \ 80 mov r31=pr; \
83 mov r19=n; /* prepare to save predicates */ \ 81 mov r19=n; /* prepare to save predicates */ \
84 mov r29=cr.ipsr; \ 82 mov r29=cr.ipsr; \
85 ;; \ 83 ;; \
86 tbit.z p6,p7=r29,IA64_PSR_VM_BIT; \ 84 tbit.z p6,p7=r29,IA64_PSR_VM_BIT; \
87(p7)br.sptk.many kvm_dispatch_reflection; \ 85(p7) br.sptk.many kvm_dispatch_reflection; \
88 br.sptk.many kvm_panic; \ 86 br.sptk.many kvm_vmm_panic; \
89 87
90 88GLOBAL_ENTRY(kvm_vmm_panic)
91GLOBAL_ENTRY(kvm_panic) 89 KVM_SAVE_MIN_WITH_COVER_R19
92 br.sptk.many kvm_panic 90 alloc r14=ar.pfs,0,0,1,0
93 ;; 91 mov out0=r15
94END(kvm_panic) 92 adds r3=8,r2 // set up second base pointer
95 93 ;;
96 94 ssm psr.ic
97 95 ;;
98 96 srlz.i // guarantee that interruption collection is on
97 ;;
98 //(p15) ssm psr.i // restore psr.i
99 addl r14=@gprel(ia64_leave_hypervisor),gp
100 ;;
101 KVM_SAVE_REST
102 mov rp=r14
103 ;;
104 br.call.sptk.many b6=vmm_panic_handler;
105END(kvm_vmm_panic)
99 106
100 .section .text.ivt,"ax" 107 .section .text.ivt,"ax"
101 108
@@ -105,308 +112,307 @@ kvm_ia64_ivt:
105/////////////////////////////////////////////////////////////// 112///////////////////////////////////////////////////////////////
106// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47) 113// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
107ENTRY(kvm_vhpt_miss) 114ENTRY(kvm_vhpt_miss)
108 KVM_FAULT(0) 115 KVM_FAULT(0)
109END(kvm_vhpt_miss) 116END(kvm_vhpt_miss)
110 117
111
112 .org kvm_ia64_ivt+0x400 118 .org kvm_ia64_ivt+0x400
113//////////////////////////////////////////////////////////////// 119////////////////////////////////////////////////////////////////
114// 0x0400 Entry 1 (size 64 bundles) ITLB (21) 120// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
115ENTRY(kvm_itlb_miss) 121ENTRY(kvm_itlb_miss)
116 mov r31 = pr 122 mov r31 = pr
117 mov r29=cr.ipsr; 123 mov r29=cr.ipsr;
118 ;; 124 ;;
119 tbit.z p6,p7=r29,IA64_PSR_VM_BIT; 125 tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
120 (p6) br.sptk kvm_alt_itlb_miss 126(p6) br.sptk kvm_alt_itlb_miss
121 mov r19 = 1 127 mov r19 = 1
122 br.sptk kvm_itlb_miss_dispatch 128 br.sptk kvm_itlb_miss_dispatch
123 KVM_FAULT(1); 129 KVM_FAULT(1);
124END(kvm_itlb_miss) 130END(kvm_itlb_miss)
125 131
126 .org kvm_ia64_ivt+0x0800 132 .org kvm_ia64_ivt+0x0800
127////////////////////////////////////////////////////////////////// 133//////////////////////////////////////////////////////////////////
128// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48) 134// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
129ENTRY(kvm_dtlb_miss) 135ENTRY(kvm_dtlb_miss)
130 mov r31 = pr 136 mov r31 = pr
131 mov r29=cr.ipsr; 137 mov r29=cr.ipsr;
132 ;; 138 ;;
133 tbit.z p6,p7=r29,IA64_PSR_VM_BIT; 139 tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
134(p6)br.sptk kvm_alt_dtlb_miss 140(p6) br.sptk kvm_alt_dtlb_miss
135 br.sptk kvm_dtlb_miss_dispatch 141 br.sptk kvm_dtlb_miss_dispatch
136END(kvm_dtlb_miss) 142END(kvm_dtlb_miss)
137 143
138 .org kvm_ia64_ivt+0x0c00 144 .org kvm_ia64_ivt+0x0c00
139//////////////////////////////////////////////////////////////////// 145////////////////////////////////////////////////////////////////////
140// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19) 146// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
141ENTRY(kvm_alt_itlb_miss) 147ENTRY(kvm_alt_itlb_miss)
142 mov r16=cr.ifa // get address that caused the TLB miss 148 mov r16=cr.ifa // get address that caused the TLB miss
143 ;; 149 ;;
144 movl r17=PAGE_KERNEL 150 movl r17=PAGE_KERNEL
145 mov r24=cr.ipsr 151 mov r24=cr.ipsr
146 movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) 152 movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
147 ;; 153 ;;
148 and r19=r19,r16 // clear ed, reserved bits, and PTE control bits 154 and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
149 ;; 155 ;;
150 or r19=r17,r19 // insert PTE control bits into r19 156 or r19=r17,r19 // insert PTE control bits into r19
151 ;; 157 ;;
152 movl r20=IA64_GRANULE_SHIFT<<2 158 movl r20=IA64_GRANULE_SHIFT<<2
153 ;; 159 ;;
154 mov cr.itir=r20 160 mov cr.itir=r20
155 ;; 161 ;;
156 itc.i r19 // insert the TLB entry 162 itc.i r19 // insert the TLB entry
157 mov pr=r31,-1 163 mov pr=r31,-1
158 rfi 164 rfi
159END(kvm_alt_itlb_miss) 165END(kvm_alt_itlb_miss)
160 166
161 .org kvm_ia64_ivt+0x1000 167 .org kvm_ia64_ivt+0x1000
162///////////////////////////////////////////////////////////////////// 168/////////////////////////////////////////////////////////////////////
163// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46) 169// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
164ENTRY(kvm_alt_dtlb_miss) 170ENTRY(kvm_alt_dtlb_miss)
165 mov r16=cr.ifa // get address that caused the TLB miss 171 mov r16=cr.ifa // get address that caused the TLB miss
166 ;; 172 ;;
167 movl r17=PAGE_KERNEL 173 movl r17=PAGE_KERNEL
168 movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) 174 movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
169 mov r24=cr.ipsr 175 mov r24=cr.ipsr
170 ;; 176 ;;
171 and r19=r19,r16 // clear ed, reserved bits, and PTE control bits 177 and r19=r19,r16 // clear ed, reserved bits, and PTE control bits
172 ;; 178 ;;
173 or r19=r19,r17 // insert PTE control bits into r19 179 or r19=r19,r17 // insert PTE control bits into r19
174 ;; 180 ;;
175 movl r20=IA64_GRANULE_SHIFT<<2 181 movl r20=IA64_GRANULE_SHIFT<<2
176 ;; 182 ;;
177 mov cr.itir=r20 183 mov cr.itir=r20
178 ;; 184 ;;
179 itc.d r19 // insert the TLB entry 185 itc.d r19 // insert the TLB entry
180 mov pr=r31,-1 186 mov pr=r31,-1
181 rfi 187 rfi
182END(kvm_alt_dtlb_miss) 188END(kvm_alt_dtlb_miss)
183 189
184 .org kvm_ia64_ivt+0x1400 190 .org kvm_ia64_ivt+0x1400
185////////////////////////////////////////////////////////////////////// 191//////////////////////////////////////////////////////////////////////
186// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45) 192// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
187ENTRY(kvm_nested_dtlb_miss) 193ENTRY(kvm_nested_dtlb_miss)
188 KVM_FAULT(5) 194 KVM_FAULT(5)
189END(kvm_nested_dtlb_miss) 195END(kvm_nested_dtlb_miss)
190 196
191 .org kvm_ia64_ivt+0x1800 197 .org kvm_ia64_ivt+0x1800
192///////////////////////////////////////////////////////////////////// 198/////////////////////////////////////////////////////////////////////
193// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24) 199// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
194ENTRY(kvm_ikey_miss) 200ENTRY(kvm_ikey_miss)
195 KVM_REFLECT(6) 201 KVM_REFLECT(6)
196END(kvm_ikey_miss) 202END(kvm_ikey_miss)
197 203
198 .org kvm_ia64_ivt+0x1c00 204 .org kvm_ia64_ivt+0x1c00
199///////////////////////////////////////////////////////////////////// 205/////////////////////////////////////////////////////////////////////
200// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51) 206// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
201ENTRY(kvm_dkey_miss) 207ENTRY(kvm_dkey_miss)
202 KVM_REFLECT(7) 208 KVM_REFLECT(7)
203END(kvm_dkey_miss) 209END(kvm_dkey_miss)
204 210
205 .org kvm_ia64_ivt+0x2000 211 .org kvm_ia64_ivt+0x2000
206//////////////////////////////////////////////////////////////////// 212////////////////////////////////////////////////////////////////////
207// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54) 213// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
208ENTRY(kvm_dirty_bit) 214ENTRY(kvm_dirty_bit)
209 KVM_REFLECT(8) 215 KVM_REFLECT(8)
210END(kvm_dirty_bit) 216END(kvm_dirty_bit)
211 217
212 .org kvm_ia64_ivt+0x2400 218 .org kvm_ia64_ivt+0x2400
213//////////////////////////////////////////////////////////////////// 219////////////////////////////////////////////////////////////////////
214// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27) 220// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
215ENTRY(kvm_iaccess_bit) 221ENTRY(kvm_iaccess_bit)
216 KVM_REFLECT(9) 222 KVM_REFLECT(9)
217END(kvm_iaccess_bit) 223END(kvm_iaccess_bit)
218 224
219 .org kvm_ia64_ivt+0x2800 225 .org kvm_ia64_ivt+0x2800
220/////////////////////////////////////////////////////////////////// 226///////////////////////////////////////////////////////////////////
221// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55) 227// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
222ENTRY(kvm_daccess_bit) 228ENTRY(kvm_daccess_bit)
223 KVM_REFLECT(10) 229 KVM_REFLECT(10)
224END(kvm_daccess_bit) 230END(kvm_daccess_bit)
225 231
226 .org kvm_ia64_ivt+0x2c00 232 .org kvm_ia64_ivt+0x2c00
227///////////////////////////////////////////////////////////////// 233/////////////////////////////////////////////////////////////////
228// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33) 234// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
229ENTRY(kvm_break_fault) 235ENTRY(kvm_break_fault)
230 mov r31=pr 236 mov r31=pr
231 mov r19=11 237 mov r19=11
232 mov r29=cr.ipsr 238 mov r29=cr.ipsr
233 ;; 239 ;;
234 KVM_SAVE_MIN_WITH_COVER_R19 240 KVM_SAVE_MIN_WITH_COVER_R19
235 ;; 241 ;;
236 alloc r14=ar.pfs,0,0,4,0 // now it's safe (must be first in insn group!) 242 alloc r14=ar.pfs,0,0,4,0 //(must be first in insn group!)
237 mov out0=cr.ifa 243 mov out0=cr.ifa
238 mov out2=cr.isr // FIXME: pity to make this slow access twice 244 mov out2=cr.isr // FIXME: pity to make this slow access twice
239 mov out3=cr.iim // FIXME: pity to make this slow access twice 245 mov out3=cr.iim // FIXME: pity to make this slow access twice
240 adds r3=8,r2 // set up second base pointer 246 adds r3=8,r2 // set up second base pointer
241 ;; 247 ;;
242 ssm psr.ic 248 ssm psr.ic
243 ;; 249 ;;
244 srlz.i // guarantee that interruption collection is on 250 srlz.i // guarantee that interruption collection is on
245 ;; 251 ;;
246 //(p15)ssm psr.i // restore psr.i 252 //(p15)ssm psr.i // restore psr.i
247 addl r14=@gprel(ia64_leave_hypervisor),gp 253 addl r14=@gprel(ia64_leave_hypervisor),gp
248 ;; 254 ;;
249 KVM_SAVE_REST 255 KVM_SAVE_REST
250 mov rp=r14 256 mov rp=r14
251 ;; 257 ;;
252 adds out1=16,sp 258 adds out1=16,sp
253 br.call.sptk.many b6=kvm_ia64_handle_break 259 br.call.sptk.many b6=kvm_ia64_handle_break
254 ;; 260 ;;
255END(kvm_break_fault) 261END(kvm_break_fault)
256 262
257 .org kvm_ia64_ivt+0x3000 263 .org kvm_ia64_ivt+0x3000
258///////////////////////////////////////////////////////////////// 264/////////////////////////////////////////////////////////////////
259// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4) 265// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
260ENTRY(kvm_interrupt) 266ENTRY(kvm_interrupt)
261 mov r31=pr // prepare to save predicates 267 mov r31=pr // prepare to save predicates
262 mov r19=12 268 mov r19=12
263 mov r29=cr.ipsr 269 mov r29=cr.ipsr
264 ;; 270 ;;
265 tbit.z p6,p7=r29,IA64_PSR_VM_BIT 271 tbit.z p6,p7=r29,IA64_PSR_VM_BIT
266 tbit.z p0,p15=r29,IA64_PSR_I_BIT 272 tbit.z p0,p15=r29,IA64_PSR_I_BIT
267 ;; 273 ;;
268(p7) br.sptk kvm_dispatch_interrupt 274(p7) br.sptk kvm_dispatch_interrupt
269 ;; 275 ;;
270 mov r27=ar.rsc /* M */ 276 mov r27=ar.rsc /* M */
271 mov r20=r1 /* A */ 277 mov r20=r1 /* A */
272 mov r25=ar.unat /* M */ 278 mov r25=ar.unat /* M */
273 mov r26=ar.pfs /* I */ 279 mov r26=ar.pfs /* I */
274 mov r28=cr.iip /* M */ 280 mov r28=cr.iip /* M */
275 cover /* B (or nothing) */ 281 cover /* B (or nothing) */
276 ;; 282 ;;
277 mov r1=sp 283 mov r1=sp
278 ;; 284 ;;
279 invala /* M */ 285 invala /* M */
280 mov r30=cr.ifs 286 mov r30=cr.ifs
281 ;; 287 ;;
282 addl r1=-VMM_PT_REGS_SIZE,r1 288 addl r1=-VMM_PT_REGS_SIZE,r1
283 ;; 289 ;;
284 adds r17=2*L1_CACHE_BYTES,r1 /* really: biggest cache-line size */ 290 adds r17=2*L1_CACHE_BYTES,r1 /* really: biggest cache-line size */
285 adds r16=PT(CR_IPSR),r1 291 adds r16=PT(CR_IPSR),r1
286 ;; 292 ;;
287 lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES 293 lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES
288 st8 [r16]=r29 /* save cr.ipsr */ 294 st8 [r16]=r29 /* save cr.ipsr */
289 ;; 295 ;;
290 lfetch.fault.excl.nt1 [r17] 296 lfetch.fault.excl.nt1 [r17]
291 mov r29=b0 297 mov r29=b0
292 ;; 298 ;;
293 adds r16=PT(R8),r1 /* initialize first base pointer */ 299 adds r16=PT(R8),r1 /* initialize first base pointer */
294 adds r17=PT(R9),r1 /* initialize second base pointer */ 300 adds r17=PT(R9),r1 /* initialize second base pointer */
295 mov r18=r0 /* make sure r18 isn't NaT */ 301 mov r18=r0 /* make sure r18 isn't NaT */
296 ;; 302 ;;
297.mem.offset 0,0; st8.spill [r16]=r8,16 303.mem.offset 0,0; st8.spill [r16]=r8,16
298.mem.offset 8,0; st8.spill [r17]=r9,16 304.mem.offset 8,0; st8.spill [r17]=r9,16
299 ;; 305 ;;
300.mem.offset 0,0; st8.spill [r16]=r10,24 306.mem.offset 0,0; st8.spill [r16]=r10,24
301.mem.offset 8,0; st8.spill [r17]=r11,24 307.mem.offset 8,0; st8.spill [r17]=r11,24
302 ;; 308 ;;
303 st8 [r16]=r28,16 /* save cr.iip */ 309 st8 [r16]=r28,16 /* save cr.iip */
304 st8 [r17]=r30,16 /* save cr.ifs */ 310 st8 [r17]=r30,16 /* save cr.ifs */
305 mov r8=ar.fpsr /* M */ 311 mov r8=ar.fpsr /* M */
306 mov r9=ar.csd 312 mov r9=ar.csd
307 mov r10=ar.ssd 313 mov r10=ar.ssd
308 movl r11=FPSR_DEFAULT /* L-unit */ 314 movl r11=FPSR_DEFAULT /* L-unit */
309 ;; 315 ;;
310 st8 [r16]=r25,16 /* save ar.unat */ 316 st8 [r16]=r25,16 /* save ar.unat */
311 st8 [r17]=r26,16 /* save ar.pfs */ 317 st8 [r17]=r26,16 /* save ar.pfs */
312 shl r18=r18,16 /* compute ar.rsc to be used for "loadrs" */ 318 shl r18=r18,16 /* compute ar.rsc to be used for "loadrs" */
313 ;; 319 ;;
314 st8 [r16]=r27,16 /* save ar.rsc */ 320 st8 [r16]=r27,16 /* save ar.rsc */
315 adds r17=16,r17 /* skip over ar_rnat field */ 321 adds r17=16,r17 /* skip over ar_rnat field */
316 ;; 322 ;;
317 st8 [r17]=r31,16 /* save predicates */ 323 st8 [r17]=r31,16 /* save predicates */
318 adds r16=16,r16 /* skip over ar_bspstore field */ 324 adds r16=16,r16 /* skip over ar_bspstore field */
319 ;; 325 ;;
320 st8 [r16]=r29,16 /* save b0 */ 326 st8 [r16]=r29,16 /* save b0 */
321 st8 [r17]=r18,16 /* save ar.rsc value for "loadrs" */ 327 st8 [r17]=r18,16 /* save ar.rsc value for "loadrs" */
322 ;; 328 ;;
323.mem.offset 0,0; st8.spill [r16]=r20,16 /* save original r1 */ 329.mem.offset 0,0; st8.spill [r16]=r20,16 /* save original r1 */
324.mem.offset 8,0; st8.spill [r17]=r12,16 330.mem.offset 8,0; st8.spill [r17]=r12,16
325 adds r12=-16,r1 331 adds r12=-16,r1
326 /* switch to kernel memory stack (with 16 bytes of scratch) */ 332 /* switch to kernel memory stack (with 16 bytes of scratch) */
327 ;; 333 ;;
328.mem.offset 0,0; st8.spill [r16]=r13,16 334.mem.offset 0,0; st8.spill [r16]=r13,16
329.mem.offset 8,0; st8.spill [r17]=r8,16 /* save ar.fpsr */ 335.mem.offset 8,0; st8.spill [r17]=r8,16 /* save ar.fpsr */
330 ;; 336 ;;
331.mem.offset 0,0; st8.spill [r16]=r15,16 337.mem.offset 0,0; st8.spill [r16]=r15,16
332.mem.offset 8,0; st8.spill [r17]=r14,16 338.mem.offset 8,0; st8.spill [r17]=r14,16
333 dep r14=-1,r0,60,4 339 dep r14=-1,r0,60,4
334 ;; 340 ;;
335.mem.offset 0,0; st8.spill [r16]=r2,16 341.mem.offset 0,0; st8.spill [r16]=r2,16
336.mem.offset 8,0; st8.spill [r17]=r3,16 342.mem.offset 8,0; st8.spill [r17]=r3,16
337 adds r2=VMM_PT_REGS_R16_OFFSET,r1 343 adds r2=VMM_PT_REGS_R16_OFFSET,r1
338 adds r14 = VMM_VCPU_GP_OFFSET,r13 344 adds r14 = VMM_VCPU_GP_OFFSET,r13
339 ;; 345 ;;
340 mov r8=ar.ccv 346 mov r8=ar.ccv
341 ld8 r14 = [r14] 347 ld8 r14 = [r14]
342 ;; 348 ;;
343 mov r1=r14 /* establish kernel global pointer */ 349 mov r1=r14 /* establish kernel global pointer */
344 ;; \ 350 ;; \
345 bsw.1 351 bsw.1
346 ;; 352 ;;
347 alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group 353 alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
348 mov out0=r13 354 mov out0=r13
349 ;; 355 ;;
350 ssm psr.ic 356 ssm psr.ic
351 ;; 357 ;;
352 srlz.i 358 srlz.i
353 ;; 359 ;;
354 //(p15) ssm psr.i 360 //(p15) ssm psr.i
355 adds r3=8,r2 // set up second base pointer for SAVE_REST 361 adds r3=8,r2 // set up second base pointer for SAVE_REST
356 srlz.i // ensure everybody knows psr.ic is back on 362 srlz.i // ensure everybody knows psr.ic is back on
357 ;; 363 ;;
358.mem.offset 0,0; st8.spill [r2]=r16,16 364.mem.offset 0,0; st8.spill [r2]=r16,16
359.mem.offset 8,0; st8.spill [r3]=r17,16 365.mem.offset 8,0; st8.spill [r3]=r17,16
360 ;; 366 ;;
361.mem.offset 0,0; st8.spill [r2]=r18,16 367.mem.offset 0,0; st8.spill [r2]=r18,16
362.mem.offset 8,0; st8.spill [r3]=r19,16 368.mem.offset 8,0; st8.spill [r3]=r19,16
363 ;; 369 ;;
364.mem.offset 0,0; st8.spill [r2]=r20,16 370.mem.offset 0,0; st8.spill [r2]=r20,16
365.mem.offset 8,0; st8.spill [r3]=r21,16 371.mem.offset 8,0; st8.spill [r3]=r21,16
366 mov r18=b6 372 mov r18=b6
367 ;; 373 ;;
368.mem.offset 0,0; st8.spill [r2]=r22,16 374.mem.offset 0,0; st8.spill [r2]=r22,16
369.mem.offset 8,0; st8.spill [r3]=r23,16 375.mem.offset 8,0; st8.spill [r3]=r23,16
370 mov r19=b7 376 mov r19=b7
371 ;; 377 ;;
372.mem.offset 0,0; st8.spill [r2]=r24,16 378.mem.offset 0,0; st8.spill [r2]=r24,16
373.mem.offset 8,0; st8.spill [r3]=r25,16 379.mem.offset 8,0; st8.spill [r3]=r25,16
374 ;; 380 ;;
375.mem.offset 0,0; st8.spill [r2]=r26,16 381.mem.offset 0,0; st8.spill [r2]=r26,16
376.mem.offset 8,0; st8.spill [r3]=r27,16 382.mem.offset 8,0; st8.spill [r3]=r27,16
377 ;; 383 ;;
378.mem.offset 0,0; st8.spill [r2]=r28,16 384.mem.offset 0,0; st8.spill [r2]=r28,16
379.mem.offset 8,0; st8.spill [r3]=r29,16 385.mem.offset 8,0; st8.spill [r3]=r29,16
380 ;; 386 ;;
381.mem.offset 0,0; st8.spill [r2]=r30,16 387.mem.offset 0,0; st8.spill [r2]=r30,16
382.mem.offset 8,0; st8.spill [r3]=r31,32 388.mem.offset 8,0; st8.spill [r3]=r31,32
383 ;; 389 ;;
384 mov ar.fpsr=r11 /* M-unit */ 390 mov ar.fpsr=r11 /* M-unit */
385 st8 [r2]=r8,8 /* ar.ccv */ 391 st8 [r2]=r8,8 /* ar.ccv */
386 adds r24=PT(B6)-PT(F7),r3 392 adds r24=PT(B6)-PT(F7),r3
387 ;; 393 ;;
388 stf.spill [r2]=f6,32 394 stf.spill [r2]=f6,32
389 stf.spill [r3]=f7,32 395 stf.spill [r3]=f7,32
390 ;; 396 ;;
391 stf.spill [r2]=f8,32 397 stf.spill [r2]=f8,32
392 stf.spill [r3]=f9,32 398 stf.spill [r3]=f9,32
393 ;; 399 ;;
394 stf.spill [r2]=f10 400 stf.spill [r2]=f10
395 stf.spill [r3]=f11 401 stf.spill [r3]=f11
396 adds r25=PT(B7)-PT(F11),r3 402 adds r25=PT(B7)-PT(F11),r3
397 ;; 403 ;;
398 st8 [r24]=r18,16 /* b6 */ 404 st8 [r24]=r18,16 /* b6 */
399 st8 [r25]=r19,16 /* b7 */ 405 st8 [r25]=r19,16 /* b7 */
400 ;; 406 ;;
401 st8 [r24]=r9 /* ar.csd */ 407 st8 [r24]=r9 /* ar.csd */
402 st8 [r25]=r10 /* ar.ssd */ 408 st8 [r25]=r10 /* ar.ssd */
403 ;; 409 ;;
404 srlz.d // make sure we see the effect of cr.ivr 410 srlz.d // make sure we see the effect of cr.ivr
405 addl r14=@gprel(ia64_leave_nested),gp 411 addl r14=@gprel(ia64_leave_nested),gp
406 ;; 412 ;;
407 mov rp=r14 413 mov rp=r14
408 br.call.sptk.many b6=kvm_ia64_handle_irq 414 br.call.sptk.many b6=kvm_ia64_handle_irq
409 ;; 415 ;;
410END(kvm_interrupt) 416END(kvm_interrupt)
411 417
412 .global kvm_dispatch_vexirq 418 .global kvm_dispatch_vexirq
@@ -414,387 +420,385 @@ END(kvm_interrupt)
414////////////////////////////////////////////////////////////////////// 420//////////////////////////////////////////////////////////////////////
415// 0x3400 Entry 13 (size 64 bundles) Reserved 421// 0x3400 Entry 13 (size 64 bundles) Reserved
416ENTRY(kvm_virtual_exirq) 422ENTRY(kvm_virtual_exirq)
417 mov r31=pr 423 mov r31=pr
418 mov r19=13 424 mov r19=13
419 mov r30 =r0 425 mov r30 =r0
420 ;; 426 ;;
421kvm_dispatch_vexirq: 427kvm_dispatch_vexirq:
422 cmp.eq p6,p0 = 1,r30 428 cmp.eq p6,p0 = 1,r30
423 ;; 429 ;;
424(p6)add r29 = VMM_VCPU_SAVED_GP_OFFSET,r21 430(p6) add r29 = VMM_VCPU_SAVED_GP_OFFSET,r21
425 ;; 431 ;;
426(p6)ld8 r1 = [r29] 432(p6) ld8 r1 = [r29]
427 ;; 433 ;;
428 KVM_SAVE_MIN_WITH_COVER_R19 434 KVM_SAVE_MIN_WITH_COVER_R19
429 alloc r14=ar.pfs,0,0,1,0 435 alloc r14=ar.pfs,0,0,1,0
430 mov out0=r13 436 mov out0=r13
431 437
432 ssm psr.ic 438 ssm psr.ic
433 ;; 439 ;;
434 srlz.i // guarantee that interruption collection is on 440 srlz.i // guarantee that interruption collection is on
435 ;; 441 ;;
436 //(p15) ssm psr.i // restore psr.i 442 //(p15) ssm psr.i // restore psr.i
437 adds r3=8,r2 // set up second base pointer 443 adds r3=8,r2 // set up second base pointer
438 ;; 444 ;;
439 KVM_SAVE_REST 445 KVM_SAVE_REST
440 addl r14=@gprel(ia64_leave_hypervisor),gp 446 addl r14=@gprel(ia64_leave_hypervisor),gp
441 ;; 447 ;;
442 mov rp=r14 448 mov rp=r14
443 br.call.sptk.many b6=kvm_vexirq 449 br.call.sptk.many b6=kvm_vexirq
444END(kvm_virtual_exirq) 450END(kvm_virtual_exirq)
445 451
446 .org kvm_ia64_ivt+0x3800 452 .org kvm_ia64_ivt+0x3800
447///////////////////////////////////////////////////////////////////// 453/////////////////////////////////////////////////////////////////////
448// 0x3800 Entry 14 (size 64 bundles) Reserved 454// 0x3800 Entry 14 (size 64 bundles) Reserved
449 KVM_FAULT(14) 455 KVM_FAULT(14)
450 // this code segment is from 2.6.16.13 456 // this code segment is from 2.6.16.13
451
452 457
453 .org kvm_ia64_ivt+0x3c00 458 .org kvm_ia64_ivt+0x3c00
454/////////////////////////////////////////////////////////////////////// 459///////////////////////////////////////////////////////////////////////
455// 0x3c00 Entry 15 (size 64 bundles) Reserved 460// 0x3c00 Entry 15 (size 64 bundles) Reserved
456 KVM_FAULT(15) 461 KVM_FAULT(15)
457
458 462
459 .org kvm_ia64_ivt+0x4000 463 .org kvm_ia64_ivt+0x4000
460/////////////////////////////////////////////////////////////////////// 464///////////////////////////////////////////////////////////////////////
461// 0x4000 Entry 16 (size 64 bundles) Reserved 465// 0x4000 Entry 16 (size 64 bundles) Reserved
462 KVM_FAULT(16) 466 KVM_FAULT(16)
463 467
464 .org kvm_ia64_ivt+0x4400 468 .org kvm_ia64_ivt+0x4400
465////////////////////////////////////////////////////////////////////// 469//////////////////////////////////////////////////////////////////////
466// 0x4400 Entry 17 (size 64 bundles) Reserved 470// 0x4400 Entry 17 (size 64 bundles) Reserved
467 KVM_FAULT(17) 471 KVM_FAULT(17)
468 472
469 .org kvm_ia64_ivt+0x4800 473 .org kvm_ia64_ivt+0x4800
470////////////////////////////////////////////////////////////////////// 474//////////////////////////////////////////////////////////////////////
471// 0x4800 Entry 18 (size 64 bundles) Reserved 475// 0x4800 Entry 18 (size 64 bundles) Reserved
472 KVM_FAULT(18) 476 KVM_FAULT(18)
473 477
474 .org kvm_ia64_ivt+0x4c00 478 .org kvm_ia64_ivt+0x4c00
475////////////////////////////////////////////////////////////////////// 479//////////////////////////////////////////////////////////////////////
476// 0x4c00 Entry 19 (size 64 bundles) Reserved 480// 0x4c00 Entry 19 (size 64 bundles) Reserved
477 KVM_FAULT(19) 481 KVM_FAULT(19)
478 482
479 .org kvm_ia64_ivt+0x5000 483 .org kvm_ia64_ivt+0x5000
480////////////////////////////////////////////////////////////////////// 484//////////////////////////////////////////////////////////////////////
481// 0x5000 Entry 20 (size 16 bundles) Page Not Present 485// 0x5000 Entry 20 (size 16 bundles) Page Not Present
482ENTRY(kvm_page_not_present) 486ENTRY(kvm_page_not_present)
483 KVM_REFLECT(20) 487 KVM_REFLECT(20)
484END(kvm_page_not_present) 488END(kvm_page_not_present)
485 489
486 .org kvm_ia64_ivt+0x5100 490 .org kvm_ia64_ivt+0x5100
487/////////////////////////////////////////////////////////////////////// 491///////////////////////////////////////////////////////////////////////
488// 0x5100 Entry 21 (size 16 bundles) Key Permission vector 492// 0x5100 Entry 21 (size 16 bundles) Key Permission vector
489ENTRY(kvm_key_permission) 493ENTRY(kvm_key_permission)
490 KVM_REFLECT(21) 494 KVM_REFLECT(21)
491END(kvm_key_permission) 495END(kvm_key_permission)
492 496
493 .org kvm_ia64_ivt+0x5200 497 .org kvm_ia64_ivt+0x5200
494////////////////////////////////////////////////////////////////////// 498//////////////////////////////////////////////////////////////////////
495// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26) 499// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
496ENTRY(kvm_iaccess_rights) 500ENTRY(kvm_iaccess_rights)
497 KVM_REFLECT(22) 501 KVM_REFLECT(22)
498END(kvm_iaccess_rights) 502END(kvm_iaccess_rights)
499 503
500 .org kvm_ia64_ivt+0x5300 504 .org kvm_ia64_ivt+0x5300
501////////////////////////////////////////////////////////////////////// 505//////////////////////////////////////////////////////////////////////
502// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53) 506// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
503ENTRY(kvm_daccess_rights) 507ENTRY(kvm_daccess_rights)
504 KVM_REFLECT(23) 508 KVM_REFLECT(23)
505END(kvm_daccess_rights) 509END(kvm_daccess_rights)
506 510
507 .org kvm_ia64_ivt+0x5400 511 .org kvm_ia64_ivt+0x5400
508///////////////////////////////////////////////////////////////////// 512/////////////////////////////////////////////////////////////////////
509// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39) 513// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
510ENTRY(kvm_general_exception) 514ENTRY(kvm_general_exception)
511 KVM_REFLECT(24) 515 KVM_REFLECT(24)
512 KVM_FAULT(24) 516 KVM_FAULT(24)
513END(kvm_general_exception) 517END(kvm_general_exception)
514 518
515 .org kvm_ia64_ivt+0x5500 519 .org kvm_ia64_ivt+0x5500
516////////////////////////////////////////////////////////////////////// 520//////////////////////////////////////////////////////////////////////
517// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35) 521// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
518ENTRY(kvm_disabled_fp_reg) 522ENTRY(kvm_disabled_fp_reg)
519 KVM_REFLECT(25) 523 KVM_REFLECT(25)
520END(kvm_disabled_fp_reg) 524END(kvm_disabled_fp_reg)
521 525
522 .org kvm_ia64_ivt+0x5600 526 .org kvm_ia64_ivt+0x5600
523//////////////////////////////////////////////////////////////////// 527////////////////////////////////////////////////////////////////////
524// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50) 528// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
525ENTRY(kvm_nat_consumption) 529ENTRY(kvm_nat_consumption)
526 KVM_REFLECT(26) 530 KVM_REFLECT(26)
527END(kvm_nat_consumption) 531END(kvm_nat_consumption)
528 532
529 .org kvm_ia64_ivt+0x5700 533 .org kvm_ia64_ivt+0x5700
530///////////////////////////////////////////////////////////////////// 534/////////////////////////////////////////////////////////////////////
531// 0x5700 Entry 27 (size 16 bundles) Speculation (40) 535// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
532ENTRY(kvm_speculation_vector) 536ENTRY(kvm_speculation_vector)
533 KVM_REFLECT(27) 537 KVM_REFLECT(27)
534END(kvm_speculation_vector) 538END(kvm_speculation_vector)
535 539
536 .org kvm_ia64_ivt+0x5800 540 .org kvm_ia64_ivt+0x5800
537///////////////////////////////////////////////////////////////////// 541/////////////////////////////////////////////////////////////////////
538// 0x5800 Entry 28 (size 16 bundles) Reserved 542// 0x5800 Entry 28 (size 16 bundles) Reserved
539 KVM_FAULT(28) 543 KVM_FAULT(28)
540 544
541 .org kvm_ia64_ivt+0x5900 545 .org kvm_ia64_ivt+0x5900
542/////////////////////////////////////////////////////////////////// 546///////////////////////////////////////////////////////////////////
543// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56) 547// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
544ENTRY(kvm_debug_vector) 548ENTRY(kvm_debug_vector)
545 KVM_FAULT(29) 549 KVM_FAULT(29)
546END(kvm_debug_vector) 550END(kvm_debug_vector)
547 551
548 .org kvm_ia64_ivt+0x5a00 552 .org kvm_ia64_ivt+0x5a00
549/////////////////////////////////////////////////////////////// 553///////////////////////////////////////////////////////////////
550// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57) 554// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
551ENTRY(kvm_unaligned_access) 555ENTRY(kvm_unaligned_access)
552 KVM_REFLECT(30) 556 KVM_REFLECT(30)
553END(kvm_unaligned_access) 557END(kvm_unaligned_access)
554 558
555 .org kvm_ia64_ivt+0x5b00 559 .org kvm_ia64_ivt+0x5b00
556////////////////////////////////////////////////////////////////////// 560//////////////////////////////////////////////////////////////////////
557// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57) 561// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
558ENTRY(kvm_unsupported_data_reference) 562ENTRY(kvm_unsupported_data_reference)
559 KVM_REFLECT(31) 563 KVM_REFLECT(31)
560END(kvm_unsupported_data_reference) 564END(kvm_unsupported_data_reference)
561 565
562 .org kvm_ia64_ivt+0x5c00 566 .org kvm_ia64_ivt+0x5c00
563//////////////////////////////////////////////////////////////////// 567////////////////////////////////////////////////////////////////////
564// 0x5c00 Entry 32 (size 16 bundles) Floating Point FAULT (65) 568// 0x5c00 Entry 32 (size 16 bundles) Floating Point FAULT (65)
565ENTRY(kvm_floating_point_fault) 569ENTRY(kvm_floating_point_fault)
566 KVM_REFLECT(32) 570 KVM_REFLECT(32)
567END(kvm_floating_point_fault) 571END(kvm_floating_point_fault)
568 572
569 .org kvm_ia64_ivt+0x5d00 573 .org kvm_ia64_ivt+0x5d00
570///////////////////////////////////////////////////////////////////// 574/////////////////////////////////////////////////////////////////////
571// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66) 575// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
572ENTRY(kvm_floating_point_trap) 576ENTRY(kvm_floating_point_trap)
573 KVM_REFLECT(33) 577 KVM_REFLECT(33)
574END(kvm_floating_point_trap) 578END(kvm_floating_point_trap)
575 579
576 .org kvm_ia64_ivt+0x5e00 580 .org kvm_ia64_ivt+0x5e00
577////////////////////////////////////////////////////////////////////// 581//////////////////////////////////////////////////////////////////////
578// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66) 582// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
579ENTRY(kvm_lower_privilege_trap) 583ENTRY(kvm_lower_privilege_trap)
580 KVM_REFLECT(34) 584 KVM_REFLECT(34)
581END(kvm_lower_privilege_trap) 585END(kvm_lower_privilege_trap)
582 586
583 .org kvm_ia64_ivt+0x5f00 587 .org kvm_ia64_ivt+0x5f00
584////////////////////////////////////////////////////////////////////// 588//////////////////////////////////////////////////////////////////////
585// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68) 589// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
586ENTRY(kvm_taken_branch_trap) 590ENTRY(kvm_taken_branch_trap)
587 KVM_REFLECT(35) 591 KVM_REFLECT(35)
588END(kvm_taken_branch_trap) 592END(kvm_taken_branch_trap)
589 593
590 .org kvm_ia64_ivt+0x6000 594 .org kvm_ia64_ivt+0x6000
591//////////////////////////////////////////////////////////////////// 595////////////////////////////////////////////////////////////////////
592// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69) 596// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
593ENTRY(kvm_single_step_trap) 597ENTRY(kvm_single_step_trap)
594 KVM_REFLECT(36) 598 KVM_REFLECT(36)
595END(kvm_single_step_trap) 599END(kvm_single_step_trap)
596 .global kvm_virtualization_fault_back 600 .global kvm_virtualization_fault_back
597 .org kvm_ia64_ivt+0x6100 601 .org kvm_ia64_ivt+0x6100
598///////////////////////////////////////////////////////////////////// 602/////////////////////////////////////////////////////////////////////
599// 0x6100 Entry 37 (size 16 bundles) Virtualization Fault 603// 0x6100 Entry 37 (size 16 bundles) Virtualization Fault
600ENTRY(kvm_virtualization_fault) 604ENTRY(kvm_virtualization_fault)
601 mov r31=pr 605 mov r31=pr
602 adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21 606 adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
603 ;; 607 ;;
604 st8 [r16] = r1 608 st8 [r16] = r1
605 adds r17 = VMM_VCPU_GP_OFFSET, r21 609 adds r17 = VMM_VCPU_GP_OFFSET, r21
606 ;; 610 ;;
607 ld8 r1 = [r17] 611 ld8 r1 = [r17]
608 cmp.eq p6,p0=EVENT_MOV_FROM_AR,r24 612 cmp.eq p6,p0=EVENT_MOV_FROM_AR,r24
609 cmp.eq p7,p0=EVENT_MOV_FROM_RR,r24 613 cmp.eq p7,p0=EVENT_MOV_FROM_RR,r24
610 cmp.eq p8,p0=EVENT_MOV_TO_RR,r24 614 cmp.eq p8,p0=EVENT_MOV_TO_RR,r24
611 cmp.eq p9,p0=EVENT_RSM,r24 615 cmp.eq p9,p0=EVENT_RSM,r24
612 cmp.eq p10,p0=EVENT_SSM,r24 616 cmp.eq p10,p0=EVENT_SSM,r24
613 cmp.eq p11,p0=EVENT_MOV_TO_PSR,r24 617 cmp.eq p11,p0=EVENT_MOV_TO_PSR,r24
614 cmp.eq p12,p0=EVENT_THASH,r24 618 cmp.eq p12,p0=EVENT_THASH,r24
615 (p6) br.dptk.many kvm_asm_mov_from_ar 619(p6) br.dptk.many kvm_asm_mov_from_ar
616 (p7) br.dptk.many kvm_asm_mov_from_rr 620(p7) br.dptk.many kvm_asm_mov_from_rr
617 (p8) br.dptk.many kvm_asm_mov_to_rr 621(p8) br.dptk.many kvm_asm_mov_to_rr
618 (p9) br.dptk.many kvm_asm_rsm 622(p9) br.dptk.many kvm_asm_rsm
619 (p10) br.dptk.many kvm_asm_ssm 623(p10) br.dptk.many kvm_asm_ssm
620 (p11) br.dptk.many kvm_asm_mov_to_psr 624(p11) br.dptk.many kvm_asm_mov_to_psr
621 (p12) br.dptk.many kvm_asm_thash 625(p12) br.dptk.many kvm_asm_thash
622 ;; 626 ;;
623kvm_virtualization_fault_back: 627kvm_virtualization_fault_back:
624 adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21 628 adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
625 ;; 629 ;;
626 ld8 r1 = [r16] 630 ld8 r1 = [r16]
627 ;; 631 ;;
628 mov r19=37 632 mov r19=37
629 adds r16 = VMM_VCPU_CAUSE_OFFSET,r21 633 adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
630 adds r17 = VMM_VCPU_OPCODE_OFFSET,r21 634 adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
631 ;; 635 ;;
632 st8 [r16] = r24 636 st8 [r16] = r24
633 st8 [r17] = r25 637 st8 [r17] = r25
634 ;; 638 ;;
635 cmp.ne p6,p0=EVENT_RFI, r24 639 cmp.ne p6,p0=EVENT_RFI, r24
636 (p6) br.sptk kvm_dispatch_virtualization_fault 640(p6) br.sptk kvm_dispatch_virtualization_fault
637 ;; 641 ;;
638 adds r18=VMM_VPD_BASE_OFFSET,r21 642 adds r18=VMM_VPD_BASE_OFFSET,r21
639 ;; 643 ;;
640 ld8 r18=[r18] 644 ld8 r18=[r18]
641 ;; 645 ;;
642 adds r18=VMM_VPD_VIFS_OFFSET,r18 646 adds r18=VMM_VPD_VIFS_OFFSET,r18
643 ;; 647 ;;
644 ld8 r18=[r18] 648 ld8 r18=[r18]
645 ;; 649 ;;
646 tbit.z p6,p0=r18,63 650 tbit.z p6,p0=r18,63
647 (p6) br.sptk kvm_dispatch_virtualization_fault 651(p6) br.sptk kvm_dispatch_virtualization_fault
648 ;; 652 ;;
649 //if vifs.v=1 desert current register frame 653//if vifs.v=1 desert current register frame
650 alloc r18=ar.pfs,0,0,0,0 654 alloc r18=ar.pfs,0,0,0,0
651 br.sptk kvm_dispatch_virtualization_fault 655 br.sptk kvm_dispatch_virtualization_fault
652END(kvm_virtualization_fault) 656END(kvm_virtualization_fault)
653 657
654 .org kvm_ia64_ivt+0x6200 658 .org kvm_ia64_ivt+0x6200
655////////////////////////////////////////////////////////////// 659//////////////////////////////////////////////////////////////
656// 0x6200 Entry 38 (size 16 bundles) Reserved 660// 0x6200 Entry 38 (size 16 bundles) Reserved
657 KVM_FAULT(38) 661 KVM_FAULT(38)
658 662
659 .org kvm_ia64_ivt+0x6300 663 .org kvm_ia64_ivt+0x6300
660///////////////////////////////////////////////////////////////// 664/////////////////////////////////////////////////////////////////
661// 0x6300 Entry 39 (size 16 bundles) Reserved 665// 0x6300 Entry 39 (size 16 bundles) Reserved
662 KVM_FAULT(39) 666 KVM_FAULT(39)
663 667
664 .org kvm_ia64_ivt+0x6400 668 .org kvm_ia64_ivt+0x6400
665///////////////////////////////////////////////////////////////// 669/////////////////////////////////////////////////////////////////
666// 0x6400 Entry 40 (size 16 bundles) Reserved 670// 0x6400 Entry 40 (size 16 bundles) Reserved
667 KVM_FAULT(40) 671 KVM_FAULT(40)
668 672
669 .org kvm_ia64_ivt+0x6500 673 .org kvm_ia64_ivt+0x6500
670////////////////////////////////////////////////////////////////// 674//////////////////////////////////////////////////////////////////
671// 0x6500 Entry 41 (size 16 bundles) Reserved 675// 0x6500 Entry 41 (size 16 bundles) Reserved
672 KVM_FAULT(41) 676 KVM_FAULT(41)
673 677
674 .org kvm_ia64_ivt+0x6600 678 .org kvm_ia64_ivt+0x6600
675////////////////////////////////////////////////////////////////// 679//////////////////////////////////////////////////////////////////
676// 0x6600 Entry 42 (size 16 bundles) Reserved 680// 0x6600 Entry 42 (size 16 bundles) Reserved
677 KVM_FAULT(42) 681 KVM_FAULT(42)
678 682
679 .org kvm_ia64_ivt+0x6700 683 .org kvm_ia64_ivt+0x6700
680////////////////////////////////////////////////////////////////// 684//////////////////////////////////////////////////////////////////
681// 0x6700 Entry 43 (size 16 bundles) Reserved 685// 0x6700 Entry 43 (size 16 bundles) Reserved
682 KVM_FAULT(43) 686 KVM_FAULT(43)
683 687
684 .org kvm_ia64_ivt+0x6800 688 .org kvm_ia64_ivt+0x6800
685////////////////////////////////////////////////////////////////// 689//////////////////////////////////////////////////////////////////
686// 0x6800 Entry 44 (size 16 bundles) Reserved 690// 0x6800 Entry 44 (size 16 bundles) Reserved
687 KVM_FAULT(44) 691 KVM_FAULT(44)
688 692
689 .org kvm_ia64_ivt+0x6900 693 .org kvm_ia64_ivt+0x6900
690/////////////////////////////////////////////////////////////////// 694///////////////////////////////////////////////////////////////////
691// 0x6900 Entry 45 (size 16 bundles) IA-32 Exeception 695// 0x6900 Entry 45 (size 16 bundles) IA-32 Exeception
692//(17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77) 696//(17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
693ENTRY(kvm_ia32_exception) 697ENTRY(kvm_ia32_exception)
694 KVM_FAULT(45) 698 KVM_FAULT(45)
695END(kvm_ia32_exception) 699END(kvm_ia32_exception)
696 700
697 .org kvm_ia64_ivt+0x6a00 701 .org kvm_ia64_ivt+0x6a00
698//////////////////////////////////////////////////////////////////// 702////////////////////////////////////////////////////////////////////
699// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71) 703// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71)
700ENTRY(kvm_ia32_intercept) 704ENTRY(kvm_ia32_intercept)
701 KVM_FAULT(47) 705 KVM_FAULT(47)
702END(kvm_ia32_intercept) 706END(kvm_ia32_intercept)
703 707
704 .org kvm_ia64_ivt+0x6c00 708 .org kvm_ia64_ivt+0x6c00
705///////////////////////////////////////////////////////////////////// 709/////////////////////////////////////////////////////////////////////
706// 0x6c00 Entry 48 (size 16 bundles) Reserved 710// 0x6c00 Entry 48 (size 16 bundles) Reserved
707 KVM_FAULT(48) 711 KVM_FAULT(48)
708 712
709 .org kvm_ia64_ivt+0x6d00 713 .org kvm_ia64_ivt+0x6d00
710////////////////////////////////////////////////////////////////////// 714//////////////////////////////////////////////////////////////////////
711// 0x6d00 Entry 49 (size 16 bundles) Reserved 715// 0x6d00 Entry 49 (size 16 bundles) Reserved
712 KVM_FAULT(49) 716 KVM_FAULT(49)
713 717
714 .org kvm_ia64_ivt+0x6e00 718 .org kvm_ia64_ivt+0x6e00
715////////////////////////////////////////////////////////////////////// 719//////////////////////////////////////////////////////////////////////
716// 0x6e00 Entry 50 (size 16 bundles) Reserved 720// 0x6e00 Entry 50 (size 16 bundles) Reserved
717 KVM_FAULT(50) 721 KVM_FAULT(50)
718 722
719 .org kvm_ia64_ivt+0x6f00 723 .org kvm_ia64_ivt+0x6f00
720///////////////////////////////////////////////////////////////////// 724/////////////////////////////////////////////////////////////////////
721// 0x6f00 Entry 51 (size 16 bundles) Reserved 725// 0x6f00 Entry 51 (size 16 bundles) Reserved
722 KVM_FAULT(52) 726 KVM_FAULT(52)
723 727
724 .org kvm_ia64_ivt+0x7100 728 .org kvm_ia64_ivt+0x7100
725//////////////////////////////////////////////////////////////////// 729////////////////////////////////////////////////////////////////////
726// 0x7100 Entry 53 (size 16 bundles) Reserved 730// 0x7100 Entry 53 (size 16 bundles) Reserved
727 KVM_FAULT(53) 731 KVM_FAULT(53)
728 732
729 .org kvm_ia64_ivt+0x7200 733 .org kvm_ia64_ivt+0x7200
730///////////////////////////////////////////////////////////////////// 734/////////////////////////////////////////////////////////////////////
731// 0x7200 Entry 54 (size 16 bundles) Reserved 735// 0x7200 Entry 54 (size 16 bundles) Reserved
732 KVM_FAULT(54) 736 KVM_FAULT(54)
733 737
734 .org kvm_ia64_ivt+0x7300 738 .org kvm_ia64_ivt+0x7300
735//////////////////////////////////////////////////////////////////// 739////////////////////////////////////////////////////////////////////
736// 0x7300 Entry 55 (size 16 bundles) Reserved 740// 0x7300 Entry 55 (size 16 bundles) Reserved
737 KVM_FAULT(55) 741 KVM_FAULT(55)
738 742
739 .org kvm_ia64_ivt+0x7400 743 .org kvm_ia64_ivt+0x7400
740//////////////////////////////////////////////////////////////////// 744////////////////////////////////////////////////////////////////////
741// 0x7400 Entry 56 (size 16 bundles) Reserved 745// 0x7400 Entry 56 (size 16 bundles) Reserved
742 KVM_FAULT(56) 746 KVM_FAULT(56)
743 747
744 .org kvm_ia64_ivt+0x7500 748 .org kvm_ia64_ivt+0x7500
745///////////////////////////////////////////////////////////////////// 749/////////////////////////////////////////////////////////////////////
746// 0x7500 Entry 57 (size 16 bundles) Reserved 750// 0x7500 Entry 57 (size 16 bundles) Reserved
747 KVM_FAULT(57) 751 KVM_FAULT(57)
748 752
749 .org kvm_ia64_ivt+0x7600 753 .org kvm_ia64_ivt+0x7600
750///////////////////////////////////////////////////////////////////// 754/////////////////////////////////////////////////////////////////////
751// 0x7600 Entry 58 (size 16 bundles) Reserved 755// 0x7600 Entry 58 (size 16 bundles) Reserved
752 KVM_FAULT(58) 756 KVM_FAULT(58)
753 757
754 .org kvm_ia64_ivt+0x7700 758 .org kvm_ia64_ivt+0x7700
755//////////////////////////////////////////////////////////////////// 759////////////////////////////////////////////////////////////////////
756// 0x7700 Entry 59 (size 16 bundles) Reserved 760// 0x7700 Entry 59 (size 16 bundles) Reserved
757 KVM_FAULT(59) 761 KVM_FAULT(59)
758 762
759 .org kvm_ia64_ivt+0x7800 763 .org kvm_ia64_ivt+0x7800
760//////////////////////////////////////////////////////////////////// 764////////////////////////////////////////////////////////////////////
761// 0x7800 Entry 60 (size 16 bundles) Reserved 765// 0x7800 Entry 60 (size 16 bundles) Reserved
762 KVM_FAULT(60) 766 KVM_FAULT(60)
763 767
764 .org kvm_ia64_ivt+0x7900 768 .org kvm_ia64_ivt+0x7900
765///////////////////////////////////////////////////////////////////// 769/////////////////////////////////////////////////////////////////////
766// 0x7900 Entry 61 (size 16 bundles) Reserved 770// 0x7900 Entry 61 (size 16 bundles) Reserved
767 KVM_FAULT(61) 771 KVM_FAULT(61)
768 772
769 .org kvm_ia64_ivt+0x7a00 773 .org kvm_ia64_ivt+0x7a00
770///////////////////////////////////////////////////////////////////// 774/////////////////////////////////////////////////////////////////////
771// 0x7a00 Entry 62 (size 16 bundles) Reserved 775// 0x7a00 Entry 62 (size 16 bundles) Reserved
772 KVM_FAULT(62) 776 KVM_FAULT(62)
773 777
774 .org kvm_ia64_ivt+0x7b00 778 .org kvm_ia64_ivt+0x7b00
775///////////////////////////////////////////////////////////////////// 779/////////////////////////////////////////////////////////////////////
776// 0x7b00 Entry 63 (size 16 bundles) Reserved 780// 0x7b00 Entry 63 (size 16 bundles) Reserved
777 KVM_FAULT(63) 781 KVM_FAULT(63)
778 782
779 .org kvm_ia64_ivt+0x7c00 783 .org kvm_ia64_ivt+0x7c00
780//////////////////////////////////////////////////////////////////// 784////////////////////////////////////////////////////////////////////
781// 0x7c00 Entry 64 (size 16 bundles) Reserved 785// 0x7c00 Entry 64 (size 16 bundles) Reserved
782 KVM_FAULT(64) 786 KVM_FAULT(64)
783 787
784 .org kvm_ia64_ivt+0x7d00 788 .org kvm_ia64_ivt+0x7d00
785///////////////////////////////////////////////////////////////////// 789/////////////////////////////////////////////////////////////////////
786// 0x7d00 Entry 65 (size 16 bundles) Reserved 790// 0x7d00 Entry 65 (size 16 bundles) Reserved
787 KVM_FAULT(65) 791 KVM_FAULT(65)
788 792
789 .org kvm_ia64_ivt+0x7e00 793 .org kvm_ia64_ivt+0x7e00
790///////////////////////////////////////////////////////////////////// 794/////////////////////////////////////////////////////////////////////
791// 0x7e00 Entry 66 (size 16 bundles) Reserved 795// 0x7e00 Entry 66 (size 16 bundles) Reserved
792 KVM_FAULT(66) 796 KVM_FAULT(66)
793 797
794 .org kvm_ia64_ivt+0x7f00 798 .org kvm_ia64_ivt+0x7f00
795//////////////////////////////////////////////////////////////////// 799////////////////////////////////////////////////////////////////////
796// 0x7f00 Entry 67 (size 16 bundles) Reserved 800// 0x7f00 Entry 67 (size 16 bundles) Reserved
797 KVM_FAULT(67) 801 KVM_FAULT(67)
798 802
799 .org kvm_ia64_ivt+0x8000 803 .org kvm_ia64_ivt+0x8000
800// There is no particular reason for this code to be here, other than that 804// There is no particular reason for this code to be here, other than that
@@ -804,132 +808,128 @@ END(kvm_ia32_intercept)
804 808
805 809
806ENTRY(kvm_dtlb_miss_dispatch) 810ENTRY(kvm_dtlb_miss_dispatch)
807 mov r19 = 2 811 mov r19 = 2
808 KVM_SAVE_MIN_WITH_COVER_R19 812 KVM_SAVE_MIN_WITH_COVER_R19
809 alloc r14=ar.pfs,0,0,3,0 813 alloc r14=ar.pfs,0,0,3,0
810 mov out0=cr.ifa 814 mov out0=cr.ifa
811 mov out1=r15 815 mov out1=r15
812 adds r3=8,r2 // set up second base pointer 816 adds r3=8,r2 // set up second base pointer
813 ;; 817 ;;
814 ssm psr.ic 818 ssm psr.ic
815 ;; 819 ;;
816 srlz.i // guarantee that interruption collection is on 820 srlz.i // guarantee that interruption collection is on
817 ;; 821 ;;
818 //(p15) ssm psr.i // restore psr.i 822 //(p15) ssm psr.i // restore psr.i
819 addl r14=@gprel(ia64_leave_hypervisor_prepare),gp 823 addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
820 ;; 824 ;;
821 KVM_SAVE_REST 825 KVM_SAVE_REST
822 KVM_SAVE_EXTRA 826 KVM_SAVE_EXTRA
823 mov rp=r14 827 mov rp=r14
824 ;; 828 ;;
825 adds out2=16,r12 829 adds out2=16,r12
826 br.call.sptk.many b6=kvm_page_fault 830 br.call.sptk.many b6=kvm_page_fault
827END(kvm_dtlb_miss_dispatch) 831END(kvm_dtlb_miss_dispatch)
828 832
829ENTRY(kvm_itlb_miss_dispatch) 833ENTRY(kvm_itlb_miss_dispatch)
830 834
831 KVM_SAVE_MIN_WITH_COVER_R19 835 KVM_SAVE_MIN_WITH_COVER_R19
832 alloc r14=ar.pfs,0,0,3,0 836 alloc r14=ar.pfs,0,0,3,0
833 mov out0=cr.ifa 837 mov out0=cr.ifa
834 mov out1=r15 838 mov out1=r15
835 adds r3=8,r2 // set up second base pointer 839 adds r3=8,r2 // set up second base pointer
836 ;; 840 ;;
837 ssm psr.ic 841 ssm psr.ic
838 ;; 842 ;;
839 srlz.i // guarantee that interruption collection is on 843 srlz.i // guarantee that interruption collection is on
840 ;; 844 ;;
841 //(p15) ssm psr.i // restore psr.i 845 //(p15) ssm psr.i // restore psr.i
842 addl r14=@gprel(ia64_leave_hypervisor),gp 846 addl r14=@gprel(ia64_leave_hypervisor),gp
843 ;; 847 ;;
844 KVM_SAVE_REST 848 KVM_SAVE_REST
845 mov rp=r14 849 mov rp=r14
846 ;; 850 ;;
847 adds out2=16,r12 851 adds out2=16,r12
848 br.call.sptk.many b6=kvm_page_fault 852 br.call.sptk.many b6=kvm_page_fault
849END(kvm_itlb_miss_dispatch) 853END(kvm_itlb_miss_dispatch)
850 854
851ENTRY(kvm_dispatch_reflection) 855ENTRY(kvm_dispatch_reflection)
852 /* 856/*
853 * Input: 857 * Input:
854 * psr.ic: off 858 * psr.ic: off
855 * r19: intr type (offset into ivt, see ia64_int.h) 859 * r19: intr type (offset into ivt, see ia64_int.h)
856 * r31: contains saved predicates (pr) 860 * r31: contains saved predicates (pr)
857 */ 861 */
858 KVM_SAVE_MIN_WITH_COVER_R19 862 KVM_SAVE_MIN_WITH_COVER_R19
859 alloc r14=ar.pfs,0,0,5,0 863 alloc r14=ar.pfs,0,0,5,0
860 mov out0=cr.ifa 864 mov out0=cr.ifa
861 mov out1=cr.isr 865 mov out1=cr.isr
862 mov out2=cr.iim 866 mov out2=cr.iim
863 mov out3=r15 867 mov out3=r15
864 adds r3=8,r2 // set up second base pointer 868 adds r3=8,r2 // set up second base pointer
865 ;; 869 ;;
866 ssm psr.ic 870 ssm psr.ic
867 ;; 871 ;;
868 srlz.i // guarantee that interruption collection is on 872 srlz.i // guarantee that interruption collection is on
869 ;; 873 ;;
870 //(p15) ssm psr.i // restore psr.i 874 //(p15) ssm psr.i // restore psr.i
871 addl r14=@gprel(ia64_leave_hypervisor),gp 875 addl r14=@gprel(ia64_leave_hypervisor),gp
872 ;; 876 ;;
873 KVM_SAVE_REST 877 KVM_SAVE_REST
874 mov rp=r14 878 mov rp=r14
875 ;; 879 ;;
876 adds out4=16,r12 880 adds out4=16,r12
877 br.call.sptk.many b6=reflect_interruption 881 br.call.sptk.many b6=reflect_interruption
878END(kvm_dispatch_reflection) 882END(kvm_dispatch_reflection)
879 883
880ENTRY(kvm_dispatch_virtualization_fault) 884ENTRY(kvm_dispatch_virtualization_fault)
881 adds r16 = VMM_VCPU_CAUSE_OFFSET,r21 885 adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
882 adds r17 = VMM_VCPU_OPCODE_OFFSET,r21 886 adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
883 ;; 887 ;;
884 st8 [r16] = r24 888 st8 [r16] = r24
885 st8 [r17] = r25 889 st8 [r17] = r25
886 ;; 890 ;;
887 KVM_SAVE_MIN_WITH_COVER_R19 891 KVM_SAVE_MIN_WITH_COVER_R19
888 ;; 892 ;;
889 alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!) 893 alloc r14=ar.pfs,0,0,2,0 // (must be first in insn group!)
890 mov out0=r13 //vcpu 894 mov out0=r13 //vcpu
891 adds r3=8,r2 // set up second base pointer 895 adds r3=8,r2 // set up second base pointer
892 ;; 896 ;;
893 ssm psr.ic 897 ssm psr.ic
894 ;; 898 ;;
895 srlz.i // guarantee that interruption collection is on 899 srlz.i // guarantee that interruption collection is on
896 ;; 900 ;;
897 //(p15) ssm psr.i // restore psr.i 901 //(p15) ssm psr.i // restore psr.i
898 addl r14=@gprel(ia64_leave_hypervisor_prepare),gp 902 addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
899 ;; 903 ;;
900 KVM_SAVE_REST 904 KVM_SAVE_REST
901 KVM_SAVE_EXTRA 905 KVM_SAVE_EXTRA
902 mov rp=r14 906 mov rp=r14
903 ;; 907 ;;
904 adds out1=16,sp //regs 908 adds out1=16,sp //regs
905 br.call.sptk.many b6=kvm_emulate 909 br.call.sptk.many b6=kvm_emulate
906END(kvm_dispatch_virtualization_fault) 910END(kvm_dispatch_virtualization_fault)
907 911
908 912
909ENTRY(kvm_dispatch_interrupt) 913ENTRY(kvm_dispatch_interrupt)
910 KVM_SAVE_MIN_WITH_COVER_R19 // uses r31; defines r2 and r3 914 KVM_SAVE_MIN_WITH_COVER_R19 // uses r31; defines r2 and r3
911 ;; 915 ;;
912 alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group 916 alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
913 //mov out0=cr.ivr // pass cr.ivr as first arg 917 adds r3=8,r2 // set up second base pointer for SAVE_REST
914 adds r3=8,r2 // set up second base pointer for SAVE_REST 918 ;;
915 ;; 919 ssm psr.ic
916 ssm psr.ic 920 ;;
917 ;; 921 srlz.i
918 srlz.i 922 ;;
919 ;; 923 //(p15) ssm psr.i
920 //(p15) ssm psr.i 924 addl r14=@gprel(ia64_leave_hypervisor),gp
921 addl r14=@gprel(ia64_leave_hypervisor),gp 925 ;;
922 ;; 926 KVM_SAVE_REST
923 KVM_SAVE_REST 927 mov rp=r14
924 mov rp=r14 928 ;;
925 ;; 929 mov out0=r13 // pass pointer to pt_regs as second arg
926 mov out0=r13 // pass pointer to pt_regs as second arg 930 br.call.sptk.many b6=kvm_ia64_handle_irq
927 br.call.sptk.many b6=kvm_ia64_handle_irq
928END(kvm_dispatch_interrupt) 931END(kvm_dispatch_interrupt)
929 932
930
931
932
933GLOBAL_ENTRY(ia64_leave_nested) 933GLOBAL_ENTRY(ia64_leave_nested)
934 rsm psr.i 934 rsm psr.i
935 ;; 935 ;;
@@ -1008,7 +1008,7 @@ GLOBAL_ENTRY(ia64_leave_nested)
1008 ;; 1008 ;;
1009 ldf.fill f11=[r2] 1009 ldf.fill f11=[r2]
1010// mov r18=r13 1010// mov r18=r13
1011// mov r21=r13 1011// mov r21=r13
1012 adds r16=PT(CR_IPSR)+16,r12 1012 adds r16=PT(CR_IPSR)+16,r12
1013 adds r17=PT(CR_IIP)+16,r12 1013 adds r17=PT(CR_IIP)+16,r12
1014 ;; 1014 ;;
@@ -1058,138 +1058,135 @@ GLOBAL_ENTRY(ia64_leave_nested)
1058 rfi 1058 rfi
1059END(ia64_leave_nested) 1059END(ia64_leave_nested)
1060 1060
1061
1062
1063GLOBAL_ENTRY(ia64_leave_hypervisor_prepare) 1061GLOBAL_ENTRY(ia64_leave_hypervisor_prepare)
1064 /* 1062/*
1065 * work.need_resched etc. mustn't get changed 1063 * work.need_resched etc. mustn't get changed
1066 *by this CPU before it returns to 1064 *by this CPU before it returns to
1067 ;; 1065 * user- or fsys-mode, hence we disable interrupts early on:
1068 * user- or fsys-mode, hence we disable interrupts early on: 1066 */
1069 */ 1067 adds r2 = PT(R4)+16,r12
1070 adds r2 = PT(R4)+16,r12 1068 adds r3 = PT(R5)+16,r12
1071 adds r3 = PT(R5)+16,r12 1069 adds r8 = PT(EML_UNAT)+16,r12
1072 adds r8 = PT(EML_UNAT)+16,r12 1070 ;;
1073 ;; 1071 ld8 r8 = [r8]
1074 ld8 r8 = [r8] 1072 ;;
1075 ;; 1073 mov ar.unat=r8
1076 mov ar.unat=r8 1074 ;;
1077 ;; 1075 ld8.fill r4=[r2],16 //load r4
1078 ld8.fill r4=[r2],16 //load r4 1076 ld8.fill r5=[r3],16 //load r5
1079 ld8.fill r5=[r3],16 //load r5 1077 ;;
1080 ;; 1078 ld8.fill r6=[r2] //load r6
1081 ld8.fill r6=[r2] //load r6 1079 ld8.fill r7=[r3] //load r7
1082 ld8.fill r7=[r3] //load r7 1080 ;;
1083 ;;
1084END(ia64_leave_hypervisor_prepare) 1081END(ia64_leave_hypervisor_prepare)
1085//fall through 1082//fall through
1086GLOBAL_ENTRY(ia64_leave_hypervisor) 1083GLOBAL_ENTRY(ia64_leave_hypervisor)
1087 rsm psr.i 1084 rsm psr.i
1088 ;; 1085 ;;
1089 br.call.sptk.many b0=leave_hypervisor_tail 1086 br.call.sptk.many b0=leave_hypervisor_tail
1090 ;; 1087 ;;
1091 adds r20=PT(PR)+16,r12 1088 adds r20=PT(PR)+16,r12
1092 adds r8=PT(EML_UNAT)+16,r12 1089 adds r8=PT(EML_UNAT)+16,r12
1093 ;; 1090 ;;
1094 ld8 r8=[r8] 1091 ld8 r8=[r8]
1095 ;; 1092 ;;
1096 mov ar.unat=r8 1093 mov ar.unat=r8
1097 ;; 1094 ;;
1098 lfetch [r20],PT(CR_IPSR)-PT(PR) 1095 lfetch [r20],PT(CR_IPSR)-PT(PR)
1099 adds r2 = PT(B6)+16,r12 1096 adds r2 = PT(B6)+16,r12
1100 adds r3 = PT(B7)+16,r12 1097 adds r3 = PT(B7)+16,r12
1101 ;; 1098 ;;
1102 lfetch [r20] 1099 lfetch [r20]
1103 ;; 1100 ;;
1104 ld8 r24=[r2],16 /* B6 */ 1101 ld8 r24=[r2],16 /* B6 */
1105 ld8 r25=[r3],16 /* B7 */ 1102 ld8 r25=[r3],16 /* B7 */
1106 ;; 1103 ;;
1107 ld8 r26=[r2],16 /* ar_csd */ 1104 ld8 r26=[r2],16 /* ar_csd */
1108 ld8 r27=[r3],16 /* ar_ssd */ 1105 ld8 r27=[r3],16 /* ar_ssd */
1109 mov b6 = r24 1106 mov b6 = r24
1110 ;; 1107 ;;
1111 ld8.fill r8=[r2],16 1108 ld8.fill r8=[r2],16
1112 ld8.fill r9=[r3],16 1109 ld8.fill r9=[r3],16
1113 mov b7 = r25 1110 mov b7 = r25
1114 ;; 1111 ;;
1115 mov ar.csd = r26 1112 mov ar.csd = r26
1116 mov ar.ssd = r27 1113 mov ar.ssd = r27
1117 ;; 1114 ;;
1118 ld8.fill r10=[r2],PT(R15)-PT(R10) 1115 ld8.fill r10=[r2],PT(R15)-PT(R10)
1119 ld8.fill r11=[r3],PT(R14)-PT(R11) 1116 ld8.fill r11=[r3],PT(R14)-PT(R11)
1120 ;; 1117 ;;
1121 ld8.fill r15=[r2],PT(R16)-PT(R15) 1118 ld8.fill r15=[r2],PT(R16)-PT(R15)
1122 ld8.fill r14=[r3],PT(R17)-PT(R14) 1119 ld8.fill r14=[r3],PT(R17)-PT(R14)
1123 ;; 1120 ;;
1124 ld8.fill r16=[r2],16 1121 ld8.fill r16=[r2],16
1125 ld8.fill r17=[r3],16 1122 ld8.fill r17=[r3],16
1126 ;; 1123 ;;
1127 ld8.fill r18=[r2],16 1124 ld8.fill r18=[r2],16
1128 ld8.fill r19=[r3],16 1125 ld8.fill r19=[r3],16
1129 ;; 1126 ;;
1130 ld8.fill r20=[r2],16 1127 ld8.fill r20=[r2],16
1131 ld8.fill r21=[r3],16 1128 ld8.fill r21=[r3],16
1132 ;; 1129 ;;
1133 ld8.fill r22=[r2],16 1130 ld8.fill r22=[r2],16
1134 ld8.fill r23=[r3],16 1131 ld8.fill r23=[r3],16
1135 ;; 1132 ;;
1136 ld8.fill r24=[r2],16 1133 ld8.fill r24=[r2],16
1137 ld8.fill r25=[r3],16 1134 ld8.fill r25=[r3],16
1138 ;; 1135 ;;
1139 ld8.fill r26=[r2],16 1136 ld8.fill r26=[r2],16
1140 ld8.fill r27=[r3],16 1137 ld8.fill r27=[r3],16
1141 ;; 1138 ;;
1142 ld8.fill r28=[r2],16 1139 ld8.fill r28=[r2],16
1143 ld8.fill r29=[r3],16 1140 ld8.fill r29=[r3],16
1144 ;; 1141 ;;
1145 ld8.fill r30=[r2],PT(F6)-PT(R30) 1142 ld8.fill r30=[r2],PT(F6)-PT(R30)
1146 ld8.fill r31=[r3],PT(F7)-PT(R31) 1143 ld8.fill r31=[r3],PT(F7)-PT(R31)
1147 ;; 1144 ;;
1148 rsm psr.i | psr.ic 1145 rsm psr.i | psr.ic
1149 // initiate turning off of interrupt and interruption collection 1146 // initiate turning off of interrupt and interruption collection
1150 invala // invalidate ALAT 1147 invala // invalidate ALAT
1151 ;; 1148 ;;
1152 srlz.i // ensure interruption collection is off 1149 srlz.i // ensure interruption collection is off
1153 ;; 1150 ;;
1154 bsw.0 1151 bsw.0
1155 ;; 1152 ;;
1156 adds r16 = PT(CR_IPSR)+16,r12 1153 adds r16 = PT(CR_IPSR)+16,r12
1157 adds r17 = PT(CR_IIP)+16,r12 1154 adds r17 = PT(CR_IIP)+16,r12
1158 mov r21=r13 // get current 1155 mov r21=r13 // get current
1159 ;; 1156 ;;
1160 ld8 r31=[r16],16 // load cr.ipsr 1157 ld8 r31=[r16],16 // load cr.ipsr
1161 ld8 r30=[r17],16 // load cr.iip 1158 ld8 r30=[r17],16 // load cr.iip
1162 ;; 1159 ;;
1163 ld8 r29=[r16],16 // load cr.ifs 1160 ld8 r29=[r16],16 // load cr.ifs
1164 ld8 r28=[r17],16 // load ar.unat 1161 ld8 r28=[r17],16 // load ar.unat
1165 ;; 1162 ;;
1166 ld8 r27=[r16],16 // load ar.pfs 1163 ld8 r27=[r16],16 // load ar.pfs
1167 ld8 r26=[r17],16 // load ar.rsc 1164 ld8 r26=[r17],16 // load ar.rsc
1168 ;; 1165 ;;
1169 ld8 r25=[r16],16 // load ar.rnat 1166 ld8 r25=[r16],16 // load ar.rnat
1170 ld8 r24=[r17],16 // load ar.bspstore 1167 ld8 r24=[r17],16 // load ar.bspstore
1171 ;; 1168 ;;
1172 ld8 r23=[r16],16 // load predicates 1169 ld8 r23=[r16],16 // load predicates
1173 ld8 r22=[r17],16 // load b0 1170 ld8 r22=[r17],16 // load b0
1174 ;; 1171 ;;
1175 ld8 r20=[r16],16 // load ar.rsc value for "loadrs" 1172 ld8 r20=[r16],16 // load ar.rsc value for "loadrs"
1176 ld8.fill r1=[r17],16 //load r1 1173 ld8.fill r1=[r17],16 //load r1
1177 ;; 1174 ;;
1178 ld8.fill r12=[r16],16 //load r12 1175 ld8.fill r12=[r16],16 //load r12
1179 ld8.fill r13=[r17],PT(R2)-PT(R13) //load r13 1176 ld8.fill r13=[r17],PT(R2)-PT(R13) //load r13
1180 ;; 1177 ;;
1181 ld8 r19=[r16],PT(R3)-PT(AR_FPSR) //load ar_fpsr 1178 ld8 r19=[r16],PT(R3)-PT(AR_FPSR) //load ar_fpsr
1182 ld8.fill r2=[r17],PT(AR_CCV)-PT(R2) //load r2 1179 ld8.fill r2=[r17],PT(AR_CCV)-PT(R2) //load r2
1183 ;; 1180 ;;
1184 ld8.fill r3=[r16] //load r3 1181 ld8.fill r3=[r16] //load r3
1185 ld8 r18=[r17] //load ar_ccv 1182 ld8 r18=[r17] //load ar_ccv
1186 ;; 1183 ;;
1187 mov ar.fpsr=r19 1184 mov ar.fpsr=r19
1188 mov ar.ccv=r18 1185 mov ar.ccv=r18
1189 shr.u r18=r20,16 1186 shr.u r18=r20,16
1190 ;; 1187 ;;
1191kvm_rbs_switch: 1188kvm_rbs_switch:
1192 mov r19=96 1189 mov r19=96
1193 1190
1194kvm_dont_preserve_current_frame: 1191kvm_dont_preserve_current_frame:
1195/* 1192/*
@@ -1201,76 +1198,76 @@ kvm_dont_preserve_current_frame:
1201# define pReturn p7 1198# define pReturn p7
1202# define Nregs 14 1199# define Nregs 14
1203 1200
1204 alloc loc0=ar.pfs,2,Nregs-2,2,0 1201 alloc loc0=ar.pfs,2,Nregs-2,2,0
1205 shr.u loc1=r18,9 // RNaTslots <= floor(dirtySize / (64*8)) 1202 shr.u loc1=r18,9 // RNaTslots <= floor(dirtySize / (64*8))
1206 sub r19=r19,r18 // r19 = (physStackedSize + 8) - dirtySize 1203 sub r19=r19,r18 // r19 = (physStackedSize + 8) - dirtySize
1207 ;; 1204 ;;
1208 mov ar.rsc=r20 // load ar.rsc to be used for "loadrs" 1205 mov ar.rsc=r20 // load ar.rsc to be used for "loadrs"
1209 shladd in0=loc1,3,r19 1206 shladd in0=loc1,3,r19
1210 mov in1=0 1207 mov in1=0
1211 ;; 1208 ;;
1212 TEXT_ALIGN(32) 1209 TEXT_ALIGN(32)
1213kvm_rse_clear_invalid: 1210kvm_rse_clear_invalid:
1214 alloc loc0=ar.pfs,2,Nregs-2,2,0 1211 alloc loc0=ar.pfs,2,Nregs-2,2,0
1215 cmp.lt pRecurse,p0=Nregs*8,in0 1212 cmp.lt pRecurse,p0=Nregs*8,in0
1216 // if more than Nregs regs left to clear, (re)curse 1213 // if more than Nregs regs left to clear, (re)curse
1217 add out0=-Nregs*8,in0 1214 add out0=-Nregs*8,in0
1218 add out1=1,in1 // increment recursion count 1215 add out1=1,in1 // increment recursion count
1219 mov loc1=0 1216 mov loc1=0
1220 mov loc2=0 1217 mov loc2=0
1221 ;; 1218 ;;
1222 mov loc3=0 1219 mov loc3=0
1223 mov loc4=0 1220 mov loc4=0
1224 mov loc5=0 1221 mov loc5=0
1225 mov loc6=0 1222 mov loc6=0
1226 mov loc7=0 1223 mov loc7=0
1227(pRecurse) br.call.dptk.few b0=kvm_rse_clear_invalid 1224(pRecurse) br.call.dptk.few b0=kvm_rse_clear_invalid
1228 ;; 1225 ;;
1229 mov loc8=0 1226 mov loc8=0
1230 mov loc9=0 1227 mov loc9=0
1231 cmp.ne pReturn,p0=r0,in1 1228 cmp.ne pReturn,p0=r0,in1
1232 // if recursion count != 0, we need to do a br.ret 1229 // if recursion count != 0, we need to do a br.ret
1233 mov loc10=0 1230 mov loc10=0
1234 mov loc11=0 1231 mov loc11=0
1235(pReturn) br.ret.dptk.many b0 1232(pReturn) br.ret.dptk.many b0
1236 1233
1237# undef pRecurse 1234# undef pRecurse
1238# undef pReturn 1235# undef pReturn
1239 1236
1240// loadrs has already been shifted 1237// loadrs has already been shifted
1241 alloc r16=ar.pfs,0,0,0,0 // drop current register frame 1238 alloc r16=ar.pfs,0,0,0,0 // drop current register frame
1242 ;; 1239 ;;
1243 loadrs 1240 loadrs
1244 ;; 1241 ;;
1245 mov ar.bspstore=r24 1242 mov ar.bspstore=r24
1246 ;; 1243 ;;
1247 mov ar.unat=r28 1244 mov ar.unat=r28
1248 mov ar.rnat=r25 1245 mov ar.rnat=r25
1249 mov ar.rsc=r26 1246 mov ar.rsc=r26
1250 ;; 1247 ;;
1251 mov cr.ipsr=r31 1248 mov cr.ipsr=r31
1252 mov cr.iip=r30 1249 mov cr.iip=r30
1253 mov cr.ifs=r29 1250 mov cr.ifs=r29
1254 mov ar.pfs=r27 1251 mov ar.pfs=r27
1255 adds r18=VMM_VPD_BASE_OFFSET,r21 1252 adds r18=VMM_VPD_BASE_OFFSET,r21
1256 ;; 1253 ;;
1257 ld8 r18=[r18] //vpd 1254 ld8 r18=[r18] //vpd
1258 adds r17=VMM_VCPU_ISR_OFFSET,r21 1255 adds r17=VMM_VCPU_ISR_OFFSET,r21
1259 ;; 1256 ;;
1260 ld8 r17=[r17] 1257 ld8 r17=[r17]
1261 adds r19=VMM_VPD_VPSR_OFFSET,r18 1258 adds r19=VMM_VPD_VPSR_OFFSET,r18
1262 ;; 1259 ;;
1263 ld8 r19=[r19] //vpsr 1260 ld8 r19=[r19] //vpsr
1264 mov r25=r18 1261 mov r25=r18
1265 adds r16= VMM_VCPU_GP_OFFSET,r21 1262 adds r16= VMM_VCPU_GP_OFFSET,r21
1266 ;; 1263 ;;
1267 ld8 r16= [r16] // Put gp in r24 1264 ld8 r16= [r16] // Put gp in r24
1268 movl r24=@gprel(ia64_vmm_entry) // calculate return address 1265 movl r24=@gprel(ia64_vmm_entry) // calculate return address
1269 ;; 1266 ;;
1270 add r24=r24,r16 1267 add r24=r24,r16
1271 ;; 1268 ;;
1272 br.sptk.many kvm_vps_sync_write // call the service 1269 br.sptk.many kvm_vps_sync_write // call the service
1273 ;; 1270 ;;
1274END(ia64_leave_hypervisor) 1271END(ia64_leave_hypervisor)
1275// fall through 1272// fall through
1276GLOBAL_ENTRY(ia64_vmm_entry) 1273GLOBAL_ENTRY(ia64_vmm_entry)
@@ -1283,16 +1280,14 @@ GLOBAL_ENTRY(ia64_vmm_entry)
1283 * r22:b0 1280 * r22:b0
1284 * r23:predicate 1281 * r23:predicate
1285 */ 1282 */
1286 mov r24=r22 1283 mov r24=r22
1287 mov r25=r18 1284 mov r25=r18
1288 tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic 1285 tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic
1289 (p1) br.cond.sptk.few kvm_vps_resume_normal 1286(p1) br.cond.sptk.few kvm_vps_resume_normal
1290 (p2) br.cond.sptk.many kvm_vps_resume_handler 1287(p2) br.cond.sptk.many kvm_vps_resume_handler
1291 ;; 1288 ;;
1292END(ia64_vmm_entry) 1289END(ia64_vmm_entry)
1293 1290
1294
1295
1296/* 1291/*
1297 * extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, 1292 * extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2,
1298 * u64 arg3, u64 arg4, u64 arg5, 1293 * u64 arg3, u64 arg4, u64 arg5,
@@ -1310,88 +1305,88 @@ psrsave = loc2
1310entry = loc3 1305entry = loc3
1311hostret = r24 1306hostret = r24
1312 1307
1313 alloc pfssave=ar.pfs,4,4,0,0 1308 alloc pfssave=ar.pfs,4,4,0,0
1314 mov rpsave=rp 1309 mov rpsave=rp
1315 adds entry=VMM_VCPU_VSA_BASE_OFFSET, r13 1310 adds entry=VMM_VCPU_VSA_BASE_OFFSET, r13
1316 ;; 1311 ;;
1317 ld8 entry=[entry] 1312 ld8 entry=[entry]
13181: mov hostret=ip 13131: mov hostret=ip
1319 mov r25=in1 // copy arguments 1314 mov r25=in1 // copy arguments
1320 mov r26=in2 1315 mov r26=in2
1321 mov r27=in3 1316 mov r27=in3
1322 mov psrsave=psr 1317 mov psrsave=psr
1323 ;; 1318 ;;
1324 tbit.nz p6,p0=psrsave,14 // IA64_PSR_I 1319 tbit.nz p6,p0=psrsave,14 // IA64_PSR_I
1325 tbit.nz p7,p0=psrsave,13 // IA64_PSR_IC 1320 tbit.nz p7,p0=psrsave,13 // IA64_PSR_IC
1326 ;; 1321 ;;
1327 add hostret=2f-1b,hostret // calculate return address 1322 add hostret=2f-1b,hostret // calculate return address
1328 add entry=entry,in0 1323 add entry=entry,in0
1329 ;; 1324 ;;
1330 rsm psr.i | psr.ic 1325 rsm psr.i | psr.ic
1331 ;; 1326 ;;
1332 srlz.i 1327 srlz.i
1333 mov b6=entry 1328 mov b6=entry
1334 br.cond.sptk b6 // call the service 1329 br.cond.sptk b6 // call the service
13352: 13302:
1336 // Architectural sequence for enabling interrupts if necessary 1331// Architectural sequence for enabling interrupts if necessary
1337(p7) ssm psr.ic 1332(p7) ssm psr.ic
1338 ;; 1333 ;;
1339(p7) srlz.i 1334(p7) srlz.i
1340 ;; 1335 ;;
1341//(p6) ssm psr.i 1336//(p6) ssm psr.i
1342 ;; 1337 ;;
1343 mov rp=rpsave 1338 mov rp=rpsave
1344 mov ar.pfs=pfssave 1339 mov ar.pfs=pfssave
1345 mov r8=r31 1340 mov r8=r31
1346 ;; 1341 ;;
1347 srlz.d 1342 srlz.d
1348 br.ret.sptk rp 1343 br.ret.sptk rp
1349 1344
1350END(ia64_call_vsa) 1345END(ia64_call_vsa)
1351 1346
1352#define INIT_BSPSTORE ((4<<30)-(12<<20)-0x100) 1347#define INIT_BSPSTORE ((4<<30)-(12<<20)-0x100)
1353 1348
1354GLOBAL_ENTRY(vmm_reset_entry) 1349GLOBAL_ENTRY(vmm_reset_entry)
1355 //set up ipsr, iip, vpd.vpsr, dcr 1350 //set up ipsr, iip, vpd.vpsr, dcr
1356 // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1 1351 // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
1357 // For DCR: all bits 0 1352 // For DCR: all bits 0
1358 bsw.0 1353 bsw.0
1359 ;; 1354 ;;
1360 mov r21 =r13 1355 mov r21 =r13
1361 adds r14=-VMM_PT_REGS_SIZE, r12 1356 adds r14=-VMM_PT_REGS_SIZE, r12
1362 ;; 1357 ;;
1363 movl r6=0x501008826000 // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1 1358 movl r6=0x501008826000 // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
1364 movl r10=0x8000000000000000 1359 movl r10=0x8000000000000000
1365 adds r16=PT(CR_IIP), r14 1360 adds r16=PT(CR_IIP), r14
1366 adds r20=PT(R1), r14 1361 adds r20=PT(R1), r14
1367 ;; 1362 ;;
1368 rsm psr.ic | psr.i 1363 rsm psr.ic | psr.i
1369 ;; 1364 ;;
1370 srlz.i 1365 srlz.i
1371 ;; 1366 ;;
1372 mov ar.rsc = 0 1367 mov ar.rsc = 0
1373 ;; 1368 ;;
1374 flushrs 1369 flushrs
1375 ;; 1370 ;;
1376 mov ar.bspstore = 0 1371 mov ar.bspstore = 0
1377 // clear BSPSTORE 1372 // clear BSPSTORE
1378 ;; 1373 ;;
1379 mov cr.ipsr=r6 1374 mov cr.ipsr=r6
1380 mov cr.ifs=r10 1375 mov cr.ifs=r10
1381 ld8 r4 = [r16] // Set init iip for first run. 1376 ld8 r4 = [r16] // Set init iip for first run.
1382 ld8 r1 = [r20] 1377 ld8 r1 = [r20]
1383 ;; 1378 ;;
1384 mov cr.iip=r4 1379 mov cr.iip=r4
1385 adds r16=VMM_VPD_BASE_OFFSET,r13 1380 adds r16=VMM_VPD_BASE_OFFSET,r13
1386 ;; 1381 ;;
1387 ld8 r18=[r16] 1382 ld8 r18=[r16]
1388 ;; 1383 ;;
1389 adds r19=VMM_VPD_VPSR_OFFSET,r18 1384 adds r19=VMM_VPD_VPSR_OFFSET,r18
1390 ;; 1385 ;;
1391 ld8 r19=[r19] 1386 ld8 r19=[r19]
1392 mov r17=r0 1387 mov r17=r0
1393 mov r22=r0 1388 mov r22=r0
1394 mov r23=r0 1389 mov r23=r0
1395 br.cond.sptk ia64_vmm_entry 1390 br.cond.sptk ia64_vmm_entry
1396 br.ret.sptk b0 1391 br.ret.sptk b0
1397END(vmm_reset_entry) 1392END(vmm_reset_entry)
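Note on the handler table above: the repeated ".org kvm_ia64_ivt+0xNNNN" directives pin every interruption handler at a fixed offset from the table base. Entries 0 through 19 occupy 64-bundle slots (64 bundles x 16 bytes = 0x400 bytes each), while entries from 20 (offset 0x5000) onward occupy 16-bundle slots (0x100 bytes each). The following standalone C sketch is illustrative only (it is not kernel code); it just reproduces that layout arithmetic and checks it against offsets visible in the listing, such as Entry 8 at 0x2000 and Entry 37 at 0x6100.

/*
 * Illustrative only: compute the IVT offsets implied by the ".org"
 * directives above.  Entries 0-19 are 64 bundles (64 * 16 = 0x400 bytes)
 * apart; from entry 20 (offset 0x5000) onward each entry is 16 bundles
 * (0x100 bytes).  Not kernel code -- just a check of the layout.
 */
#include <assert.h>
#include <stdio.h>

static unsigned long kvm_ivt_offset(unsigned int entry)
{
	if (entry < 20)
		return entry * 0x400UL;			/* 64-bundle slots */
	return 0x5000UL + (entry - 20) * 0x100UL;	/* 16-bundle slots */
}

int main(void)
{
	/* Values taken from the listing: Entry 8 -> 0x2000, Entry 37 -> 0x6100 */
	assert(kvm_ivt_offset(8)  == 0x2000);
	assert(kvm_ivt_offset(37) == 0x6100);
	/* Entry 11 is the break-instruction vector, at 0x2c00 in the listing. */
	printf("entry 11 (break fault) at 0x%lx\n", kvm_ivt_offset(11));
	return 0;
}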
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index e22b93361e08..6b6307a3bd55 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -183,8 +183,8 @@ void mark_pages_dirty(struct kvm_vcpu *v, u64 pte, u64 ps)
183 u64 i, dirty_pages = 1; 183 u64 i, dirty_pages = 1;
184 u64 base_gfn = (pte&_PAGE_PPN_MASK) >> PAGE_SHIFT; 184 u64 base_gfn = (pte&_PAGE_PPN_MASK) >> PAGE_SHIFT;
185 spinlock_t *lock = __kvm_va(v->arch.dirty_log_lock_pa); 185 spinlock_t *lock = __kvm_va(v->arch.dirty_log_lock_pa);
186 void *dirty_bitmap = (void *)v - (KVM_VCPU_OFS + v->vcpu_id * VCPU_SIZE) 186 void *dirty_bitmap = (void *)KVM_MEM_DIRTY_LOG_BASE;
187 + KVM_MEM_DIRTY_LOG_OFS; 187
188 dirty_pages <<= ps <= PAGE_SHIFT ? 0 : ps - PAGE_SHIFT; 188 dirty_pages <<= ps <= PAGE_SHIFT ? 0 : ps - PAGE_SHIFT;
189 189
190 vmm_spin_lock(lock); 190 vmm_spin_lock(lock);
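The hunk above changes only where mark_pages_dirty() finds the dirty-log bitmap: it now starts from the fixed KVM_MEM_DIRTY_LOG_BASE address instead of deriving the address from the vcpu pointer. The unchanged line that follows still scales dirty_pages by the mapping's page-size order, so a guest translation larger than one host page marks several bitmap bits. The sketch below shows only that scaling; treating PAGE_SHIFT as 14 (16 KB host pages) is an assumption made for illustration, not something fixed by the patch.

/*
 * Sketch of the dirty-page scaling kept by the hunk above:
 *   dirty_pages <<= ps <= PAGE_SHIFT ? 0 : ps - PAGE_SHIFT;
 * PAGE_SHIFT = 14 (16 KB host pages) is an assumption for illustration.
 */
#include <stdio.h>

#define PAGE_SHIFT 14	/* assumed host page size: 16 KB */

static unsigned long pages_covered(unsigned long ps)	/* ps = log2(mapping size) */
{
	unsigned long dirty_pages = 1;

	dirty_pages <<= ps <= PAGE_SHIFT ? 0 : ps - PAGE_SHIFT;
	return dirty_pages;
}

int main(void)
{
	/* A 64 KB guest mapping (ps = 16) spans four 16 KB host pages. */
	printf("ps=16 -> %lu page(s)\n", pages_covered(16));	/* prints 4 */
	/* A mapping no larger than a host page marks exactly one bit. */
	printf("ps=14 -> %lu page(s)\n", pages_covered(14));	/* prints 1 */
	return 0;
}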
diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h
new file mode 100644
index 000000000000..9b198d1b3b2b
--- /dev/null
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -0,0 +1,80 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright IBM Corp. 2008
16 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */
19
20#ifndef __ASM_PPC_DISASSEMBLE_H__
21#define __ASM_PPC_DISASSEMBLE_H__
22
23#include <linux/types.h>
24
25static inline unsigned int get_op(u32 inst)
26{
27 return inst >> 26;
28}
29
30static inline unsigned int get_xop(u32 inst)
31{
32 return (inst >> 1) & 0x3ff;
33}
34
35static inline unsigned int get_sprn(u32 inst)
36{
37 return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
38}
39
40static inline unsigned int get_dcrn(u32 inst)
41{
42 return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
43}
44
45static inline unsigned int get_rt(u32 inst)
46{
47 return (inst >> 21) & 0x1f;
48}
49
50static inline unsigned int get_rs(u32 inst)
51{
52 return (inst >> 21) & 0x1f;
53}
54
55static inline unsigned int get_ra(u32 inst)
56{
57 return (inst >> 16) & 0x1f;
58}
59
60static inline unsigned int get_rb(u32 inst)
61{
62 return (inst >> 11) & 0x1f;
63}
64
65static inline unsigned int get_rc(u32 inst)
66{
67 return inst & 0x1;
68}
69
70static inline unsigned int get_ws(u32 inst)
71{
72 return (inst >> 11) & 0x1f;
73}
74
75static inline unsigned int get_d(u32 inst)
76{
77 return inst & 0xffff;
78}
79
80#endif /* __ASM_PPC_DISASSEMBLE_H__ */
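
The helpers above pull fixed bit fields out of a 32-bit PowerPC instruction word: the primary opcode sits in the top 6 bits, the extended opcode in bits 1-10, and the register operands in 5-bit fields. As a rough, user-space-only sketch (not part of the patch), decoding mfmsr r3, whose canonical encoding is 0x7c6000a6, with the same shifts gives op=31, xop=83 and rt=3:

/* Illustrative sketch only: the shifts mirror asm/disassemble.h, but this is
 * a stand-alone user-space program, not kernel code. */
#include <stdio.h>
#include <stdint.h>

static unsigned int get_op(uint32_t inst)  { return inst >> 26; }          /* primary opcode */
static unsigned int get_xop(uint32_t inst) { return (inst >> 1) & 0x3ff; } /* extended opcode */
static unsigned int get_rt(uint32_t inst)  { return (inst >> 21) & 0x1f; } /* destination reg */

int main(void)
{
	uint32_t inst = 0x7c6000a6;	/* mfmsr r3 */

	printf("op=%u xop=%u rt=%u\n", get_op(inst), get_xop(inst), get_rt(inst));
	/* Prints: op=31 xop=83 rt=3 */
	return 0;
}
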
diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h
new file mode 100644
index 000000000000..f49031b632ca
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_44x.h
@@ -0,0 +1,61 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright IBM Corp. 2008
16 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */
19
20#ifndef __ASM_44X_H__
21#define __ASM_44X_H__
22
23#include <linux/kvm_host.h>
24
25#define PPC44x_TLB_SIZE 64
26
27/* If the guest is expecting it, this can be as large as we like; we'd just
28 * need to find some way of advertising it. */
29#define KVM44x_GUEST_TLB_SIZE 64
30
31struct kvmppc_44x_shadow_ref {
32 struct page *page;
33 u16 gtlb_index;
34 u8 writeable;
35 u8 tid;
36};
37
38struct kvmppc_vcpu_44x {
39 /* Unmodified copy of the guest's TLB. */
40 struct kvmppc_44x_tlbe guest_tlb[KVM44x_GUEST_TLB_SIZE];
41
42 /* References to guest pages in the hardware TLB. */
43 struct kvmppc_44x_shadow_ref shadow_refs[PPC44x_TLB_SIZE];
44
45 /* State of the shadow TLB at guest context switch time. */
46 struct kvmppc_44x_tlbe shadow_tlb[PPC44x_TLB_SIZE];
47 u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
48
49 struct kvm_vcpu vcpu;
50};
51
52static inline struct kvmppc_vcpu_44x *to_44x(struct kvm_vcpu *vcpu)
53{
54 return container_of(vcpu, struct kvmppc_vcpu_44x, vcpu);
55}
56
57void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid);
58void kvmppc_44x_tlb_put(struct kvm_vcpu *vcpu);
59void kvmppc_44x_tlb_load(struct kvm_vcpu *vcpu);
60
61#endif /* __ASM_44X_H__ */
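
to_44x() works because struct kvm_vcpu is embedded inside struct kvmppc_vcpu_44x: given a pointer to the embedded member, container_of() subtracts the member's offset to recover the wrapper. A minimal stand-alone sketch of the same pattern (the struct names here are placeholders, not the kernel's):

/* Sketch of the container_of pattern behind to_44x(); illustrative only. */
#include <stddef.h>
#include <stdio.h>

struct inner { int id; };
struct outer {
	char tag[8];
	struct inner core;	/* embedded, like vcpu inside kvmppc_vcpu_44x */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct outer o = { "vcpu0", { 42 } };
	struct inner *ip = &o.core;
	struct outer *op = container_of(ip, struct outer, core);

	printf("%s %d\n", op->tag, op->core.id);	/* vcpu0 42 */
	return 0;
}
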
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 34b52b7180cd..c1e436fe7738 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -64,27 +64,58 @@ struct kvm_vcpu_stat {
64 u32 halt_wakeup; 64 u32 halt_wakeup;
65}; 65};
66 66
67struct tlbe { 67struct kvmppc_44x_tlbe {
68 u32 tid; /* Only the low 8 bits are used. */ 68 u32 tid; /* Only the low 8 bits are used. */
69 u32 word0; 69 u32 word0;
70 u32 word1; 70 u32 word1;
71 u32 word2; 71 u32 word2;
72}; 72};
73 73
74struct kvm_arch { 74enum kvm_exit_types {
75 MMIO_EXITS,
76 DCR_EXITS,
77 SIGNAL_EXITS,
78 ITLB_REAL_MISS_EXITS,
79 ITLB_VIRT_MISS_EXITS,
80 DTLB_REAL_MISS_EXITS,
81 DTLB_VIRT_MISS_EXITS,
82 SYSCALL_EXITS,
83 ISI_EXITS,
84 DSI_EXITS,
85 EMULATED_INST_EXITS,
86 EMULATED_MTMSRWE_EXITS,
87 EMULATED_WRTEE_EXITS,
88 EMULATED_MTSPR_EXITS,
89 EMULATED_MFSPR_EXITS,
90 EMULATED_MTMSR_EXITS,
91 EMULATED_MFMSR_EXITS,
92 EMULATED_TLBSX_EXITS,
93 EMULATED_TLBWE_EXITS,
94 EMULATED_RFI_EXITS,
95 DEC_EXITS,
96 EXT_INTR_EXITS,
97 HALT_WAKEUP,
98 USR_PR_INST,
99 FP_UNAVAIL,
100 DEBUG_EXITS,
101 TIMEINGUEST,
102 __NUMBER_OF_KVM_EXIT_TYPES
75}; 103};
76 104
77struct kvm_vcpu_arch { 105/* allow access to big endian 32bit upper/lower parts and 64bit var */
78 /* Unmodified copy of the guest's TLB. */ 106struct kvmppc_exit_timing {
79 struct tlbe guest_tlb[PPC44x_TLB_SIZE]; 107 union {
80 /* TLB that's actually used when the guest is running. */ 108 u64 tv64;
81 struct tlbe shadow_tlb[PPC44x_TLB_SIZE]; 109 struct {
82 /* Pages which are referenced in the shadow TLB. */ 110 u32 tbu, tbl;
83 struct page *shadow_pages[PPC44x_TLB_SIZE]; 111 } tv32;
112 };
113};
84 114
85 /* Track which TLB entries we've modified in the current exit. */ 115struct kvm_arch {
86 u8 shadow_tlb_mod[PPC44x_TLB_SIZE]; 116};
87 117
118struct kvm_vcpu_arch {
88 u32 host_stack; 119 u32 host_stack;
89 u32 host_pid; 120 u32 host_pid;
90 u32 host_dbcr0; 121 u32 host_dbcr0;
@@ -94,32 +125,32 @@ struct kvm_vcpu_arch {
94 u32 host_msr; 125 u32 host_msr;
95 126
96 u64 fpr[32]; 127 u64 fpr[32];
97 u32 gpr[32]; 128 ulong gpr[32];
98 129
99 u32 pc; 130 ulong pc;
100 u32 cr; 131 u32 cr;
101 u32 ctr; 132 ulong ctr;
102 u32 lr; 133 ulong lr;
103 u32 xer; 134 ulong xer;
104 135
105 u32 msr; 136 ulong msr;
106 u32 mmucr; 137 u32 mmucr;
107 u32 sprg0; 138 ulong sprg0;
108 u32 sprg1; 139 ulong sprg1;
109 u32 sprg2; 140 ulong sprg2;
110 u32 sprg3; 141 ulong sprg3;
111 u32 sprg4; 142 ulong sprg4;
112 u32 sprg5; 143 ulong sprg5;
113 u32 sprg6; 144 ulong sprg6;
114 u32 sprg7; 145 ulong sprg7;
115 u32 srr0; 146 ulong srr0;
116 u32 srr1; 147 ulong srr1;
117 u32 csrr0; 148 ulong csrr0;
118 u32 csrr1; 149 ulong csrr1;
119 u32 dsrr0; 150 ulong dsrr0;
120 u32 dsrr1; 151 ulong dsrr1;
121 u32 dear; 152 ulong dear;
122 u32 esr; 153 ulong esr;
123 u32 dec; 154 u32 dec;
124 u32 decar; 155 u32 decar;
125 u32 tbl; 156 u32 tbl;
@@ -127,7 +158,7 @@ struct kvm_vcpu_arch {
127 u32 tcr; 158 u32 tcr;
128 u32 tsr; 159 u32 tsr;
129 u32 ivor[16]; 160 u32 ivor[16];
130 u32 ivpr; 161 ulong ivpr;
131 u32 pir; 162 u32 pir;
132 163
133 u32 shadow_pid; 164 u32 shadow_pid;
@@ -140,9 +171,22 @@ struct kvm_vcpu_arch {
140 u32 dbcr0; 171 u32 dbcr0;
141 u32 dbcr1; 172 u32 dbcr1;
142 173
174#ifdef CONFIG_KVM_EXIT_TIMING
175 struct kvmppc_exit_timing timing_exit;
176 struct kvmppc_exit_timing timing_last_enter;
177 u32 last_exit_type;
178 u32 timing_count_type[__NUMBER_OF_KVM_EXIT_TYPES];
179 u64 timing_sum_duration[__NUMBER_OF_KVM_EXIT_TYPES];
180 u64 timing_sum_quad_duration[__NUMBER_OF_KVM_EXIT_TYPES];
181 u64 timing_min_duration[__NUMBER_OF_KVM_EXIT_TYPES];
182 u64 timing_max_duration[__NUMBER_OF_KVM_EXIT_TYPES];
183 u64 timing_last_exit;
184 struct dentry *debugfs_exit_timing;
185#endif
186
143 u32 last_inst; 187 u32 last_inst;
144 u32 fault_dear; 188 ulong fault_dear;
145 u32 fault_esr; 189 ulong fault_esr;
146 gpa_t paddr_accessed; 190 gpa_t paddr_accessed;
147 191
148 u8 io_gpr; /* GPR used as IO source/target */ 192 u8 io_gpr; /* GPR used as IO source/target */
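
The kvmppc_exit_timing union added here lets the low-level exit path store the timebase as two 32-bit halves (tbu/tbl, the values the VCPU_TIMING_* offsets in asm-offsets.c point at) while the timing code reads the same storage back as a single 64-bit value; on a big-endian 32-bit host the two layouts coincide. A small sketch of that layout, assuming a big-endian build:

/* Sketch of the kvmppc_exit_timing layout trick.  On big-endian ppc32 the
 * {tbu, tbl} pair occupies the high and low words of tv64; on little-endian
 * the halves would be swapped, so this only lines up on BE hosts. */
#include <stdint.h>
#include <stdio.h>

union exit_timing {
	uint64_t tv64;
	struct { uint32_t tbu, tbl; } tv32;
};

int main(void)
{
	union exit_timing t;

	t.tv32.tbu = 0x00000001;	/* upper timebase word */
	t.tv32.tbl = 0x00000002;	/* lower timebase word */

	/* Prints tv64 = 0x100000002 on a big-endian build. */
	printf("tv64 = 0x%llx\n", (unsigned long long)t.tv64);
	return 0;
}
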
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index bb62ad876de3..36d2a50a8487 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -29,11 +29,6 @@
29#include <linux/kvm_types.h> 29#include <linux/kvm_types.h>
30#include <linux/kvm_host.h> 30#include <linux/kvm_host.h>
31 31
32struct kvm_tlb {
33 struct tlbe guest_tlb[PPC44x_TLB_SIZE];
34 struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
35};
36
37enum emulation_result { 32enum emulation_result {
38 EMULATE_DONE, /* no further processing */ 33 EMULATE_DONE, /* no further processing */
39 EMULATE_DO_MMIO, /* kvm_run filled with MMIO request */ 34 EMULATE_DO_MMIO, /* kvm_run filled with MMIO request */
@@ -41,9 +36,6 @@ enum emulation_result {
41 EMULATE_FAIL, /* can't emulate this instruction */ 36 EMULATE_FAIL, /* can't emulate this instruction */
42}; 37};
43 38
44extern const unsigned char exception_priority[];
45extern const unsigned char priority_exception[];
46
47extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 39extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
48extern char kvmppc_handlers_start[]; 40extern char kvmppc_handlers_start[];
49extern unsigned long kvmppc_handler_len; 41extern unsigned long kvmppc_handler_len;
@@ -58,51 +50,44 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
58extern int kvmppc_emulate_instruction(struct kvm_run *run, 50extern int kvmppc_emulate_instruction(struct kvm_run *run,
59 struct kvm_vcpu *vcpu); 51 struct kvm_vcpu *vcpu);
60extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); 52extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
53extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
61 54
62extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, 55extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
63 u64 asid, u32 flags); 56 u64 asid, u32 flags, u32 max_bytes,
64extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, 57 unsigned int gtlb_idx);
65 gva_t eend, u32 asid);
66extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); 58extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
67extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid); 59extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
68 60
69/* XXX Book E specific */ 61/* Core-specific hooks */
70extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i); 62
71 63extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm,
72extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu); 64 unsigned int id);
73 65extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu);
74static inline void kvmppc_queue_exception(struct kvm_vcpu *vcpu, int exception) 66extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu);
75{ 67extern int kvmppc_core_check_processor_compat(void);
76 unsigned int priority = exception_priority[exception]; 68extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
77 set_bit(priority, &vcpu->arch.pending_exceptions); 69 struct kvm_translation *tr);
78} 70
79 71extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
80static inline void kvmppc_clear_exception(struct kvm_vcpu *vcpu, int exception) 72extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
81{ 73
82 unsigned int priority = exception_priority[exception]; 74extern void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu);
83 clear_bit(priority, &vcpu->arch.pending_exceptions); 75extern void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu);
84} 76
85 77extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu);
86/* Helper function for "full" MSR writes. No need to call this if only EE is 78extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
87 * changing. */ 79extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu);
88static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) 80extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
89{ 81extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
90 if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR)) 82 struct kvm_interrupt *irq);
91 kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR); 83
92 84extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
93 vcpu->arch.msr = new_msr; 85 unsigned int op, int *advance);
94 86extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
95 if (vcpu->arch.msr & MSR_WE) 87extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
96 kvm_vcpu_block(vcpu); 88
97} 89extern int kvmppc_booke_init(void);
98 90extern void kvmppc_booke_exit(void);
99static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
100{
101 if (vcpu->arch.pid != new_pid) {
102 vcpu->arch.pid = new_pid;
103 vcpu->arch.swap_pid = 1;
104 }
105}
106 91
107extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); 92extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
108 93
diff --git a/arch/powerpc/include/asm/mmu-44x.h b/arch/powerpc/include/asm/mmu-44x.h
index 8a97cfb08b7e..27cc6fdcd3b7 100644
--- a/arch/powerpc/include/asm/mmu-44x.h
+++ b/arch/powerpc/include/asm/mmu-44x.h
@@ -56,6 +56,7 @@
56#ifndef __ASSEMBLY__ 56#ifndef __ASSEMBLY__
57 57
58extern unsigned int tlb_44x_hwater; 58extern unsigned int tlb_44x_hwater;
59extern unsigned int tlb_44x_index;
59 60
60typedef struct { 61typedef struct {
61 unsigned int id; 62 unsigned int id;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 661d07d2146b..9937fe44555f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -23,9 +23,6 @@
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/hrtimer.h> 25#include <linux/hrtimer.h>
26#ifdef CONFIG_KVM
27#include <linux/kvm_host.h>
28#endif
29#ifdef CONFIG_PPC64 26#ifdef CONFIG_PPC64
30#include <linux/time.h> 27#include <linux/time.h>
31#include <linux/hardirq.h> 28#include <linux/hardirq.h>
@@ -51,6 +48,9 @@
51#ifdef CONFIG_PPC_ISERIES 48#ifdef CONFIG_PPC_ISERIES
52#include <asm/iseries/alpaca.h> 49#include <asm/iseries/alpaca.h>
53#endif 50#endif
51#ifdef CONFIG_KVM
52#include <asm/kvm_44x.h>
53#endif
54 54
55#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) 55#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
56#include "head_booke.h" 56#include "head_booke.h"
@@ -357,12 +357,10 @@ int main(void)
357 DEFINE(PTE_SIZE, sizeof(pte_t)); 357 DEFINE(PTE_SIZE, sizeof(pte_t));
358 358
359#ifdef CONFIG_KVM 359#ifdef CONFIG_KVM
360 DEFINE(TLBE_BYTES, sizeof(struct tlbe)); 360 DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
361 361
362 DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); 362 DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
363 DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); 363 DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
364 DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
365 DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
366 DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); 364 DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
367 DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); 365 DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
368 DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); 366 DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
@@ -385,5 +383,16 @@ int main(void)
385 DEFINE(PTE_T_LOG2, PTE_T_LOG2); 383 DEFINE(PTE_T_LOG2, PTE_T_LOG2);
386#endif 384#endif
387 385
386#ifdef CONFIG_KVM_EXIT_TIMING
387 DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
388 arch.timing_exit.tv32.tbu));
389 DEFINE(VCPU_TIMING_EXIT_TBL, offsetof(struct kvm_vcpu,
390 arch.timing_exit.tv32.tbl));
391 DEFINE(VCPU_TIMING_LAST_ENTER_TBU, offsetof(struct kvm_vcpu,
392 arch.timing_last_enter.tv32.tbu));
393 DEFINE(VCPU_TIMING_LAST_ENTER_TBL, offsetof(struct kvm_vcpu,
394 arch.timing_last_enter.tv32.tbl));
395#endif
396
388 return 0; 397 return 0;
389} 398}
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
new file mode 100644
index 000000000000..a66bec57265a
--- /dev/null
+++ b/arch/powerpc/kvm/44x.c
@@ -0,0 +1,228 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright IBM Corp. 2008
16 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */
19
20#include <linux/kvm_host.h>
21#include <linux/err.h>
22
23#include <asm/reg.h>
24#include <asm/cputable.h>
25#include <asm/tlbflush.h>
26#include <asm/kvm_44x.h>
27#include <asm/kvm_ppc.h>
28
29#include "44x_tlb.h"
30
31/* Note: clearing MSR[DE] just means that the debug interrupt will not be
32 * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
33 * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
34 * will be delivered as an "imprecise debug event" (which is indicated by
35 * DBSR[IDE]).
36 */
37static void kvm44x_disable_debug_interrupts(void)
38{
39 mtmsr(mfmsr() & ~MSR_DE);
40}
41
42void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
43{
44 kvm44x_disable_debug_interrupts();
45
46 mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
47 mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
48 mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
49 mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
50 mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
51 mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
52 mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
53 mtmsr(vcpu->arch.host_msr);
54}
55
56void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
57{
58 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
59 u32 dbcr0 = 0;
60
61 vcpu->arch.host_msr = mfmsr();
62 kvm44x_disable_debug_interrupts();
63
64 /* Save host debug register state. */
65 vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
66 vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
67 vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
68 vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
69 vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
70 vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
71 vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
72
73 /* set registers up for guest */
74
75 if (dbg->bp[0]) {
76 mtspr(SPRN_IAC1, dbg->bp[0]);
77 dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
78 }
79 if (dbg->bp[1]) {
80 mtspr(SPRN_IAC2, dbg->bp[1]);
81 dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
82 }
83 if (dbg->bp[2]) {
84 mtspr(SPRN_IAC3, dbg->bp[2]);
85 dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
86 }
87 if (dbg->bp[3]) {
88 mtspr(SPRN_IAC4, dbg->bp[3]);
89 dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
90 }
91
92 mtspr(SPRN_DBCR0, dbcr0);
93 mtspr(SPRN_DBCR1, 0);
94 mtspr(SPRN_DBCR2, 0);
95}
96
97void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
98{
99 kvmppc_44x_tlb_load(vcpu);
100}
101
102void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
103{
104 kvmppc_44x_tlb_put(vcpu);
105}
106
107int kvmppc_core_check_processor_compat(void)
108{
109 int r;
110
111 if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
112 r = 0;
113 else
114 r = -ENOTSUPP;
115
116 return r;
117}
118
119int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
120{
121 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
122 struct kvmppc_44x_tlbe *tlbe = &vcpu_44x->guest_tlb[0];
123 int i;
124
125 tlbe->tid = 0;
126 tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
127 tlbe->word1 = 0;
128 tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR;
129
130 tlbe++;
131 tlbe->tid = 0;
132 tlbe->word0 = 0xef600000 | PPC44x_TLB_4K | PPC44x_TLB_VALID;
133 tlbe->word1 = 0xef600000;
134 tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR
135 | PPC44x_TLB_I | PPC44x_TLB_G;
136
137 /* Since the guest can directly access the timebase, it must know the
138 * real timebase frequency. Accordingly, it must see the state of
139 * CCR1[TCS]. */
140 vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
141
142 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
143 vcpu_44x->shadow_refs[i].gtlb_index = -1;
144
145 return 0;
146}
147
148/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
149int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
150 struct kvm_translation *tr)
151{
152 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
153 struct kvmppc_44x_tlbe *gtlbe;
154 int index;
155 gva_t eaddr;
156 u8 pid;
157 u8 as;
158
159 eaddr = tr->linear_address;
160 pid = (tr->linear_address >> 32) & 0xff;
161 as = (tr->linear_address >> 40) & 0x1;
162
163 index = kvmppc_44x_tlb_index(vcpu, eaddr, pid, as);
164 if (index == -1) {
165 tr->valid = 0;
166 return 0;
167 }
168
169 gtlbe = &vcpu_44x->guest_tlb[index];
170
171 tr->physical_address = tlb_xlate(gtlbe, eaddr);
172 /* XXX what does "writeable" and "usermode" even mean? */
173 tr->valid = 1;
174
175 return 0;
176}
177
178struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
179{
180 struct kvmppc_vcpu_44x *vcpu_44x;
181 struct kvm_vcpu *vcpu;
182 int err;
183
184 vcpu_44x = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
185 if (!vcpu_44x) {
186 err = -ENOMEM;
187 goto out;
188 }
189
190 vcpu = &vcpu_44x->vcpu;
191 err = kvm_vcpu_init(vcpu, kvm, id);
192 if (err)
193 goto free_vcpu;
194
195 return vcpu;
196
197free_vcpu:
198 kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
199out:
200 return ERR_PTR(err);
201}
202
203void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
204{
205 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
206
207 kvm_vcpu_uninit(vcpu);
208 kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
209}
210
211static int kvmppc_44x_init(void)
212{
213 int r;
214
215 r = kvmppc_booke_init();
216 if (r)
217 return r;
218
219 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
220}
221
222static void kvmppc_44x_exit(void)
223{
224 kvmppc_booke_exit();
225}
226
227module_init(kvmppc_44x_init);
228module_exit(kvmppc_44x_exit);
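
kvmppc_core_vcpu_translate() above unpacks kvm_translation.linear_address as AS|PID|EADDR: the effective address in the low 32 bits, the PID in bits 32-39 and the address-space bit in bit 40. A hypothetical helper showing how a caller could pack such a value before issuing the translation request (the function name is an illustration, not an existing KVM API):

/* Hypothetical packing helper matching the decode in
 * kvmppc_core_vcpu_translate(): EADDR in bits 0-31, PID in bits 32-39,
 * AS in bit 40. */
#include <stdint.h>

static uint64_t pack_linear_address(uint32_t eaddr, uint8_t pid, unsigned int as)
{
	return (uint64_t)eaddr |
	       ((uint64_t)pid << 32) |
	       ((uint64_t)(as & 1) << 40);
}

/* e.g. pack_linear_address(0xef600300, 1, 0) describes PID 1, AS 0,
 * effective address 0xef600300. */
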
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
new file mode 100644
index 000000000000..82489a743a6f
--- /dev/null
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -0,0 +1,371 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright IBM Corp. 2008
16 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */
19
20#include <asm/kvm_ppc.h>
21#include <asm/dcr.h>
22#include <asm/dcr-regs.h>
23#include <asm/disassemble.h>
24#include <asm/kvm_44x.h>
25#include "timing.h"
26
27#include "booke.h"
28#include "44x_tlb.h"
29
30#define OP_RFI 19
31
32#define XOP_RFI 50
33#define XOP_MFMSR 83
34#define XOP_WRTEE 131
35#define XOP_MTMSR 146
36#define XOP_WRTEEI 163
37#define XOP_MFDCR 323
38#define XOP_MTDCR 451
39#define XOP_TLBSX 914
40#define XOP_ICCCI 966
41#define XOP_TLBWE 978
42
43static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
44{
45 vcpu->arch.pc = vcpu->arch.srr0;
46 kvmppc_set_msr(vcpu, vcpu->arch.srr1);
47}
48
49int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
50 unsigned int inst, int *advance)
51{
52 int emulated = EMULATE_DONE;
53 int dcrn;
54 int ra;
55 int rb;
56 int rc;
57 int rs;
58 int rt;
59 int ws;
60
61 switch (get_op(inst)) {
62 case OP_RFI:
63 switch (get_xop(inst)) {
64 case XOP_RFI:
65 kvmppc_emul_rfi(vcpu);
66 kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS);
67 *advance = 0;
68 break;
69
70 default:
71 emulated = EMULATE_FAIL;
72 break;
73 }
74 break;
75
76 case 31:
77 switch (get_xop(inst)) {
78
79 case XOP_MFMSR:
80 rt = get_rt(inst);
81 vcpu->arch.gpr[rt] = vcpu->arch.msr;
82 kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
83 break;
84
85 case XOP_MTMSR:
86 rs = get_rs(inst);
87 kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
88 kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
89 break;
90
91 case XOP_WRTEE:
92 rs = get_rs(inst);
93 vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
94 | (vcpu->arch.gpr[rs] & MSR_EE);
95 kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
96 break;
97
98 case XOP_WRTEEI:
99 vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
100 | (inst & MSR_EE);
101 kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
102 break;
103
104 case XOP_MFDCR:
105 dcrn = get_dcrn(inst);
106 rt = get_rt(inst);
107
108 /* The guest may access CPR0 registers to determine the timebase
109 * frequency, and it must know the real host frequency because it
110 * can directly access the timebase registers.
111 *
112 * It would be possible to emulate those accesses in userspace,
113 * but userspace can really only figure out the end frequency.
114 * We could decompose that into the factors that compute it, but
115 * that's tricky math, and it's easier to just report the real
116 * CPR0 values.
117 */
118 switch (dcrn) {
119 case DCRN_CPR0_CONFIG_ADDR:
120 vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
121 break;
122 case DCRN_CPR0_CONFIG_DATA:
123 local_irq_disable();
124 mtdcr(DCRN_CPR0_CONFIG_ADDR,
125 vcpu->arch.cpr0_cfgaddr);
126 vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
127 local_irq_enable();
128 break;
129 default:
130 run->dcr.dcrn = dcrn;
131 run->dcr.data = 0;
132 run->dcr.is_write = 0;
133 vcpu->arch.io_gpr = rt;
134 vcpu->arch.dcr_needed = 1;
135 kvmppc_account_exit(vcpu, DCR_EXITS);
136 emulated = EMULATE_DO_DCR;
137 }
138
139 break;
140
141 case XOP_MTDCR:
142 dcrn = get_dcrn(inst);
143 rs = get_rs(inst);
144
145 /* emulate some access in kernel */
146 switch (dcrn) {
147 case DCRN_CPR0_CONFIG_ADDR:
148 vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
149 break;
150 default:
151 run->dcr.dcrn = dcrn;
152 run->dcr.data = vcpu->arch.gpr[rs];
153 run->dcr.is_write = 1;
154 vcpu->arch.dcr_needed = 1;
155 kvmppc_account_exit(vcpu, DCR_EXITS);
156 emulated = EMULATE_DO_DCR;
157 }
158
159 break;
160
161 case XOP_TLBWE:
162 ra = get_ra(inst);
163 rs = get_rs(inst);
164 ws = get_ws(inst);
165 emulated = kvmppc_44x_emul_tlbwe(vcpu, ra, rs, ws);
166 break;
167
168 case XOP_TLBSX:
169 rt = get_rt(inst);
170 ra = get_ra(inst);
171 rb = get_rb(inst);
172 rc = get_rc(inst);
173 emulated = kvmppc_44x_emul_tlbsx(vcpu, rt, ra, rb, rc);
174 break;
175
176 case XOP_ICCCI:
177 break;
178
179 default:
180 emulated = EMULATE_FAIL;
181 }
182
183 break;
184
185 default:
186 emulated = EMULATE_FAIL;
187 }
188
189 return emulated;
190}
191
192int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
193{
194 switch (sprn) {
195 case SPRN_MMUCR:
196 vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
197 case SPRN_PID:
198 kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
199 case SPRN_CCR0:
200 vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
201 case SPRN_CCR1:
202 vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
203 case SPRN_DEAR:
204 vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
205 case SPRN_ESR:
206 vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
207 case SPRN_DBCR0:
208 vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
209 case SPRN_DBCR1:
210 vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
211 case SPRN_TSR:
212 vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
213 case SPRN_TCR:
214 vcpu->arch.tcr = vcpu->arch.gpr[rs];
215 kvmppc_emulate_dec(vcpu);
216 break;
217
218 /* Note: SPRG4-7 are user-readable. These values are
219 * loaded into the real SPRGs when resuming the
220 * guest. */
221 case SPRN_SPRG4:
222 vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
223 case SPRN_SPRG5:
224 vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
225 case SPRN_SPRG6:
226 vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
227 case SPRN_SPRG7:
228 vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
229
230 case SPRN_IVPR:
231 vcpu->arch.ivpr = vcpu->arch.gpr[rs];
232 break;
233 case SPRN_IVOR0:
234 vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs];
235 break;
236 case SPRN_IVOR1:
237 vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs];
238 break;
239 case SPRN_IVOR2:
240 vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs];
241 break;
242 case SPRN_IVOR3:
243 vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs];
244 break;
245 case SPRN_IVOR4:
246 vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs];
247 break;
248 case SPRN_IVOR5:
249 vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs];
250 break;
251 case SPRN_IVOR6:
252 vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs];
253 break;
254 case SPRN_IVOR7:
255 vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs];
256 break;
257 case SPRN_IVOR8:
258 vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs];
259 break;
260 case SPRN_IVOR9:
261 vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs];
262 break;
263 case SPRN_IVOR10:
264 vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs];
265 break;
266 case SPRN_IVOR11:
267 vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs];
268 break;
269 case SPRN_IVOR12:
270 vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs];
271 break;
272 case SPRN_IVOR13:
273 vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs];
274 break;
275 case SPRN_IVOR14:
276 vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs];
277 break;
278 case SPRN_IVOR15:
279 vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs];
280 break;
281
282 default:
283 return EMULATE_FAIL;
284 }
285
286 kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
287 return EMULATE_DONE;
288}
289
290int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
291{
292 switch (sprn) {
293 /* 440 */
294 case SPRN_MMUCR:
295 vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
296 case SPRN_CCR0:
297 vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
298 case SPRN_CCR1:
299 vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
300
301 /* Book E */
302 case SPRN_PID:
303 vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
304 case SPRN_IVPR:
305 vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
306 case SPRN_DEAR:
307 vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
308 case SPRN_ESR:
309 vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
310 case SPRN_DBCR0:
311 vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
312 case SPRN_DBCR1:
313 vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
314
315 case SPRN_IVOR0:
316 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
317 break;
318 case SPRN_IVOR1:
319 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
320 break;
321 case SPRN_IVOR2:
322 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
323 break;
324 case SPRN_IVOR3:
325 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
326 break;
327 case SPRN_IVOR4:
328 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
329 break;
330 case SPRN_IVOR5:
331 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
332 break;
333 case SPRN_IVOR6:
334 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
335 break;
336 case SPRN_IVOR7:
337 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
338 break;
339 case SPRN_IVOR8:
340 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
341 break;
342 case SPRN_IVOR9:
343 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
344 break;
345 case SPRN_IVOR10:
346 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
347 break;
348 case SPRN_IVOR11:
349 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
350 break;
351 case SPRN_IVOR12:
352 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
353 break;
354 case SPRN_IVOR13:
355 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
356 break;
357 case SPRN_IVOR14:
358 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
359 break;
360 case SPRN_IVOR15:
361 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
362 break;
363
364 default:
365 return EMULATE_FAIL;
366 }
367
368 kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
369 return EMULATE_DONE;
370}
371
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index ad72c6f9811f..9a34b8edb9e2 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -22,20 +22,103 @@
22#include <linux/kvm.h> 22#include <linux/kvm.h>
23#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25
26#include <asm/tlbflush.h>
25#include <asm/mmu-44x.h> 27#include <asm/mmu-44x.h>
26#include <asm/kvm_ppc.h> 28#include <asm/kvm_ppc.h>
29#include <asm/kvm_44x.h>
30#include "timing.h"
27 31
28#include "44x_tlb.h" 32#include "44x_tlb.h"
29 33
34#ifndef PPC44x_TLBE_SIZE
35#define PPC44x_TLBE_SIZE PPC44x_TLB_4K
36#endif
37
38#define PAGE_SIZE_4K (1<<12)
39#define PAGE_MASK_4K (~(PAGE_SIZE_4K - 1))
40
41#define PPC44x_TLB_UATTR_MASK \
42 (PPC44x_TLB_U0|PPC44x_TLB_U1|PPC44x_TLB_U2|PPC44x_TLB_U3)
30#define PPC44x_TLB_USER_PERM_MASK (PPC44x_TLB_UX|PPC44x_TLB_UR|PPC44x_TLB_UW) 43#define PPC44x_TLB_USER_PERM_MASK (PPC44x_TLB_UX|PPC44x_TLB_UR|PPC44x_TLB_UW)
31#define PPC44x_TLB_SUPER_PERM_MASK (PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW) 44#define PPC44x_TLB_SUPER_PERM_MASK (PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW)
32 45
33static unsigned int kvmppc_tlb_44x_pos; 46#ifdef DEBUG
47void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
48{
49 struct kvmppc_44x_tlbe *tlbe;
50 int i;
51
52 printk("vcpu %d TLB dump:\n", vcpu->vcpu_id);
53 printk("| %2s | %3s | %8s | %8s | %8s |\n",
54 "nr", "tid", "word0", "word1", "word2");
55
56 for (i = 0; i < ARRAY_SIZE(vcpu_44x->guest_tlb); i++) {
57 tlbe = &vcpu_44x->guest_tlb[i];
58 if (tlbe->word0 & PPC44x_TLB_VALID)
59 printk(" G%2d | %02X | %08X | %08X | %08X |\n",
60 i, tlbe->tid, tlbe->word0, tlbe->word1,
61 tlbe->word2);
62 }
63}
64#endif
65
66static inline void kvmppc_44x_tlbie(unsigned int index)
67{
68 /* 0 <= index < 64, so the V bit is clear and we can use the index as
69 * word0. */
70 asm volatile(
71 "tlbwe %[index], %[index], 0\n"
72 :
73 : [index] "r"(index)
74 );
75}
76
77static inline void kvmppc_44x_tlbre(unsigned int index,
78 struct kvmppc_44x_tlbe *tlbe)
79{
80 asm volatile(
81 "tlbre %[word0], %[index], 0\n"
82 "mfspr %[tid], %[sprn_mmucr]\n"
83 "andi. %[tid], %[tid], 0xff\n"
84 "tlbre %[word1], %[index], 1\n"
85 "tlbre %[word2], %[index], 2\n"
86 : [word0] "=r"(tlbe->word0),
87 [word1] "=r"(tlbe->word1),
88 [word2] "=r"(tlbe->word2),
89 [tid] "=r"(tlbe->tid)
90 : [index] "r"(index),
91 [sprn_mmucr] "i"(SPRN_MMUCR)
92 : "cc"
93 );
94}
95
96static inline void kvmppc_44x_tlbwe(unsigned int index,
97 struct kvmppc_44x_tlbe *stlbe)
98{
99 unsigned long tmp;
100
101 asm volatile(
102 "mfspr %[tmp], %[sprn_mmucr]\n"
103 "rlwimi %[tmp], %[tid], 0, 0xff\n"
104 "mtspr %[sprn_mmucr], %[tmp]\n"
105 "tlbwe %[word0], %[index], 0\n"
106 "tlbwe %[word1], %[index], 1\n"
107 "tlbwe %[word2], %[index], 2\n"
108 : [tmp] "=&r"(tmp)
109 : [word0] "r"(stlbe->word0),
110 [word1] "r"(stlbe->word1),
111 [word2] "r"(stlbe->word2),
112 [tid] "r"(stlbe->tid),
113 [index] "r"(index),
114 [sprn_mmucr] "i"(SPRN_MMUCR)
115 );
116}
34 117
35static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode) 118static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
36{ 119{
37 /* Mask off reserved bits. */ 120 /* We only care about the guest's permission and user bits. */
38 attrib &= PPC44x_TLB_PERM_MASK|PPC44x_TLB_ATTR_MASK; 121 attrib &= PPC44x_TLB_PERM_MASK|PPC44x_TLB_UATTR_MASK;
39 122
40 if (!usermode) { 123 if (!usermode) {
41 /* Guest is in supervisor mode, so we need to translate guest 124 /* Guest is in supervisor mode, so we need to translate guest
@@ -47,18 +130,60 @@ static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
47 /* Make sure host can always access this memory. */ 130 /* Make sure host can always access this memory. */
48 attrib |= PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW; 131 attrib |= PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW;
49 132
133 /* WIMGE = 0b00100 */
134 attrib |= PPC44x_TLB_M;
135
50 return attrib; 136 return attrib;
51} 137}
52 138
139/* Load shadow TLB back into hardware. */
140void kvmppc_44x_tlb_load(struct kvm_vcpu *vcpu)
141{
142 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
143 int i;
144
145 for (i = 0; i <= tlb_44x_hwater; i++) {
146 struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
147
148 if (get_tlb_v(stlbe) && get_tlb_ts(stlbe))
149 kvmppc_44x_tlbwe(i, stlbe);
150 }
151}
152
153static void kvmppc_44x_tlbe_set_modified(struct kvmppc_vcpu_44x *vcpu_44x,
154 unsigned int i)
155{
156 vcpu_44x->shadow_tlb_mod[i] = 1;
157}
158
159/* Save hardware TLB to the vcpu, and invalidate all guest mappings. */
160void kvmppc_44x_tlb_put(struct kvm_vcpu *vcpu)
161{
162 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
163 int i;
164
165 for (i = 0; i <= tlb_44x_hwater; i++) {
166 struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
167
168 if (vcpu_44x->shadow_tlb_mod[i])
169 kvmppc_44x_tlbre(i, stlbe);
170
171 if (get_tlb_v(stlbe) && get_tlb_ts(stlbe))
172 kvmppc_44x_tlbie(i);
173 }
174}
175
176
53/* Search the guest TLB for a matching entry. */ 177/* Search the guest TLB for a matching entry. */
54int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid, 178int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
55 unsigned int as) 179 unsigned int as)
56{ 180{
181 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
57 int i; 182 int i;
58 183
59 /* XXX Replace loop with fancy data structures. */ 184 /* XXX Replace loop with fancy data structures. */
60 for (i = 0; i < PPC44x_TLB_SIZE; i++) { 185 for (i = 0; i < ARRAY_SIZE(vcpu_44x->guest_tlb); i++) {
61 struct tlbe *tlbe = &vcpu->arch.guest_tlb[i]; 186 struct kvmppc_44x_tlbe *tlbe = &vcpu_44x->guest_tlb[i];
62 unsigned int tid; 187 unsigned int tid;
63 188
64 if (eaddr < get_tlb_eaddr(tlbe)) 189 if (eaddr < get_tlb_eaddr(tlbe))
@@ -83,78 +208,89 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
83 return -1; 208 return -1;
84} 209}
85 210
86struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr) 211int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
87{ 212{
88 unsigned int as = !!(vcpu->arch.msr & MSR_IS); 213 unsigned int as = !!(vcpu->arch.msr & MSR_IS);
89 unsigned int index;
90 214
91 index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as); 215 return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
92 if (index == -1)
93 return NULL;
94 return &vcpu->arch.guest_tlb[index];
95} 216}
96 217
97struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr) 218int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
98{ 219{
99 unsigned int as = !!(vcpu->arch.msr & MSR_DS); 220 unsigned int as = !!(vcpu->arch.msr & MSR_DS);
100 unsigned int index;
101 221
102 index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as); 222 return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
103 if (index == -1)
104 return NULL;
105 return &vcpu->arch.guest_tlb[index];
106} 223}
107 224
108static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe) 225static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
226 unsigned int stlb_index)
109{ 227{
110 return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW); 228 struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[stlb_index];
111}
112 229
113static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu, 230 if (!ref->page)
114 unsigned int index) 231 return;
115{
116 struct tlbe *stlbe = &vcpu->arch.shadow_tlb[index];
117 struct page *page = vcpu->arch.shadow_pages[index];
118 232
119 if (get_tlb_v(stlbe)) { 233 /* Discard from the TLB. */
120 if (kvmppc_44x_tlbe_is_writable(stlbe)) 234 /* Note: we could actually invalidate a host mapping, if the host overwrote
121 kvm_release_page_dirty(page); 235 * this TLB entry since we inserted a guest mapping. */
122 else 236 kvmppc_44x_tlbie(stlb_index);
123 kvm_release_page_clean(page); 237
124 } 238 /* Now release the page. */
239 if (ref->writeable)
240 kvm_release_page_dirty(ref->page);
241 else
242 kvm_release_page_clean(ref->page);
243
244 ref->page = NULL;
245
246 /* XXX set tlb_44x_index to stlb_index? */
247
248 KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler);
125} 249}
126 250
127void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu) 251void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu)
128{ 252{
253 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
129 int i; 254 int i;
130 255
131 for (i = 0; i <= tlb_44x_hwater; i++) 256 for (i = 0; i <= tlb_44x_hwater; i++)
132 kvmppc_44x_shadow_release(vcpu, i); 257 kvmppc_44x_shadow_release(vcpu_44x, i);
133}
134
135void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
136{
137 vcpu->arch.shadow_tlb_mod[i] = 1;
138} 258}
139 259
140/* Caller must ensure that the specified guest TLB entry is safe to insert into 260/**
141 * the shadow TLB. */ 261 * kvmppc_mmu_map -- create a host mapping for guest memory
142void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid, 262 *
143 u32 flags) 263 * If the guest wanted a larger page than the host supports, only the first
264 * host page is mapped here and the rest are demand faulted.
265 *
266 * If the guest wanted a smaller page than the host page size, we map only the
267 * guest-size page (i.e. not a full host page mapping).
268 *
269 * Caller must ensure that the specified guest TLB entry is safe to insert into
270 * the shadow TLB.
271 */
272void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
273 u32 flags, u32 max_bytes, unsigned int gtlb_index)
144{ 274{
275 struct kvmppc_44x_tlbe stlbe;
276 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
277 struct kvmppc_44x_shadow_ref *ref;
145 struct page *new_page; 278 struct page *new_page;
146 struct tlbe *stlbe;
147 hpa_t hpaddr; 279 hpa_t hpaddr;
280 gfn_t gfn;
148 unsigned int victim; 281 unsigned int victim;
149 282
150 /* Future optimization: don't overwrite the TLB entry containing the 283 /* Select TLB entry to clobber. Indirectly guard against races with the TLB
151 * current PC (or stack?). */ 284 * miss handler by disabling interrupts. */
152 victim = kvmppc_tlb_44x_pos++; 285 local_irq_disable();
153 if (kvmppc_tlb_44x_pos > tlb_44x_hwater) 286 victim = ++tlb_44x_index;
154 kvmppc_tlb_44x_pos = 0; 287 if (victim > tlb_44x_hwater)
155 stlbe = &vcpu->arch.shadow_tlb[victim]; 288 victim = 0;
289 tlb_44x_index = victim;
290 local_irq_enable();
156 291
157 /* Get reference to new page. */ 292 /* Get reference to new page. */
293 gfn = gpaddr >> PAGE_SHIFT;
158 new_page = gfn_to_page(vcpu->kvm, gfn); 294 new_page = gfn_to_page(vcpu->kvm, gfn);
159 if (is_error_page(new_page)) { 295 if (is_error_page(new_page)) {
160 printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); 296 printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
@@ -163,10 +299,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
163 } 299 }
164 hpaddr = page_to_phys(new_page); 300 hpaddr = page_to_phys(new_page);
165 301
166 /* Drop reference to old page. */ 302 /* Invalidate any previous shadow mappings. */
167 kvmppc_44x_shadow_release(vcpu, victim); 303 kvmppc_44x_shadow_release(vcpu_44x, victim);
168
169 vcpu->arch.shadow_pages[victim] = new_page;
170 304
171 /* XXX Make sure (va, size) doesn't overlap any other 305 /* XXX Make sure (va, size) doesn't overlap any other
172 * entries. 440x6 user manual says the result would be 306 * entries. 440x6 user manual says the result would be
@@ -174,78 +308,193 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
174 308
175 /* XXX what about AS? */ 309 /* XXX what about AS? */
176 310
177 stlbe->tid = !(asid & 0xff);
178
179 /* Force TS=1 for all guest mappings. */ 311 /* Force TS=1 for all guest mappings. */
180 /* For now we hardcode 4KB mappings, but it will be important to 312 stlbe.word0 = PPC44x_TLB_VALID | PPC44x_TLB_TS;
181 * use host large pages in the future. */ 313
182 stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS 314 if (max_bytes >= PAGE_SIZE) {
183 | PPC44x_TLB_4K; 315 /* Guest mapping is larger than or equal to host page size. We can use
184 stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf); 316 * a "native" host mapping. */
185 stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags, 317 stlbe.word0 |= (gvaddr & PAGE_MASK) | PPC44x_TLBE_SIZE;
186 vcpu->arch.msr & MSR_PR); 318 } else {
187 kvmppc_tlbe_set_modified(vcpu, victim); 319 /* Guest mapping is smaller than host page size. We must restrict the
320 * size of the mapping to be at most the smaller of the two, but for
321 * simplicity we fall back to a 4K mapping (this is probably what the
322 * guest is using anyways). */
323 stlbe.word0 |= (gvaddr & PAGE_MASK_4K) | PPC44x_TLB_4K;
324
325 /* 'hpaddr' is a host page, which is larger than the mapping we're
326 * inserting here. To compensate, we must add the in-page offset to the
327 * sub-page. */
328 hpaddr |= gpaddr & (PAGE_MASK ^ PAGE_MASK_4K);
329 }
188 330
189 KVMTRACE_5D(STLB_WRITE, vcpu, victim, 331 stlbe.word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
190 stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2, 332 stlbe.word2 = kvmppc_44x_tlb_shadow_attrib(flags,
191 handler); 333 vcpu->arch.msr & MSR_PR);
334 stlbe.tid = !(asid & 0xff);
335
336 /* Keep track of the reference so we can properly release it later. */
337 ref = &vcpu_44x->shadow_refs[victim];
338 ref->page = new_page;
339 ref->gtlb_index = gtlb_index;
340 ref->writeable = !!(stlbe.word2 & PPC44x_TLB_UW);
341 ref->tid = stlbe.tid;
342
343 /* Insert shadow mapping into hardware TLB. */
344 kvmppc_44x_tlbe_set_modified(vcpu_44x, victim);
345 kvmppc_44x_tlbwe(victim, &stlbe);
346 KVMTRACE_5D(STLB_WRITE, vcpu, victim, stlbe.tid, stlbe.word0, stlbe.word1,
347 stlbe.word2, handler);
192} 348}
193 349
194void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr, 350/* For a particular guest TLB entry, invalidate the corresponding host TLB
195 gva_t eend, u32 asid) 351 * mappings and release the host pages. */
352static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu,
353 unsigned int gtlb_index)
196{ 354{
197 unsigned int pid = !(asid & 0xff); 355 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
198 int i; 356 int i;
199 357
200 /* XXX Replace loop with fancy data structures. */ 358 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) {
201 for (i = 0; i <= tlb_44x_hwater; i++) { 359 struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[i];
202 struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i]; 360 if (ref->gtlb_index == gtlb_index)
203 unsigned int tid; 361 kvmppc_44x_shadow_release(vcpu_44x, i);
362 }
363}
204 364
205 if (!get_tlb_v(stlbe)) 365void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
206 continue; 366{
367 vcpu->arch.shadow_pid = !usermode;
368}
207 369
208 if (eend < get_tlb_eaddr(stlbe)) 370void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
209 continue; 371{
372 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
373 int i;
210 374
211 if (eaddr > get_tlb_end(stlbe)) 375 if (unlikely(vcpu->arch.pid == new_pid))
212 continue; 376 return;
213 377
214 tid = get_tlb_tid(stlbe); 378 vcpu->arch.pid = new_pid;
215 if (tid && (tid != pid))
216 continue;
217 379
218 kvmppc_44x_shadow_release(vcpu, i); 380 /* Guest userspace runs with TID=0 mappings and PID=0, to make sure it
219 stlbe->word0 = 0; 381 * can't access guest kernel mappings (TID=1). When we switch to a new
220 kvmppc_tlbe_set_modified(vcpu, i); 382 * guest PID, which will also use host PID=0, we must discard the old guest
221 KVMTRACE_5D(STLB_INVAL, vcpu, i, 383 * userspace mappings. */
222 stlbe->tid, stlbe->word0, stlbe->word1, 384 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) {
223 stlbe->word2, handler); 385 struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[i];
386
387 if (ref->tid == 0)
388 kvmppc_44x_shadow_release(vcpu_44x, i);
224 } 389 }
225} 390}
226 391
227/* Invalidate all mappings on the privilege switch after PID has been changed. 392static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
228 * The guest always runs with PID=1, so we must clear the entire TLB when 393 const struct kvmppc_44x_tlbe *tlbe)
229 * switching address spaces. */
230void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
231{ 394{
232 int i; 395 gpa_t gpa;
233 396
234 if (vcpu->arch.swap_pid) { 397 if (!get_tlb_v(tlbe))
235 /* XXX Replace loop with fancy data structures. */ 398 return 0;
236 for (i = 0; i <= tlb_44x_hwater; i++) { 399
237 struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i]; 400 /* Does it match current guest AS? */
238 401 /* XXX what about IS != DS? */
239 /* Future optimization: clear only userspace mappings. */ 402 if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
240 kvmppc_44x_shadow_release(vcpu, i); 403 return 0;
241 stlbe->word0 = 0; 404
242 kvmppc_tlbe_set_modified(vcpu, i); 405 gpa = get_tlb_raddr(tlbe);
243 KVMTRACE_5D(STLB_INVAL, vcpu, i, 406 if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
244 stlbe->tid, stlbe->word0, stlbe->word1, 407 /* Mapping is not for RAM. */
245 stlbe->word2, handler); 408 return 0;
246 } 409
247 vcpu->arch.swap_pid = 0; 410 return 1;
411}
412
413int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
414{
415 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
416 struct kvmppc_44x_tlbe *tlbe;
417 unsigned int gtlb_index;
418
419 gtlb_index = vcpu->arch.gpr[ra];
420 if (gtlb_index > KVM44x_GUEST_TLB_SIZE) {
421 printk("%s: index %d\n", __func__, gtlb_index);
422 kvmppc_dump_vcpu(vcpu);
423 return EMULATE_FAIL;
248 } 424 }
249 425
250 vcpu->arch.shadow_pid = !usermode; 426 tlbe = &vcpu_44x->guest_tlb[gtlb_index];
427
428 /* Invalidate shadow mappings for the about-to-be-clobbered TLB entry. */
429 if (tlbe->word0 & PPC44x_TLB_VALID)
430 kvmppc_44x_invalidate(vcpu, gtlb_index);
431
432 switch (ws) {
433 case PPC44x_TLB_PAGEID:
434 tlbe->tid = get_mmucr_stid(vcpu);
435 tlbe->word0 = vcpu->arch.gpr[rs];
436 break;
437
438 case PPC44x_TLB_XLAT:
439 tlbe->word1 = vcpu->arch.gpr[rs];
440 break;
441
442 case PPC44x_TLB_ATTRIB:
443 tlbe->word2 = vcpu->arch.gpr[rs];
444 break;
445
446 default:
447 return EMULATE_FAIL;
448 }
449
450 if (tlbe_is_host_safe(vcpu, tlbe)) {
451 u64 asid;
452 gva_t eaddr;
453 gpa_t gpaddr;
454 u32 flags;
455 u32 bytes;
456
457 eaddr = get_tlb_eaddr(tlbe);
458 gpaddr = get_tlb_raddr(tlbe);
459
460 /* Use the advertised page size to mask effective and real addrs. */
461 bytes = get_tlb_bytes(tlbe);
462 eaddr &= ~(bytes - 1);
463 gpaddr &= ~(bytes - 1);
464
465 asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
466 flags = tlbe->word2 & 0xffff;
467
468 kvmppc_mmu_map(vcpu, eaddr, gpaddr, asid, flags, bytes, gtlb_index);
469 }
470
471 KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0,
472 tlbe->word1, tlbe->word2, handler);
473
474 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
475 return EMULATE_DONE;
476}
477
478int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
479{
480 u32 ea;
481 int gtlb_index;
482 unsigned int as = get_mmucr_sts(vcpu);
483 unsigned int pid = get_mmucr_stid(vcpu);
484
485 ea = vcpu->arch.gpr[rb];
486 if (ra)
487 ea += vcpu->arch.gpr[ra];
488
489 gtlb_index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
490 if (rc) {
491 if (gtlb_index < 0)
492 vcpu->arch.cr &= ~0x20000000;
493 else
494 vcpu->arch.cr |= 0x20000000;
495 }
496 vcpu->arch.gpr[rt] = gtlb_index;
497
498 kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
499 return EMULATE_DONE;
251} 500}
diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h
index 2ccd46b6f6b7..772191f29e62 100644
--- a/arch/powerpc/kvm/44x_tlb.h
+++ b/arch/powerpc/kvm/44x_tlb.h
@@ -25,48 +25,52 @@
25 25
26extern int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, 26extern int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr,
27 unsigned int pid, unsigned int as); 27 unsigned int pid, unsigned int as);
28extern struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr); 28extern int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
29extern struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr); 29extern int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
30
31extern int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb,
32 u8 rc);
33extern int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws);
30 34
31/* TLB helper functions */ 35/* TLB helper functions */
32static inline unsigned int get_tlb_size(const struct tlbe *tlbe) 36static inline unsigned int get_tlb_size(const struct kvmppc_44x_tlbe *tlbe)
33{ 37{
34 return (tlbe->word0 >> 4) & 0xf; 38 return (tlbe->word0 >> 4) & 0xf;
35} 39}
36 40
37static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) 41static inline gva_t get_tlb_eaddr(const struct kvmppc_44x_tlbe *tlbe)
38{ 42{
39 return tlbe->word0 & 0xfffffc00; 43 return tlbe->word0 & 0xfffffc00;
40} 44}
41 45
42static inline gva_t get_tlb_bytes(const struct tlbe *tlbe) 46static inline gva_t get_tlb_bytes(const struct kvmppc_44x_tlbe *tlbe)
43{ 47{
44 unsigned int pgsize = get_tlb_size(tlbe); 48 unsigned int pgsize = get_tlb_size(tlbe);
45 return 1 << 10 << (pgsize << 1); 49 return 1 << 10 << (pgsize << 1);
46} 50}
47 51
48static inline gva_t get_tlb_end(const struct tlbe *tlbe) 52static inline gva_t get_tlb_end(const struct kvmppc_44x_tlbe *tlbe)
49{ 53{
50 return get_tlb_eaddr(tlbe) + get_tlb_bytes(tlbe) - 1; 54 return get_tlb_eaddr(tlbe) + get_tlb_bytes(tlbe) - 1;
51} 55}
52 56
53static inline u64 get_tlb_raddr(const struct tlbe *tlbe) 57static inline u64 get_tlb_raddr(const struct kvmppc_44x_tlbe *tlbe)
54{ 58{
55 u64 word1 = tlbe->word1; 59 u64 word1 = tlbe->word1;
56 return ((word1 & 0xf) << 32) | (word1 & 0xfffffc00); 60 return ((word1 & 0xf) << 32) | (word1 & 0xfffffc00);
57} 61}
58 62
59static inline unsigned int get_tlb_tid(const struct tlbe *tlbe) 63static inline unsigned int get_tlb_tid(const struct kvmppc_44x_tlbe *tlbe)
60{ 64{
61 return tlbe->tid & 0xff; 65 return tlbe->tid & 0xff;
62} 66}
63 67
64static inline unsigned int get_tlb_ts(const struct tlbe *tlbe) 68static inline unsigned int get_tlb_ts(const struct kvmppc_44x_tlbe *tlbe)
65{ 69{
66 return (tlbe->word0 >> 8) & 0x1; 70 return (tlbe->word0 >> 8) & 0x1;
67} 71}
68 72
69static inline unsigned int get_tlb_v(const struct tlbe *tlbe) 73static inline unsigned int get_tlb_v(const struct kvmppc_44x_tlbe *tlbe)
70{ 74{
71 return (tlbe->word0 >> 9) & 0x1; 75 return (tlbe->word0 >> 9) & 0x1;
72} 76}
@@ -81,7 +85,7 @@ static inline unsigned int get_mmucr_sts(const struct kvm_vcpu *vcpu)
81 return (vcpu->arch.mmucr >> 16) & 0x1; 85 return (vcpu->arch.mmucr >> 16) & 0x1;
82} 86}
83 87
84static inline gpa_t tlb_xlate(struct tlbe *tlbe, gva_t eaddr) 88static inline gpa_t tlb_xlate(struct kvmppc_44x_tlbe *tlbe, gva_t eaddr)
85{ 89{
86 unsigned int pgmask = get_tlb_bytes(tlbe) - 1; 90 unsigned int pgmask = get_tlb_bytes(tlbe) - 1;
87 91
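
get_tlb_bytes() converts the 4-bit SIZE field into a byte count as 1 KiB scaled by 4^SIZE, so SIZE=1 corresponds to the 4K encoding and SIZE=7 to the 16M encoding used for the initial guest mapping in kvmppc_core_vcpu_setup(). A quick stand-alone check of that decode:

/* Verifies the SIZE-field decode in get_tlb_bytes(): bytes = 1 KiB << (2*SIZE). */
#include <stdio.h>

int main(void)
{
	unsigned int size;

	for (size = 0; size <= 7; size++)
		printf("SIZE=%u -> %u bytes\n", size, 1u << 10 << (size << 1));
	/* SIZE=0 -> 1024, SIZE=1 -> 4096, ..., SIZE=7 -> 16777216 */
	return 0;
}
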
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 53aaa66b25e5..6dbdc4817d80 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -15,27 +15,33 @@ menuconfig VIRTUALIZATION
15if VIRTUALIZATION 15if VIRTUALIZATION
16 16
17config KVM 17config KVM
18 bool "Kernel-based Virtual Machine (KVM) support" 18 bool
19 depends on 44x && EXPERIMENTAL
20 select PREEMPT_NOTIFIERS 19 select PREEMPT_NOTIFIERS
21 select ANON_INODES 20 select ANON_INODES
22 # We can only run on Book E hosts so far 21
23 select KVM_BOOKE_HOST 22config KVM_440
23 bool "KVM support for PowerPC 440 processors"
24 depends on EXPERIMENTAL && 44x
25 select KVM
24 ---help--- 26 ---help---
25 Support hosting virtualized guest machines. You will also 27 Support running unmodified 440 guest kernels in virtual machines on
26 need to select one or more of the processor modules below. 28 440 host processors.
27 29
28 This module provides access to the hardware capabilities through 30 This module provides access to the hardware capabilities through
29 a character device node named /dev/kvm. 31 a character device node named /dev/kvm.
30 32
31 If unsure, say N. 33 If unsure, say N.
32 34
33config KVM_BOOKE_HOST 35config KVM_EXIT_TIMING
34 bool "KVM host support for Book E PowerPC processors" 36 bool "Detailed exit timing"
35 depends on KVM && 44x 37 depends on KVM
36 ---help--- 38 ---help---
37 Provides host support for KVM on Book E PowerPC processors. Currently 39 Calculate elapsed time for every exit/enter cycle. A per-vcpu
38 this works on 440 processors only. 40 report is available in debugfs kvm/vm#_vcpu#_timing.
41 The overhead is relatively small, however it is not recommended for
42 production environments.
43
44 If unsure, say N.
39 45
40config KVM_TRACE 46config KVM_TRACE
41 bool "KVM trace support" 47 bool "KVM trace support"
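
With this split, the user-visible choice moves from the old KVM prompt (which selected KVM_BOOKE_HOST) to a per-core option that selects a now-hidden KVM symbol. Assuming nothing else in the tree interferes, a 440 host would end up with roughly the following (.config fragment, illustrative only):

CONFIG_VIRTUALIZATION=y
# KVM itself is hidden now and gets pulled in by the per-core option:
CONFIG_KVM=y
CONFIG_KVM_440=y
# optional: per-vcpu exit/enter timing reports in debugfs
CONFIG_KVM_EXIT_TIMING=y
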
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 2a5d4397ac4b..df7ba59e6d53 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -8,10 +8,16 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
8 8
9common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) 9common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o)
10 10
11kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o 11kvm-objs := $(common-objs-y) powerpc.o emulate.o
12obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
12obj-$(CONFIG_KVM) += kvm.o 13obj-$(CONFIG_KVM) += kvm.o
13 14
14AFLAGS_booke_interrupts.o := -I$(obj) 15AFLAGS_booke_interrupts.o := -I$(obj)
15 16
16kvm-booke-host-objs := booke_host.o booke_interrupts.o 44x_tlb.o 17kvm-440-objs := \
17obj-$(CONFIG_KVM_BOOKE_HOST) += kvm-booke-host.o 18 booke.o \
19 booke_interrupts.o \
20 44x.o \
21 44x_tlb.o \
22 44x_emulate.o
23obj-$(CONFIG_KVM_440) += kvm-440.o
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke.c
index 7b2591e26bae..35485dd6927e 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke.c
@@ -24,21 +24,26 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27
27#include <asm/cputable.h> 28#include <asm/cputable.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29#include <asm/kvm_ppc.h> 30#include <asm/kvm_ppc.h>
31#include "timing.h"
32#include <asm/cacheflush.h>
33#include <asm/kvm_44x.h>
30 34
35#include "booke.h"
31#include "44x_tlb.h" 36#include "44x_tlb.h"
32 37
38unsigned long kvmppc_booke_handlers;
39
33#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 40#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
34#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 41#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
35 42
36struct kvm_stats_debugfs_item debugfs_entries[] = { 43struct kvm_stats_debugfs_item debugfs_entries[] = {
37 { "exits", VCPU_STAT(sum_exits) },
38 { "mmio", VCPU_STAT(mmio_exits) }, 44 { "mmio", VCPU_STAT(mmio_exits) },
39 { "dcr", VCPU_STAT(dcr_exits) }, 45 { "dcr", VCPU_STAT(dcr_exits) },
40 { "sig", VCPU_STAT(signal_exits) }, 46 { "sig", VCPU_STAT(signal_exits) },
41 { "light", VCPU_STAT(light_exits) },
42 { "itlb_r", VCPU_STAT(itlb_real_miss_exits) }, 47 { "itlb_r", VCPU_STAT(itlb_real_miss_exits) },
43 { "itlb_v", VCPU_STAT(itlb_virt_miss_exits) }, 48 { "itlb_v", VCPU_STAT(itlb_virt_miss_exits) },
44 { "dtlb_r", VCPU_STAT(dtlb_real_miss_exits) }, 49 { "dtlb_r", VCPU_STAT(dtlb_real_miss_exits) },
@@ -53,103 +58,19 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
53 { NULL } 58 { NULL }
54}; 59};
55 60
56static const u32 interrupt_msr_mask[16] = {
57 [BOOKE_INTERRUPT_CRITICAL] = MSR_ME,
58 [BOOKE_INTERRUPT_MACHINE_CHECK] = 0,
59 [BOOKE_INTERRUPT_DATA_STORAGE] = MSR_CE|MSR_ME|MSR_DE,
60 [BOOKE_INTERRUPT_INST_STORAGE] = MSR_CE|MSR_ME|MSR_DE,
61 [BOOKE_INTERRUPT_EXTERNAL] = MSR_CE|MSR_ME|MSR_DE,
62 [BOOKE_INTERRUPT_ALIGNMENT] = MSR_CE|MSR_ME|MSR_DE,
63 [BOOKE_INTERRUPT_PROGRAM] = MSR_CE|MSR_ME|MSR_DE,
64 [BOOKE_INTERRUPT_FP_UNAVAIL] = MSR_CE|MSR_ME|MSR_DE,
65 [BOOKE_INTERRUPT_SYSCALL] = MSR_CE|MSR_ME|MSR_DE,
66 [BOOKE_INTERRUPT_AP_UNAVAIL] = MSR_CE|MSR_ME|MSR_DE,
67 [BOOKE_INTERRUPT_DECREMENTER] = MSR_CE|MSR_ME|MSR_DE,
68 [BOOKE_INTERRUPT_FIT] = MSR_CE|MSR_ME|MSR_DE,
69 [BOOKE_INTERRUPT_WATCHDOG] = MSR_ME,
70 [BOOKE_INTERRUPT_DTLB_MISS] = MSR_CE|MSR_ME|MSR_DE,
71 [BOOKE_INTERRUPT_ITLB_MISS] = MSR_CE|MSR_ME|MSR_DE,
72 [BOOKE_INTERRUPT_DEBUG] = MSR_ME,
73};
74
75const unsigned char exception_priority[] = {
76 [BOOKE_INTERRUPT_DATA_STORAGE] = 0,
77 [BOOKE_INTERRUPT_INST_STORAGE] = 1,
78 [BOOKE_INTERRUPT_ALIGNMENT] = 2,
79 [BOOKE_INTERRUPT_PROGRAM] = 3,
80 [BOOKE_INTERRUPT_FP_UNAVAIL] = 4,
81 [BOOKE_INTERRUPT_SYSCALL] = 5,
82 [BOOKE_INTERRUPT_AP_UNAVAIL] = 6,
83 [BOOKE_INTERRUPT_DTLB_MISS] = 7,
84 [BOOKE_INTERRUPT_ITLB_MISS] = 8,
85 [BOOKE_INTERRUPT_MACHINE_CHECK] = 9,
86 [BOOKE_INTERRUPT_DEBUG] = 10,
87 [BOOKE_INTERRUPT_CRITICAL] = 11,
88 [BOOKE_INTERRUPT_WATCHDOG] = 12,
89 [BOOKE_INTERRUPT_EXTERNAL] = 13,
90 [BOOKE_INTERRUPT_FIT] = 14,
91 [BOOKE_INTERRUPT_DECREMENTER] = 15,
92};
93
94const unsigned char priority_exception[] = {
95 BOOKE_INTERRUPT_DATA_STORAGE,
96 BOOKE_INTERRUPT_INST_STORAGE,
97 BOOKE_INTERRUPT_ALIGNMENT,
98 BOOKE_INTERRUPT_PROGRAM,
99 BOOKE_INTERRUPT_FP_UNAVAIL,
100 BOOKE_INTERRUPT_SYSCALL,
101 BOOKE_INTERRUPT_AP_UNAVAIL,
102 BOOKE_INTERRUPT_DTLB_MISS,
103 BOOKE_INTERRUPT_ITLB_MISS,
104 BOOKE_INTERRUPT_MACHINE_CHECK,
105 BOOKE_INTERRUPT_DEBUG,
106 BOOKE_INTERRUPT_CRITICAL,
107 BOOKE_INTERRUPT_WATCHDOG,
108 BOOKE_INTERRUPT_EXTERNAL,
109 BOOKE_INTERRUPT_FIT,
110 BOOKE_INTERRUPT_DECREMENTER,
111};
112
113
114void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
115{
116 struct tlbe *tlbe;
117 int i;
118
119 printk("vcpu %d TLB dump:\n", vcpu->vcpu_id);
120 printk("| %2s | %3s | %8s | %8s | %8s |\n",
121 "nr", "tid", "word0", "word1", "word2");
122
123 for (i = 0; i < PPC44x_TLB_SIZE; i++) {
124 tlbe = &vcpu->arch.guest_tlb[i];
125 if (tlbe->word0 & PPC44x_TLB_VALID)
126 printk(" G%2d | %02X | %08X | %08X | %08X |\n",
127 i, tlbe->tid, tlbe->word0, tlbe->word1,
128 tlbe->word2);
129 }
130
131 for (i = 0; i < PPC44x_TLB_SIZE; i++) {
132 tlbe = &vcpu->arch.shadow_tlb[i];
133 if (tlbe->word0 & PPC44x_TLB_VALID)
134 printk(" S%2d | %02X | %08X | %08X | %08X |\n",
135 i, tlbe->tid, tlbe->word0, tlbe->word1,
136 tlbe->word2);
137 }
138}
139
140/* TODO: use vcpu_printf() */ 61/* TODO: use vcpu_printf() */
141void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) 62void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
142{ 63{
143 int i; 64 int i;
144 65
145 printk("pc: %08x msr: %08x\n", vcpu->arch.pc, vcpu->arch.msr); 66 printk("pc: %08lx msr: %08lx\n", vcpu->arch.pc, vcpu->arch.msr);
146 printk("lr: %08x ctr: %08x\n", vcpu->arch.lr, vcpu->arch.ctr); 67 printk("lr: %08lx ctr: %08lx\n", vcpu->arch.lr, vcpu->arch.ctr);
147 printk("srr0: %08x srr1: %08x\n", vcpu->arch.srr0, vcpu->arch.srr1); 68 printk("srr0: %08lx srr1: %08lx\n", vcpu->arch.srr0, vcpu->arch.srr1);
148 69
149 printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions); 70 printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
150 71
151 for (i = 0; i < 32; i += 4) { 72 for (i = 0; i < 32; i += 4) {
152 printk("gpr%02d: %08x %08x %08x %08x\n", i, 73 printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i,
153 vcpu->arch.gpr[i], 74 vcpu->arch.gpr[i],
154 vcpu->arch.gpr[i+1], 75 vcpu->arch.gpr[i+1],
155 vcpu->arch.gpr[i+2], 76 vcpu->arch.gpr[i+2],
@@ -157,69 +78,96 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
157 } 78 }
158} 79}
159 80
160/* Check if we are ready to deliver the interrupt */ 81static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
161static int kvmppc_can_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt) 82 unsigned int priority)
162{ 83{
163 int r; 84 set_bit(priority, &vcpu->arch.pending_exceptions);
85}
164 86
165 switch (interrupt) { 87void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
166 case BOOKE_INTERRUPT_CRITICAL: 88{
167 r = vcpu->arch.msr & MSR_CE; 89 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
168 break; 90}
169 case BOOKE_INTERRUPT_MACHINE_CHECK: 91
170 r = vcpu->arch.msr & MSR_ME; 92void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
171 break; 93{
172 case BOOKE_INTERRUPT_EXTERNAL: 94 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);
173 r = vcpu->arch.msr & MSR_EE; 95}
96
97int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
98{
99 return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
100}
101
102void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
103 struct kvm_interrupt *irq)
104{
105 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_EXTERNAL);
106}
107
108/* Deliver the interrupt of the corresponding priority, if possible. */
109static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
110 unsigned int priority)
111{
112 int allowed = 0;
113 ulong msr_mask;
114
115 switch (priority) {
116 case BOOKE_IRQPRIO_PROGRAM:
117 case BOOKE_IRQPRIO_DTLB_MISS:
118 case BOOKE_IRQPRIO_ITLB_MISS:
119 case BOOKE_IRQPRIO_SYSCALL:
120 case BOOKE_IRQPRIO_DATA_STORAGE:
121 case BOOKE_IRQPRIO_INST_STORAGE:
122 case BOOKE_IRQPRIO_FP_UNAVAIL:
123 case BOOKE_IRQPRIO_AP_UNAVAIL:
124 case BOOKE_IRQPRIO_ALIGNMENT:
125 allowed = 1;
126 msr_mask = MSR_CE|MSR_ME|MSR_DE;
174 break; 127 break;
175 case BOOKE_INTERRUPT_DECREMENTER: 128 case BOOKE_IRQPRIO_CRITICAL:
176 r = vcpu->arch.msr & MSR_EE; 129 case BOOKE_IRQPRIO_WATCHDOG:
130 allowed = vcpu->arch.msr & MSR_CE;
131 msr_mask = MSR_ME;
177 break; 132 break;
178 case BOOKE_INTERRUPT_FIT: 133 case BOOKE_IRQPRIO_MACHINE_CHECK:
179 r = vcpu->arch.msr & MSR_EE; 134 allowed = vcpu->arch.msr & MSR_ME;
135 msr_mask = 0;
180 break; 136 break;
181 case BOOKE_INTERRUPT_WATCHDOG: 137 case BOOKE_IRQPRIO_EXTERNAL:
182 r = vcpu->arch.msr & MSR_CE; 138 case BOOKE_IRQPRIO_DECREMENTER:
139 case BOOKE_IRQPRIO_FIT:
140 allowed = vcpu->arch.msr & MSR_EE;
141 msr_mask = MSR_CE|MSR_ME|MSR_DE;
183 break; 142 break;
184 case BOOKE_INTERRUPT_DEBUG: 143 case BOOKE_IRQPRIO_DEBUG:
185 r = vcpu->arch.msr & MSR_DE; 144 allowed = vcpu->arch.msr & MSR_DE;
145 msr_mask = MSR_ME;
186 break; 146 break;
187 default:
188 r = 1;
189 } 147 }
190 148
191 return r; 149 if (allowed) {
192} 150 vcpu->arch.srr0 = vcpu->arch.pc;
151 vcpu->arch.srr1 = vcpu->arch.msr;
152 vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
153 kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask);
193 154
194static void kvmppc_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt) 155 clear_bit(priority, &vcpu->arch.pending_exceptions);
195{
196 switch (interrupt) {
197 case BOOKE_INTERRUPT_DECREMENTER:
198 vcpu->arch.tsr |= TSR_DIS;
199 break;
200 } 156 }
201 157
202 vcpu->arch.srr0 = vcpu->arch.pc; 158 return allowed;
203 vcpu->arch.srr1 = vcpu->arch.msr;
204 vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[interrupt];
205 kvmppc_set_msr(vcpu, vcpu->arch.msr & interrupt_msr_mask[interrupt]);
206} 159}
207 160
208/* Check pending exceptions and deliver one, if possible. */ 161/* Check pending exceptions and deliver one, if possible. */
209void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu) 162void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
210{ 163{
211 unsigned long *pending = &vcpu->arch.pending_exceptions; 164 unsigned long *pending = &vcpu->arch.pending_exceptions;
212 unsigned int exception;
213 unsigned int priority; 165 unsigned int priority;
214 166
215 priority = find_first_bit(pending, BITS_PER_BYTE * sizeof(*pending)); 167 priority = __ffs(*pending);
216 while (priority <= BOOKE_MAX_INTERRUPT) { 168 while (priority <= BOOKE_MAX_INTERRUPT) {
217 exception = priority_exception[priority]; 169 if (kvmppc_booke_irqprio_deliver(vcpu, priority))
218 if (kvmppc_can_deliver_interrupt(vcpu, exception)) {
219 kvmppc_clear_exception(vcpu, exception);
220 kvmppc_deliver_interrupt(vcpu, exception);
221 break; 170 break;
222 }
223 171
224 priority = find_next_bit(pending, 172 priority = find_next_bit(pending,
225 BITS_PER_BYTE * sizeof(*pending), 173 BITS_PER_BYTE * sizeof(*pending),
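
This hunk folds the old interrupt_msr_mask[], exception_priority[] and priority_exception[] tables into one pending-exceptions bitmap indexed directly by the BOOKE_IRQPRIO_* values (defined in the new booke.h further down): delivery is now "find the lowest set bit, check whether the MSR gate for that class (EE/CE/ME/DE) allows it, deliver at most one, clear its bit". A toy model of that loop, with made-up gating, just to show the shape:

#include <stdio.h>

#define NR_PRIORITIES	16

static unsigned long pending;
static int msr_ee;			/* pretend MSR[EE] is currently clear */

/* Toy gate: priorities 13..15 model EXTERNAL/FIT/DECREMENTER, gated on EE;
 * everything else is treated as always deliverable. */
static int deliver(unsigned int prio)
{
	if (prio >= 13 && !msr_ee)
		return 0;
	printf("delivered priority %u\n", prio);
	pending &= ~(1ul << prio);
	return 1;
}

int main(void)
{
	unsigned int prio;

	pending = (1ul << 15) | (1ul << 3);	/* decrementer + program check */

	for (prio = 0; prio < NR_PRIORITIES; prio++) {
		if (!(pending & (1ul << prio)))
			continue;
		if (deliver(prio))
			break;			/* one delivery per pass, as above */
	}

	/* prints 0x8000: the decrementer stays queued for a later pass */
	printf("still pending: 0x%lx\n", pending);
	return 0;
}
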
@@ -238,6 +186,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
238 enum emulation_result er; 186 enum emulation_result er;
239 int r = RESUME_HOST; 187 int r = RESUME_HOST;
240 188
189 /* update before a new last_exit_type is rewritten */
190 kvmppc_update_timing_stats(vcpu);
191
241 local_irq_enable(); 192 local_irq_enable();
242 193
243 run->exit_reason = KVM_EXIT_UNKNOWN; 194 run->exit_reason = KVM_EXIT_UNKNOWN;
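
kvmppc_update_timing_stats() here, like the kvmppc_account_exit()/kvmppc_account_exit_stat() calls added throughout the switch below, comes from the new timing.h. The names suggest the usual arrangement where the detailed bookkeeping compiles away unless CONFIG_KVM_EXIT_TIMING is set and only the plain kvm_stats counters remain; a sketch of that shape (an assumption about timing.h, not its actual contents):

/* Sketch only: the conventional "stub it out when the option is off" idiom.
 * The real timing.h added by this patch may differ in names and detail. */
struct kvm_vcpu;

#ifdef CONFIG_KVM_EXIT_TIMING
void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu);
void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type);
#else
static inline void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) { }
static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) { }
#endif
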
@@ -251,21 +202,19 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
251 break; 202 break;
252 203
253 case BOOKE_INTERRUPT_EXTERNAL: 204 case BOOKE_INTERRUPT_EXTERNAL:
205 kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
206 if (need_resched())
207 cond_resched();
208 r = RESUME_GUEST;
209 break;
210
254 case BOOKE_INTERRUPT_DECREMENTER: 211 case BOOKE_INTERRUPT_DECREMENTER:
255 /* Since we switched IVPR back to the host's value, the host 212 /* Since we switched IVPR back to the host's value, the host
256 * handled this interrupt the moment we enabled interrupts. 213 * handled this interrupt the moment we enabled interrupts.
257 * Now we just offer it a chance to reschedule the guest. */ 214 * Now we just offer it a chance to reschedule the guest. */
258 215 kvmppc_account_exit(vcpu, DEC_EXITS);
259 /* XXX At this point the TLB still holds our shadow TLB, so if
260 * we do reschedule the host will fault over it. Perhaps we
261 * should politely restore the host's entries to minimize
262 * misses before ceding control. */
263 if (need_resched()) 216 if (need_resched())
264 cond_resched(); 217 cond_resched();
265 if (exit_nr == BOOKE_INTERRUPT_DECREMENTER)
266 vcpu->stat.dec_exits++;
267 else
268 vcpu->stat.ext_intr_exits++;
269 r = RESUME_GUEST; 218 r = RESUME_GUEST;
270 break; 219 break;
271 220
@@ -274,17 +223,19 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
274 /* Program traps generated by user-level software must be handled 223 /* Program traps generated by user-level software must be handled
275 * by the guest kernel. */ 224 * by the guest kernel. */
276 vcpu->arch.esr = vcpu->arch.fault_esr; 225 vcpu->arch.esr = vcpu->arch.fault_esr;
277 kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM); 226 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
278 r = RESUME_GUEST; 227 r = RESUME_GUEST;
228 kvmppc_account_exit(vcpu, USR_PR_INST);
279 break; 229 break;
280 } 230 }
281 231
282 er = kvmppc_emulate_instruction(run, vcpu); 232 er = kvmppc_emulate_instruction(run, vcpu);
283 switch (er) { 233 switch (er) {
284 case EMULATE_DONE: 234 case EMULATE_DONE:
235 /* don't overwrite subtypes, just account kvm_stats */
236 kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
285 /* Future optimization: only reload non-volatiles if 237 /* Future optimization: only reload non-volatiles if
286 * they were actually modified by emulation. */ 238 * they were actually modified by emulation. */
287 vcpu->stat.emulated_inst_exits++;
288 r = RESUME_GUEST_NV; 239 r = RESUME_GUEST_NV;
289 break; 240 break;
290 case EMULATE_DO_DCR: 241 case EMULATE_DO_DCR:
@@ -293,7 +244,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
293 break; 244 break;
294 case EMULATE_FAIL: 245 case EMULATE_FAIL:
295 /* XXX Deliver Program interrupt to guest. */ 246 /* XXX Deliver Program interrupt to guest. */
296 printk(KERN_CRIT "%s: emulation at %x failed (%08x)\n", 247 printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
297 __func__, vcpu->arch.pc, vcpu->arch.last_inst); 248 __func__, vcpu->arch.pc, vcpu->arch.last_inst);
298 /* For debugging, encode the failing instruction and 249 /* For debugging, encode the failing instruction and
299 * report it to userspace. */ 250 * report it to userspace. */
@@ -307,48 +258,53 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
307 break; 258 break;
308 259
309 case BOOKE_INTERRUPT_FP_UNAVAIL: 260 case BOOKE_INTERRUPT_FP_UNAVAIL:
310 kvmppc_queue_exception(vcpu, exit_nr); 261 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
262 kvmppc_account_exit(vcpu, FP_UNAVAIL);
311 r = RESUME_GUEST; 263 r = RESUME_GUEST;
312 break; 264 break;
313 265
314 case BOOKE_INTERRUPT_DATA_STORAGE: 266 case BOOKE_INTERRUPT_DATA_STORAGE:
315 vcpu->arch.dear = vcpu->arch.fault_dear; 267 vcpu->arch.dear = vcpu->arch.fault_dear;
316 vcpu->arch.esr = vcpu->arch.fault_esr; 268 vcpu->arch.esr = vcpu->arch.fault_esr;
317 kvmppc_queue_exception(vcpu, exit_nr); 269 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
318 vcpu->stat.dsi_exits++; 270 kvmppc_account_exit(vcpu, DSI_EXITS);
319 r = RESUME_GUEST; 271 r = RESUME_GUEST;
320 break; 272 break;
321 273
322 case BOOKE_INTERRUPT_INST_STORAGE: 274 case BOOKE_INTERRUPT_INST_STORAGE:
323 vcpu->arch.esr = vcpu->arch.fault_esr; 275 vcpu->arch.esr = vcpu->arch.fault_esr;
324 kvmppc_queue_exception(vcpu, exit_nr); 276 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
325 vcpu->stat.isi_exits++; 277 kvmppc_account_exit(vcpu, ISI_EXITS);
326 r = RESUME_GUEST; 278 r = RESUME_GUEST;
327 break; 279 break;
328 280
329 case BOOKE_INTERRUPT_SYSCALL: 281 case BOOKE_INTERRUPT_SYSCALL:
330 kvmppc_queue_exception(vcpu, exit_nr); 282 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
331 vcpu->stat.syscall_exits++; 283 kvmppc_account_exit(vcpu, SYSCALL_EXITS);
332 r = RESUME_GUEST; 284 r = RESUME_GUEST;
333 break; 285 break;
334 286
287 /* XXX move to a 440-specific file. */
335 case BOOKE_INTERRUPT_DTLB_MISS: { 288 case BOOKE_INTERRUPT_DTLB_MISS: {
336 struct tlbe *gtlbe; 289 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
290 struct kvmppc_44x_tlbe *gtlbe;
337 unsigned long eaddr = vcpu->arch.fault_dear; 291 unsigned long eaddr = vcpu->arch.fault_dear;
292 int gtlb_index;
338 gfn_t gfn; 293 gfn_t gfn;
339 294
340 /* Check the guest TLB. */ 295 /* Check the guest TLB. */
341 gtlbe = kvmppc_44x_dtlb_search(vcpu, eaddr); 296 gtlb_index = kvmppc_44x_dtlb_index(vcpu, eaddr);
342 if (!gtlbe) { 297 if (gtlb_index < 0) {
343 /* The guest didn't have a mapping for it. */ 298 /* The guest didn't have a mapping for it. */
344 kvmppc_queue_exception(vcpu, exit_nr); 299 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
345 vcpu->arch.dear = vcpu->arch.fault_dear; 300 vcpu->arch.dear = vcpu->arch.fault_dear;
346 vcpu->arch.esr = vcpu->arch.fault_esr; 301 vcpu->arch.esr = vcpu->arch.fault_esr;
347 vcpu->stat.dtlb_real_miss_exits++; 302 kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
348 r = RESUME_GUEST; 303 r = RESUME_GUEST;
349 break; 304 break;
350 } 305 }
351 306
307 gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
352 vcpu->arch.paddr_accessed = tlb_xlate(gtlbe, eaddr); 308 vcpu->arch.paddr_accessed = tlb_xlate(gtlbe, eaddr);
353 gfn = vcpu->arch.paddr_accessed >> PAGE_SHIFT; 309 gfn = vcpu->arch.paddr_accessed >> PAGE_SHIFT;
354 310
@@ -359,38 +315,45 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
359 * b) the guest used a large mapping which we're faking 315 * b) the guest used a large mapping which we're faking
360 * Either way, we need to satisfy the fault without 316 * Either way, we need to satisfy the fault without
361 * invoking the guest. */ 317 * invoking the guest. */
362 kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe->tid, 318 kvmppc_mmu_map(vcpu, eaddr, vcpu->arch.paddr_accessed, gtlbe->tid,
363 gtlbe->word2); 319 gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
364 vcpu->stat.dtlb_virt_miss_exits++; 320 kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
365 r = RESUME_GUEST; 321 r = RESUME_GUEST;
366 } else { 322 } else {
367 /* Guest has mapped and accessed a page which is not 323 /* Guest has mapped and accessed a page which is not
368 * actually RAM. */ 324 * actually RAM. */
369 r = kvmppc_emulate_mmio(run, vcpu); 325 r = kvmppc_emulate_mmio(run, vcpu);
326 kvmppc_account_exit(vcpu, MMIO_EXITS);
370 } 327 }
371 328
372 break; 329 break;
373 } 330 }
374 331
332 /* XXX move to a 440-specific file. */
375 case BOOKE_INTERRUPT_ITLB_MISS: { 333 case BOOKE_INTERRUPT_ITLB_MISS: {
376 struct tlbe *gtlbe; 334 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
335 struct kvmppc_44x_tlbe *gtlbe;
377 unsigned long eaddr = vcpu->arch.pc; 336 unsigned long eaddr = vcpu->arch.pc;
337 gpa_t gpaddr;
378 gfn_t gfn; 338 gfn_t gfn;
339 int gtlb_index;
379 340
380 r = RESUME_GUEST; 341 r = RESUME_GUEST;
381 342
382 /* Check the guest TLB. */ 343 /* Check the guest TLB. */
383 gtlbe = kvmppc_44x_itlb_search(vcpu, eaddr); 344 gtlb_index = kvmppc_44x_itlb_index(vcpu, eaddr);
384 if (!gtlbe) { 345 if (gtlb_index < 0) {
385 /* The guest didn't have a mapping for it. */ 346 /* The guest didn't have a mapping for it. */
386 kvmppc_queue_exception(vcpu, exit_nr); 347 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
387 vcpu->stat.itlb_real_miss_exits++; 348 kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS);
388 break; 349 break;
389 } 350 }
390 351
391 vcpu->stat.itlb_virt_miss_exits++; 352 kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS);
392 353
393 gfn = tlb_xlate(gtlbe, eaddr) >> PAGE_SHIFT; 354 gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
355 gpaddr = tlb_xlate(gtlbe, eaddr);
356 gfn = gpaddr >> PAGE_SHIFT;
394 357
395 if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { 358 if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
396 /* The guest TLB had a mapping, but the shadow TLB 359 /* The guest TLB had a mapping, but the shadow TLB
@@ -399,12 +362,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
399 * b) the guest used a large mapping which we're faking 362 * b) the guest used a large mapping which we're faking
400 * Either way, we need to satisfy the fault without 363 * Either way, we need to satisfy the fault without
401 * invoking the guest. */ 364 * invoking the guest. */
402 kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe->tid, 365 kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlbe->tid,
403 gtlbe->word2); 366 gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
404 } else { 367 } else {
405 /* Guest mapped and leaped at non-RAM! */ 368 /* Guest mapped and leaped at non-RAM! */
406 kvmppc_queue_exception(vcpu, 369 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);
407 BOOKE_INTERRUPT_MACHINE_CHECK);
408 } 370 }
409 371
410 break; 372 break;
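
Both miss handlers now follow the same shape: look the faulting address up in the guest TLB by index, reflect the miss back to the guest if nothing is there, otherwise translate to a guest physical address and either back it with a host mapping via kvmppc_mmu_map() (which now takes the gpa, the page size and the guest TLB index) or fall through to MMIO/machine-check handling when the target is not RAM. A plain-C restatement of the data-side flow, with stand-in helpers rather than the real kernel functions:

#include <stdio.h>

enum miss_result { REFLECT_TO_GUEST, MAPPED, DO_MMIO };

static int lookup_guest_tlb(unsigned long eaddr)	/* ~ kvmppc_44x_dtlb_index() */
{
	return eaddr < 0x10000000ul ? 0 : -1;	/* pretend only low EAs are mapped */
}

static unsigned long translate(int idx, unsigned long eaddr)	/* ~ tlb_xlate() */
{
	(void)idx;
	return eaddr;				/* pretend an identity mapping */
}

static int page_is_ram(unsigned long gfn)	/* ~ kvm_is_visible_gfn() */
{
	return gfn < 0x1000;			/* pretend 16 MiB of guest RAM */
}

static enum miss_result handle_dtlb_miss(unsigned long eaddr)
{
	int gtlb_index = lookup_guest_tlb(eaddr);
	unsigned long gpaddr, gfn;

	if (gtlb_index < 0)
		return REFLECT_TO_GUEST;	/* queue a guest DTLB-miss interrupt */

	gpaddr = translate(gtlb_index, eaddr);
	gfn = gpaddr >> 12;

	if (page_is_ram(gfn))
		return MAPPED;			/* ~ kvmppc_mmu_map(eaddr, gpaddr, ...) */

	return DO_MMIO;				/* guest touched a non-RAM address */
}

int main(void)
{
	/* prints: 1 2 0 (MAPPED, DO_MMIO, REFLECT_TO_GUEST) */
	printf("%d %d %d\n",
	       handle_dtlb_miss(0x00001000ul),
	       handle_dtlb_miss(0x08000000ul),
	       handle_dtlb_miss(0xef600000ul));
	return 0;
}
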
@@ -421,6 +383,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
421 mtspr(SPRN_DBSR, dbsr); 383 mtspr(SPRN_DBSR, dbsr);
422 384
423 run->exit_reason = KVM_EXIT_DEBUG; 385 run->exit_reason = KVM_EXIT_DEBUG;
386 kvmppc_account_exit(vcpu, DEBUG_EXITS);
424 r = RESUME_HOST; 387 r = RESUME_HOST;
425 break; 388 break;
426 } 389 }
@@ -432,10 +395,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
432 395
433 local_irq_disable(); 396 local_irq_disable();
434 397
435 kvmppc_check_and_deliver_interrupts(vcpu); 398 kvmppc_core_deliver_interrupts(vcpu);
436 399
437 /* Do some exit accounting. */
438 vcpu->stat.sum_exits++;
439 if (!(r & RESUME_HOST)) { 400 if (!(r & RESUME_HOST)) {
440 /* To avoid clobbering exit_reason, only check for signals if 401 /* To avoid clobbering exit_reason, only check for signals if
441 * we aren't already exiting to userspace for some other 402 * we aren't already exiting to userspace for some other
@@ -443,22 +404,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
443 if (signal_pending(current)) { 404 if (signal_pending(current)) {
444 run->exit_reason = KVM_EXIT_INTR; 405 run->exit_reason = KVM_EXIT_INTR;
445 r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); 406 r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
446 407 kvmppc_account_exit(vcpu, SIGNAL_EXITS);
447 vcpu->stat.signal_exits++;
448 } else {
449 vcpu->stat.light_exits++;
450 }
451 } else {
452 switch (run->exit_reason) {
453 case KVM_EXIT_MMIO:
454 vcpu->stat.mmio_exits++;
455 break;
456 case KVM_EXIT_DCR:
457 vcpu->stat.dcr_exits++;
458 break;
459 case KVM_EXIT_INTR:
460 vcpu->stat.signal_exits++;
461 break;
462 } 408 }
463 } 409 }
464 410
@@ -468,20 +414,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
468/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ 414/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
469int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 415int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
470{ 416{
471 struct tlbe *tlbe = &vcpu->arch.guest_tlb[0];
472
473 tlbe->tid = 0;
474 tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
475 tlbe->word1 = 0;
476 tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR;
477
478 tlbe++;
479 tlbe->tid = 0;
480 tlbe->word0 = 0xef600000 | PPC44x_TLB_4K | PPC44x_TLB_VALID;
481 tlbe->word1 = 0xef600000;
482 tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR
483 | PPC44x_TLB_I | PPC44x_TLB_G;
484
485 vcpu->arch.pc = 0; 417 vcpu->arch.pc = 0;
486 vcpu->arch.msr = 0; 418 vcpu->arch.msr = 0;
487 vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */ 419 vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
@@ -492,12 +424,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
492 * before it's programmed its own IVPR. */ 424 * before it's programmed its own IVPR. */
493 vcpu->arch.ivpr = 0x55550000; 425 vcpu->arch.ivpr = 0x55550000;
494 426
495 /* Since the guest can directly access the timebase, it must know the 427 kvmppc_init_timing_stats(vcpu);
496 * real timebase frequency. Accordingly, it must see the state of
497 * CCR1[TCS]. */
498 vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
499 428
500 return 0; 429 return kvmppc_core_vcpu_setup(vcpu);
501} 430}
502 431
503int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 432int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
@@ -536,7 +465,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
536 vcpu->arch.ctr = regs->ctr; 465 vcpu->arch.ctr = regs->ctr;
537 vcpu->arch.lr = regs->lr; 466 vcpu->arch.lr = regs->lr;
538 vcpu->arch.xer = regs->xer; 467 vcpu->arch.xer = regs->xer;
539 vcpu->arch.msr = regs->msr; 468 kvmppc_set_msr(vcpu, regs->msr);
540 vcpu->arch.srr0 = regs->srr0; 469 vcpu->arch.srr0 = regs->srr0;
541 vcpu->arch.srr1 = regs->srr1; 470 vcpu->arch.srr1 = regs->srr1;
542 vcpu->arch.sprg0 = regs->sprg0; 471 vcpu->arch.sprg0 = regs->sprg0;
@@ -575,31 +504,62 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
575 return -ENOTSUPP; 504 return -ENOTSUPP;
576} 505}
577 506
578/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
579int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 507int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
580 struct kvm_translation *tr) 508 struct kvm_translation *tr)
581{ 509{
582 struct tlbe *gtlbe; 510 return kvmppc_core_vcpu_translate(vcpu, tr);
583 int index; 511}
584 gva_t eaddr;
585 u8 pid;
586 u8 as;
587
588 eaddr = tr->linear_address;
589 pid = (tr->linear_address >> 32) & 0xff;
590 as = (tr->linear_address >> 40) & 0x1;
591
592 index = kvmppc_44x_tlb_index(vcpu, eaddr, pid, as);
593 if (index == -1) {
594 tr->valid = 0;
595 return 0;
596 }
597 512
598 gtlbe = &vcpu->arch.guest_tlb[index]; 513int kvmppc_booke_init(void)
514{
515 unsigned long ivor[16];
516 unsigned long max_ivor = 0;
517 int i;
599 518
600 tr->physical_address = tlb_xlate(gtlbe, eaddr); 519 /* We install our own exception handlers by hijacking IVPR. IVPR must
601 /* XXX what does "writeable" and "usermode" even mean? */ 520 * be 16-bit aligned, so we need a 64KB allocation. */
602 tr->valid = 1; 521 kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
522 VCPU_SIZE_ORDER);
523 if (!kvmppc_booke_handlers)
524 return -ENOMEM;
525
526 /* XXX make sure our handlers are smaller than Linux's */
527
528 /* Copy our interrupt handlers to match host IVORs. That way we don't
529 * have to swap the IVORs on every guest/host transition. */
530 ivor[0] = mfspr(SPRN_IVOR0);
531 ivor[1] = mfspr(SPRN_IVOR1);
532 ivor[2] = mfspr(SPRN_IVOR2);
533 ivor[3] = mfspr(SPRN_IVOR3);
534 ivor[4] = mfspr(SPRN_IVOR4);
535 ivor[5] = mfspr(SPRN_IVOR5);
536 ivor[6] = mfspr(SPRN_IVOR6);
537 ivor[7] = mfspr(SPRN_IVOR7);
538 ivor[8] = mfspr(SPRN_IVOR8);
539 ivor[9] = mfspr(SPRN_IVOR9);
540 ivor[10] = mfspr(SPRN_IVOR10);
541 ivor[11] = mfspr(SPRN_IVOR11);
542 ivor[12] = mfspr(SPRN_IVOR12);
543 ivor[13] = mfspr(SPRN_IVOR13);
544 ivor[14] = mfspr(SPRN_IVOR14);
545 ivor[15] = mfspr(SPRN_IVOR15);
546
547 for (i = 0; i < 16; i++) {
548 if (ivor[i] > max_ivor)
549 max_ivor = ivor[i];
550
551 memcpy((void *)kvmppc_booke_handlers + ivor[i],
552 kvmppc_handlers_start + i * kvmppc_handler_len,
553 kvmppc_handler_len);
554 }
555 flush_icache_range(kvmppc_booke_handlers,
556 kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
603 557
604 return 0; 558 return 0;
605} 559}
560
561void __exit kvmppc_booke_exit(void)
562{
563 free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
564 kvm_exit();
565}
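
kvmppc_booke_init() is absorbed here from the deleted booke_host.c below, minus the kvm_init() call, which presumably moves into the new per-core glue in 44x.c. The scheme itself is unchanged: grab a 64 KiB region for IVPR to point at and copy a small handler stub to the same offset as each host IVORn, so only IVPR has to change on guest entry and the sixteen IVORs never need to be swapped. A toy illustration of that placement (all sizes and offsets made up):

#include <stdio.h>

int main(void)
{
	unsigned long base = 0x10000;		/* pretend 64 KiB-aligned block */
	unsigned long ivor[4] = { 0x100, 0x180, 0x300, 0xf20 };	/* made-up host IVORs */
	unsigned long stub_len = 0x40;		/* made-up per-vector stub size */
	unsigned long max = 0;
	int i;

	for (i = 0; i < 4; i++) {
		printf("vector %d stub copied to 0x%lx\n", i, base + ivor[i]);
		if (ivor[i] > max)
			max = ivor[i];
	}
	/* the real code then flush_icache_range()s base .. base + max + stub_len */
	printf("icache flush covers 0x%lx .. 0x%lx\n", base, base + max + stub_len);
	return 0;
}
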
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
new file mode 100644
index 000000000000..cf7c94ca24bf
--- /dev/null
+++ b/arch/powerpc/kvm/booke.h
@@ -0,0 +1,60 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright IBM Corp. 2008
16 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */
19
20#ifndef __KVM_BOOKE_H__
21#define __KVM_BOOKE_H__
22
23#include <linux/types.h>
24#include <linux/kvm_host.h>
25#include "timing.h"
26
27/* interrupt priority ordering */
28#define BOOKE_IRQPRIO_DATA_STORAGE 0
29#define BOOKE_IRQPRIO_INST_STORAGE 1
30#define BOOKE_IRQPRIO_ALIGNMENT 2
31#define BOOKE_IRQPRIO_PROGRAM 3
32#define BOOKE_IRQPRIO_FP_UNAVAIL 4
33#define BOOKE_IRQPRIO_SYSCALL 5
34#define BOOKE_IRQPRIO_AP_UNAVAIL 6
35#define BOOKE_IRQPRIO_DTLB_MISS 7
36#define BOOKE_IRQPRIO_ITLB_MISS 8
37#define BOOKE_IRQPRIO_MACHINE_CHECK 9
38#define BOOKE_IRQPRIO_DEBUG 10
39#define BOOKE_IRQPRIO_CRITICAL 11
40#define BOOKE_IRQPRIO_WATCHDOG 12
41#define BOOKE_IRQPRIO_EXTERNAL 13
42#define BOOKE_IRQPRIO_FIT 14
43#define BOOKE_IRQPRIO_DECREMENTER 15
44
45/* Helper function for "full" MSR writes. No need to call this if only EE is
46 * changing. */
47static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
48{
49 if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
50 kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
51
52 vcpu->arch.msr = new_msr;
53
54 if (vcpu->arch.msr & MSR_WE) {
55 kvm_vcpu_block(vcpu);
56 kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
57 };
58}
59
60#endif /* __KVM_BOOKE_H__ */
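
Two details in kvmppc_set_msr() are worth calling out: a flip of MSR[PR] goes through kvmppc_mmu_priv_switch(), presumably so the shadow mappings track the new privilege level, and setting MSR[WE] (the Book E "wait" idle state) parks the vcpu with kvm_vcpu_block() instead of letting it spin. A toy model of those two checks (MSR bit positions shown for illustration only):

#include <stdio.h>

#define MSR_PR	(1u << 14)	/* problem state (user mode) */
#define MSR_WE	(1u << 18)	/* wait enable */

static unsigned int cur_msr;

static void set_msr(unsigned int new_msr)
{
	if ((new_msr ^ cur_msr) & MSR_PR)
		printf("privilege change -> switch shadow mappings\n");

	cur_msr = new_msr;

	if (cur_msr & MSR_WE)
		printf("guest idled -> block the vcpu until something is pending\n");
}

int main(void)
{
	set_msr(MSR_PR);		/* guest kernel returns to user space */
	set_msr(MSR_PR | MSR_WE);	/* guest idles */
	return 0;
}
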
diff --git a/arch/powerpc/kvm/booke_host.c b/arch/powerpc/kvm/booke_host.c
deleted file mode 100644
index b480341bc31e..000000000000
--- a/arch/powerpc/kvm/booke_host.c
+++ /dev/null
@@ -1,83 +0,0 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright IBM Corp. 2008
16 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 */
19
20#include <linux/errno.h>
21#include <linux/kvm_host.h>
22#include <linux/module.h>
23#include <asm/cacheflush.h>
24#include <asm/kvm_ppc.h>
25
26unsigned long kvmppc_booke_handlers;
27
28static int kvmppc_booke_init(void)
29{
30 unsigned long ivor[16];
31 unsigned long max_ivor = 0;
32 int i;
33
34 /* We install our own exception handlers by hijacking IVPR. IVPR must
35 * be 16-bit aligned, so we need a 64KB allocation. */
36 kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
37 VCPU_SIZE_ORDER);
38 if (!kvmppc_booke_handlers)
39 return -ENOMEM;
40
41 /* XXX make sure our handlers are smaller than Linux's */
42
43 /* Copy our interrupt handlers to match host IVORs. That way we don't
44 * have to swap the IVORs on every guest/host transition. */
45 ivor[0] = mfspr(SPRN_IVOR0);
46 ivor[1] = mfspr(SPRN_IVOR1);
47 ivor[2] = mfspr(SPRN_IVOR2);
48 ivor[3] = mfspr(SPRN_IVOR3);
49 ivor[4] = mfspr(SPRN_IVOR4);
50 ivor[5] = mfspr(SPRN_IVOR5);
51 ivor[6] = mfspr(SPRN_IVOR6);
52 ivor[7] = mfspr(SPRN_IVOR7);
53 ivor[8] = mfspr(SPRN_IVOR8);
54 ivor[9] = mfspr(SPRN_IVOR9);
55 ivor[10] = mfspr(SPRN_IVOR10);
56 ivor[11] = mfspr(SPRN_IVOR11);
57 ivor[12] = mfspr(SPRN_IVOR12);
58 ivor[13] = mfspr(SPRN_IVOR13);
59 ivor[14] = mfspr(SPRN_IVOR14);
60 ivor[15] = mfspr(SPRN_IVOR15);
61
62 for (i = 0; i < 16; i++) {
63 if (ivor[i] > max_ivor)
64 max_ivor = ivor[i];
65
66 memcpy((void *)kvmppc_booke_handlers + ivor[i],
67 kvmppc_handlers_start + i * kvmppc_handler_len,
68 kvmppc_handler_len);
69 }
70 flush_icache_range(kvmppc_booke_handlers,
71 kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
72
73 return kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
74}
75
76static void __exit kvmppc_booke_exit(void)
77{
78 free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
79 kvm_exit();
80}
81
82module_init(kvmppc_booke_init)
83module_exit(kvmppc_booke_exit)
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 95e165baf85f..084ebcd7dd83 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -107,6 +107,18 @@ _GLOBAL(kvmppc_resume_host)
107 li r6, 1 107 li r6, 1
108 slw r6, r6, r5 108 slw r6, r6, r5
109 109
110#ifdef CONFIG_KVM_EXIT_TIMING
111 /* save exit time */
1121:
113 mfspr r7, SPRN_TBRU
114 mfspr r8, SPRN_TBRL
115 mfspr r9, SPRN_TBRU
116 cmpw r9, r7
117 bne 1b
118 stw r8, VCPU_TIMING_EXIT_TBL(r4)
119 stw r9, VCPU_TIMING_EXIT_TBU(r4)
120#endif
121
110 /* Save the faulting instruction and all GPRs for emulation. */ 122 /* Save the faulting instruction and all GPRs for emulation. */
111 andi. r7, r6, NEED_INST_MASK 123 andi. r7, r6, NEED_INST_MASK
112 beq ..skip_inst_copy 124 beq ..skip_inst_copy
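
The CONFIG_KVM_EXIT_TIMING stanzas added to the exit path here (and to the entry path further down) read the 64-bit timebase with the classic TBU/TBL/TBU dance: if the upper half changed between the two reads, the lower half wrapped mid-sequence and the sample is retried. The same idiom in C, with the mfspr reads replaced by a fake counter so it can run anywhere:

#include <stdio.h>
#include <stdint.h>

/* Stand-ins for mfspr SPRN_TBRU / SPRN_TBRL: time "ticks" once per TBL read,
 * starting right at a low-word wrap so the retry path is exercised. */
static uint64_t fake_tb = 0x00000001ffffffffull;
static uint32_t read_tbu(void) { return fake_tb >> 32; }
static uint32_t read_tbl(void) { fake_tb++; return (uint32_t)fake_tb; }

static uint64_t read_timebase(void)
{
	uint32_t hi, lo, hi2;

	do {
		hi  = read_tbu();
		lo  = read_tbl();
		hi2 = read_tbu();
	} while (hi != hi2);	/* low word wrapped between reads: retry */

	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	/* prints: tb = 0x0000000200000001 (first sample was torn and retried) */
	printf("tb = 0x%016llx\n", (unsigned long long)read_timebase());
	return 0;
}
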
@@ -335,54 +347,6 @@ lightweight_exit:
335 lwz r3, VCPU_SHADOW_PID(r4) 347 lwz r3, VCPU_SHADOW_PID(r4)
336 mtspr SPRN_PID, r3 348 mtspr SPRN_PID, r3
337 349
338 /* Prevent all asynchronous TLB updates. */
339 mfmsr r5
340 lis r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
341 ori r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
342 andc r6, r5, r6
343 mtmsr r6
344
345 /* Load the guest mappings, leaving the host's "pinned" kernel mappings
346 * in place. */
347 mfspr r10, SPRN_MMUCR /* Save host MMUCR. */
348 li r5, PPC44x_TLB_SIZE
349 lis r5, tlb_44x_hwater@ha
350 lwz r5, tlb_44x_hwater@l(r5)
351 mtctr r5
352 addi r9, r4, VCPU_SHADOW_TLB
353 addi r5, r4, VCPU_SHADOW_MOD
354 li r3, 0
3551:
356 lbzx r7, r3, r5
357 cmpwi r7, 0
358 beq 3f
359
360 /* Load guest entry. */
361 mulli r11, r3, TLBE_BYTES
362 add r11, r11, r9
363 lwz r7, 0(r11)
364 mtspr SPRN_MMUCR, r7
365 lwz r7, 4(r11)
366 tlbwe r7, r3, PPC44x_TLB_PAGEID
367 lwz r7, 8(r11)
368 tlbwe r7, r3, PPC44x_TLB_XLAT
369 lwz r7, 12(r11)
370 tlbwe r7, r3, PPC44x_TLB_ATTRIB
3713:
372 addi r3, r3, 1 /* Increment index. */
373 bdnz 1b
374
375 mtspr SPRN_MMUCR, r10 /* Restore host MMUCR. */
376
377 /* Clear bitmap of modified TLB entries */
378 li r5, PPC44x_TLB_SIZE>>2
379 mtctr r5
380 addi r5, r4, VCPU_SHADOW_MOD - 4
381 li r6, 0
3821:
383 stwu r6, 4(r5)
384 bdnz 1b
385
386 iccci 0, 0 /* XXX hack */ 350 iccci 0, 0 /* XXX hack */
387 351
388 /* Load some guest volatiles. */ 352 /* Load some guest volatiles. */
@@ -423,6 +387,18 @@ lightweight_exit:
423 lwz r3, VCPU_SPRG7(r4) 387 lwz r3, VCPU_SPRG7(r4)
424 mtspr SPRN_SPRG7, r3 388 mtspr SPRN_SPRG7, r3
425 389
390#ifdef CONFIG_KVM_EXIT_TIMING
391 /* save enter time */
3921:
393 mfspr r6, SPRN_TBRU
394 mfspr r7, SPRN_TBRL
395 mfspr r8, SPRN_TBRU
396 cmpw r8, r6
397 bne 1b
398 stw r7, VCPU_TIMING_LAST_ENTER_TBL(r4)
399 stw r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
400#endif
401
426 /* Finish loading guest volatiles and jump to guest. */ 402 /* Finish loading guest volatiles and jump to guest. */
427 lwz r3, VCPU_CTR(r4) 403 lwz r3, VCPU_CTR(r4)
428 mtctr r3 404 mtctr r3
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 0fce4fbdc20d..d1d38daa93fb 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -23,161 +23,14 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
25 25
26#include <asm/dcr.h> 26#include <asm/reg.h>
27#include <asm/dcr-regs.h>
28#include <asm/time.h> 27#include <asm/time.h>
29#include <asm/byteorder.h> 28#include <asm/byteorder.h>
30#include <asm/kvm_ppc.h> 29#include <asm/kvm_ppc.h>
30#include <asm/disassemble.h>
31#include "timing.h"
31 32
32#include "44x_tlb.h" 33void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
33
34/* Instruction decoding */
35static inline unsigned int get_op(u32 inst)
36{
37 return inst >> 26;
38}
39
40static inline unsigned int get_xop(u32 inst)
41{
42 return (inst >> 1) & 0x3ff;
43}
44
45static inline unsigned int get_sprn(u32 inst)
46{
47 return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
48}
49
50static inline unsigned int get_dcrn(u32 inst)
51{
52 return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
53}
54
55static inline unsigned int get_rt(u32 inst)
56{
57 return (inst >> 21) & 0x1f;
58}
59
60static inline unsigned int get_rs(u32 inst)
61{
62 return (inst >> 21) & 0x1f;
63}
64
65static inline unsigned int get_ra(u32 inst)
66{
67 return (inst >> 16) & 0x1f;
68}
69
70static inline unsigned int get_rb(u32 inst)
71{
72 return (inst >> 11) & 0x1f;
73}
74
75static inline unsigned int get_rc(u32 inst)
76{
77 return inst & 0x1;
78}
79
80static inline unsigned int get_ws(u32 inst)
81{
82 return (inst >> 11) & 0x1f;
83}
84
85static inline unsigned int get_d(u32 inst)
86{
87 return inst & 0xffff;
88}
89
90static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
91 const struct tlbe *tlbe)
92{
93 gpa_t gpa;
94
95 if (!get_tlb_v(tlbe))
96 return 0;
97
98 /* Does it match current guest AS? */
99 /* XXX what about IS != DS? */
100 if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
101 return 0;
102
103 gpa = get_tlb_raddr(tlbe);
104 if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
105 /* Mapping is not for RAM. */
106 return 0;
107
108 return 1;
109}
110
111static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
112{
113 u64 eaddr;
114 u64 raddr;
115 u64 asid;
116 u32 flags;
117 struct tlbe *tlbe;
118 unsigned int ra;
119 unsigned int rs;
120 unsigned int ws;
121 unsigned int index;
122
123 ra = get_ra(inst);
124 rs = get_rs(inst);
125 ws = get_ws(inst);
126
127 index = vcpu->arch.gpr[ra];
128 if (index > PPC44x_TLB_SIZE) {
129 printk("%s: index %d\n", __func__, index);
130 kvmppc_dump_vcpu(vcpu);
131 return EMULATE_FAIL;
132 }
133
134 tlbe = &vcpu->arch.guest_tlb[index];
135
136 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
137 if (tlbe->word0 & PPC44x_TLB_VALID) {
138 eaddr = get_tlb_eaddr(tlbe);
139 asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
140 kvmppc_mmu_invalidate(vcpu, eaddr, get_tlb_end(tlbe), asid);
141 }
142
143 switch (ws) {
144 case PPC44x_TLB_PAGEID:
145 tlbe->tid = vcpu->arch.mmucr & 0xff;
146 tlbe->word0 = vcpu->arch.gpr[rs];
147 break;
148
149 case PPC44x_TLB_XLAT:
150 tlbe->word1 = vcpu->arch.gpr[rs];
151 break;
152
153 case PPC44x_TLB_ATTRIB:
154 tlbe->word2 = vcpu->arch.gpr[rs];
155 break;
156
157 default:
158 return EMULATE_FAIL;
159 }
160
161 if (tlbe_is_host_safe(vcpu, tlbe)) {
162 eaddr = get_tlb_eaddr(tlbe);
163 raddr = get_tlb_raddr(tlbe);
164 asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
165 flags = tlbe->word2 & 0xffff;
166
167 /* Create a 4KB mapping on the host. If the guest wanted a
168 * large page, only the first 4KB is mapped here and the rest
169 * are mapped on the fly. */
170 kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
171 }
172
173 KVMTRACE_5D(GTLB_WRITE, vcpu, index,
174 tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
175 handler);
176
177 return EMULATE_DONE;
178}
179
180static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
181{ 34{
182 if (vcpu->arch.tcr & TCR_DIE) { 35 if (vcpu->arch.tcr & TCR_DIE) {
183 /* The decrementer ticks at the same rate as the timebase, so 36 /* The decrementer ticks at the same rate as the timebase, so
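
The block of get_op()/get_xop()/get_rt()/get_ra()/... helpers deleted above moves into the new <asm/disassemble.h> included at the top of the file, so the field extraction itself does not change: primary opcode in the top six bits, extended opcode in bits 1-10, register operands in five-bit groups, a 16-bit displacement for D-form instructions. A quick standalone decode of one sample word using the same shifts (illustrative, not the header itself):

#include <stdio.h>
#include <stdint.h>

/* Same extraction as the helpers that moved to <asm/disassemble.h>. */
static unsigned int get_op(uint32_t inst) { return inst >> 26; }
static unsigned int get_rt(uint32_t inst) { return (inst >> 21) & 0x1f; }
static unsigned int get_ra(uint32_t inst) { return (inst >> 16) & 0x1f; }
static unsigned int get_d(uint32_t inst)  { return inst & 0xffff; }

int main(void)
{
	uint32_t inst = 0x80610008;	/* lwz r3, 8(r1) */

	/* prints: op=32 rt=3 ra=1 d=8 (lwz: primary opcode 32) */
	printf("op=%u rt=%u ra=%u d=%u\n",
	       get_op(inst), get_rt(inst), get_ra(inst), get_d(inst));
	return 0;
}
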
@@ -193,12 +46,6 @@ static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
193 } 46 }
194} 47}
195 48
196static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
197{
198 vcpu->arch.pc = vcpu->arch.srr0;
199 kvmppc_set_msr(vcpu, vcpu->arch.srr1);
200}
201
202/* XXX to do: 49/* XXX to do:
203 * lhax 50 * lhax
204 * lhaux 51 * lhaux
@@ -213,40 +60,30 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
213 * 60 *
214 * XXX is_bigendian should depend on MMU mapping or MSR[LE] 61 * XXX is_bigendian should depend on MMU mapping or MSR[LE]
215 */ 62 */
63/* XXX Should probably auto-generate instruction decoding for a particular core
64 * from opcode tables in the future. */
216int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) 65int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
217{ 66{
218 u32 inst = vcpu->arch.last_inst; 67 u32 inst = vcpu->arch.last_inst;
219 u32 ea; 68 u32 ea;
220 int ra; 69 int ra;
221 int rb; 70 int rb;
222 int rc;
223 int rs; 71 int rs;
224 int rt; 72 int rt;
225 int sprn; 73 int sprn;
226 int dcrn;
227 enum emulation_result emulated = EMULATE_DONE; 74 enum emulation_result emulated = EMULATE_DONE;
228 int advance = 1; 75 int advance = 1;
229 76
77 /* this default type might be overwritten by subcategories */
78 kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
79
230 switch (get_op(inst)) { 80 switch (get_op(inst)) {
231 case 3: /* trap */ 81 case 3: /* trap */
232 printk("trap!\n"); 82 vcpu->arch.esr |= ESR_PTR;
233 kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM); 83 kvmppc_core_queue_program(vcpu);
234 advance = 0; 84 advance = 0;
235 break; 85 break;
236 86
237 case 19:
238 switch (get_xop(inst)) {
239 case 50: /* rfi */
240 kvmppc_emul_rfi(vcpu);
241 advance = 0;
242 break;
243
244 default:
245 emulated = EMULATE_FAIL;
246 break;
247 }
248 break;
249
250 case 31: 87 case 31:
251 switch (get_xop(inst)) { 88 switch (get_xop(inst)) {
252 89
@@ -255,27 +92,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
255 emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); 92 emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
256 break; 93 break;
257 94
258 case 83: /* mfmsr */
259 rt = get_rt(inst);
260 vcpu->arch.gpr[rt] = vcpu->arch.msr;
261 break;
262
263 case 87: /* lbzx */ 95 case 87: /* lbzx */
264 rt = get_rt(inst); 96 rt = get_rt(inst);
265 emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); 97 emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
266 break; 98 break;
267 99
268 case 131: /* wrtee */
269 rs = get_rs(inst);
270 vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
271 | (vcpu->arch.gpr[rs] & MSR_EE);
272 break;
273
274 case 146: /* mtmsr */
275 rs = get_rs(inst);
276 kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
277 break;
278
279 case 151: /* stwx */ 100 case 151: /* stwx */
280 rs = get_rs(inst); 101 rs = get_rs(inst);
281 emulated = kvmppc_handle_store(run, vcpu, 102 emulated = kvmppc_handle_store(run, vcpu,
@@ -283,11 +104,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
283 4, 1); 104 4, 1);
284 break; 105 break;
285 106
286 case 163: /* wrteei */
287 vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
288 | (inst & MSR_EE);
289 break;
290
291 case 215: /* stbx */ 107 case 215: /* stbx */
292 rs = get_rs(inst); 108 rs = get_rs(inst);
293 emulated = kvmppc_handle_store(run, vcpu, 109 emulated = kvmppc_handle_store(run, vcpu,
@@ -328,42 +144,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
328 vcpu->arch.gpr[ra] = ea; 144 vcpu->arch.gpr[ra] = ea;
329 break; 145 break;
330 146
331 case 323: /* mfdcr */
332 dcrn = get_dcrn(inst);
333 rt = get_rt(inst);
334
335 /* The guest may access CPR0 registers to determine the timebase
336 * frequency, and it must know the real host frequency because it
337 * can directly access the timebase registers.
338 *
339 * It would be possible to emulate those accesses in userspace,
340 * but userspace can really only figure out the end frequency.
341 * We could decompose that into the factors that compute it, but
342 * that's tricky math, and it's easier to just report the real
343 * CPR0 values.
344 */
345 switch (dcrn) {
346 case DCRN_CPR0_CONFIG_ADDR:
347 vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
348 break;
349 case DCRN_CPR0_CONFIG_DATA:
350 local_irq_disable();
351 mtdcr(DCRN_CPR0_CONFIG_ADDR,
352 vcpu->arch.cpr0_cfgaddr);
353 vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
354 local_irq_enable();
355 break;
356 default:
357 run->dcr.dcrn = dcrn;
358 run->dcr.data = 0;
359 run->dcr.is_write = 0;
360 vcpu->arch.io_gpr = rt;
361 vcpu->arch.dcr_needed = 1;
362 emulated = EMULATE_DO_DCR;
363 }
364
365 break;
366
367 case 339: /* mfspr */ 147 case 339: /* mfspr */
368 sprn = get_sprn(inst); 148 sprn = get_sprn(inst);
369 rt = get_rt(inst); 149 rt = get_rt(inst);
@@ -373,26 +153,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
373 vcpu->arch.gpr[rt] = vcpu->arch.srr0; break; 153 vcpu->arch.gpr[rt] = vcpu->arch.srr0; break;
374 case SPRN_SRR1: 154 case SPRN_SRR1:
375 vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; 155 vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
376 case SPRN_MMUCR:
377 vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
378 case SPRN_PID:
379 vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
380 case SPRN_IVPR:
381 vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
382 case SPRN_CCR0:
383 vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
384 case SPRN_CCR1:
385 vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
386 case SPRN_PVR: 156 case SPRN_PVR:
387 vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; 157 vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
388 case SPRN_DEAR:
389 vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
390 case SPRN_ESR:
391 vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
392 case SPRN_DBCR0:
393 vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
394 case SPRN_DBCR1:
395 vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
396 158
397 /* Note: mftb and TBRL/TBWL are user-accessible, so 159 /* Note: mftb and TBRL/TBWL are user-accessible, so
398 * the guest can always access the real TB anyways. 160 * the guest can always access the real TB anyways.
@@ -413,42 +175,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
413 /* Note: SPRG4-7 are user-readable, so we don't get 175 /* Note: SPRG4-7 are user-readable, so we don't get
414 * a trap. */ 176 * a trap. */
415 177
416 case SPRN_IVOR0:
417 vcpu->arch.gpr[rt] = vcpu->arch.ivor[0]; break;
418 case SPRN_IVOR1:
419 vcpu->arch.gpr[rt] = vcpu->arch.ivor[1]; break;
420 case SPRN_IVOR2:
421 vcpu->arch.gpr[rt] = vcpu->arch.ivor[2]; break;
422 case SPRN_IVOR3:
423 vcpu->arch.gpr[rt] = vcpu->arch.ivor[3]; break;
424 case SPRN_IVOR4:
425 vcpu->arch.gpr[rt] = vcpu->arch.ivor[4]; break;
426 case SPRN_IVOR5:
427 vcpu->arch.gpr[rt] = vcpu->arch.ivor[5]; break;
428 case SPRN_IVOR6:
429 vcpu->arch.gpr[rt] = vcpu->arch.ivor[6]; break;
430 case SPRN_IVOR7:
431 vcpu->arch.gpr[rt] = vcpu->arch.ivor[7]; break;
432 case SPRN_IVOR8:
433 vcpu->arch.gpr[rt] = vcpu->arch.ivor[8]; break;
434 case SPRN_IVOR9:
435 vcpu->arch.gpr[rt] = vcpu->arch.ivor[9]; break;
436 case SPRN_IVOR10:
437 vcpu->arch.gpr[rt] = vcpu->arch.ivor[10]; break;
438 case SPRN_IVOR11:
439 vcpu->arch.gpr[rt] = vcpu->arch.ivor[11]; break;
440 case SPRN_IVOR12:
441 vcpu->arch.gpr[rt] = vcpu->arch.ivor[12]; break;
442 case SPRN_IVOR13:
443 vcpu->arch.gpr[rt] = vcpu->arch.ivor[13]; break;
444 case SPRN_IVOR14:
445 vcpu->arch.gpr[rt] = vcpu->arch.ivor[14]; break;
446 case SPRN_IVOR15:
447 vcpu->arch.gpr[rt] = vcpu->arch.ivor[15]; break;
448
449 default: 178 default:
450 printk("mfspr: unknown spr %x\n", sprn); 179 emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt);
451 vcpu->arch.gpr[rt] = 0; 180 if (emulated == EMULATE_FAIL) {
181 printk("mfspr: unknown spr %x\n", sprn);
182 vcpu->arch.gpr[rt] = 0;
183 }
452 break; 184 break;
453 } 185 }
454 break; 186 break;
@@ -478,25 +210,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
478 vcpu->arch.gpr[ra] = ea; 210 vcpu->arch.gpr[ra] = ea;
479 break; 211 break;
480 212
481 case 451: /* mtdcr */
482 dcrn = get_dcrn(inst);
483 rs = get_rs(inst);
484
485 /* emulate some access in kernel */
486 switch (dcrn) {
487 case DCRN_CPR0_CONFIG_ADDR:
488 vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
489 break;
490 default:
491 run->dcr.dcrn = dcrn;
492 run->dcr.data = vcpu->arch.gpr[rs];
493 run->dcr.is_write = 1;
494 vcpu->arch.dcr_needed = 1;
495 emulated = EMULATE_DO_DCR;
496 }
497
498 break;
499
500 case 467: /* mtspr */ 213 case 467: /* mtspr */
501 sprn = get_sprn(inst); 214 sprn = get_sprn(inst);
502 rs = get_rs(inst); 215 rs = get_rs(inst);
@@ -505,22 +218,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
505 vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break; 218 vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break;
506 case SPRN_SRR1: 219 case SPRN_SRR1:
507 vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break; 220 vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break;
508 case SPRN_MMUCR:
509 vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
510 case SPRN_PID:
511 kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
512 case SPRN_CCR0:
513 vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
514 case SPRN_CCR1:
515 vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
516 case SPRN_DEAR:
517 vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
518 case SPRN_ESR:
519 vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
520 case SPRN_DBCR0:
521 vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
522 case SPRN_DBCR1:
523 vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
524 221
525 /* XXX We need to context-switch the timebase for 222 /* XXX We need to context-switch the timebase for
526 * watchdog and FIT. */ 223 * watchdog and FIT. */
@@ -532,14 +229,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
532 kvmppc_emulate_dec(vcpu); 229 kvmppc_emulate_dec(vcpu);
533 break; 230 break;
534 231
535 case SPRN_TSR:
536 vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
537
538 case SPRN_TCR:
539 vcpu->arch.tcr = vcpu->arch.gpr[rs];
540 kvmppc_emulate_dec(vcpu);
541 break;
542
543 case SPRN_SPRG0: 232 case SPRN_SPRG0:
544 vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break; 233 vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break;
545 case SPRN_SPRG1: 234 case SPRN_SPRG1:
@@ -549,56 +238,10 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
549 case SPRN_SPRG3: 238 case SPRN_SPRG3:
550 vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break; 239 vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break;
551 240
552 /* Note: SPRG4-7 are user-readable. These values are
553 * loaded into the real SPRGs when resuming the
554 * guest. */
555 case SPRN_SPRG4:
556 vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
557 case SPRN_SPRG5:
558 vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
559 case SPRN_SPRG6:
560 vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
561 case SPRN_SPRG7:
562 vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
563
564 case SPRN_IVPR:
565 vcpu->arch.ivpr = vcpu->arch.gpr[rs]; break;
566 case SPRN_IVOR0:
567 vcpu->arch.ivor[0] = vcpu->arch.gpr[rs]; break;
568 case SPRN_IVOR1:
569 vcpu->arch.ivor[1] = vcpu->arch.gpr[rs]; break;
570 case SPRN_IVOR2:
571 vcpu->arch.ivor[2] = vcpu->arch.gpr[rs]; break;
572 case SPRN_IVOR3:
573 vcpu->arch.ivor[3] = vcpu->arch.gpr[rs]; break;
574 case SPRN_IVOR4:
575 vcpu->arch.ivor[4] = vcpu->arch.gpr[rs]; break;
576 case SPRN_IVOR5:
577 vcpu->arch.ivor[5] = vcpu->arch.gpr[rs]; break;
578 case SPRN_IVOR6:
579 vcpu->arch.ivor[6] = vcpu->arch.gpr[rs]; break;
580 case SPRN_IVOR7:
581 vcpu->arch.ivor[7] = vcpu->arch.gpr[rs]; break;
582 case SPRN_IVOR8:
583 vcpu->arch.ivor[8] = vcpu->arch.gpr[rs]; break;
584 case SPRN_IVOR9:
585 vcpu->arch.ivor[9] = vcpu->arch.gpr[rs]; break;
586 case SPRN_IVOR10:
587 vcpu->arch.ivor[10] = vcpu->arch.gpr[rs]; break;
588 case SPRN_IVOR11:
589 vcpu->arch.ivor[11] = vcpu->arch.gpr[rs]; break;
590 case SPRN_IVOR12:
591 vcpu->arch.ivor[12] = vcpu->arch.gpr[rs]; break;
592 case SPRN_IVOR13:
593 vcpu->arch.ivor[13] = vcpu->arch.gpr[rs]; break;
594 case SPRN_IVOR14:
595 vcpu->arch.ivor[14] = vcpu->arch.gpr[rs]; break;
596 case SPRN_IVOR15:
597 vcpu->arch.ivor[15] = vcpu->arch.gpr[rs]; break;
598
599 default: 241 default:
600 printk("mtspr: unknown spr %x\n", sprn); 242 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
601 emulated = EMULATE_FAIL; 243 if (emulated == EMULATE_FAIL)
244 printk("mtspr: unknown spr %x\n", sprn);
602 break; 245 break;
603 } 246 }
604 break; 247 break;
@@ -629,36 +272,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
629 4, 0); 272 4, 0);
630 break; 273 break;
631 274
632 case 978: /* tlbwe */
633 emulated = kvmppc_emul_tlbwe(vcpu, inst);
634 break;
635
636 case 914: { /* tlbsx */
637 int index;
638 unsigned int as = get_mmucr_sts(vcpu);
639 unsigned int pid = get_mmucr_stid(vcpu);
640
641 rt = get_rt(inst);
642 ra = get_ra(inst);
643 rb = get_rb(inst);
644 rc = get_rc(inst);
645
646 ea = vcpu->arch.gpr[rb];
647 if (ra)
648 ea += vcpu->arch.gpr[ra];
649
650 index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
651 if (rc) {
652 if (index < 0)
653 vcpu->arch.cr &= ~0x20000000;
654 else
655 vcpu->arch.cr |= 0x20000000;
656 }
657 vcpu->arch.gpr[rt] = index;
658
659 }
660 break;
661
662 case 790: /* lhbrx */ 275 case 790: /* lhbrx */
663 rt = get_rt(inst); 276 rt = get_rt(inst);
664 emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0); 277 emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0);
@@ -674,14 +287,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
674 2, 0); 287 2, 0);
675 break; 288 break;
676 289
677 case 966: /* iccci */
678 break;
679
680 default: 290 default:
681 printk("unknown: op %d xop %d\n", get_op(inst), 291 /* Attempt core-specific emulation below. */
682 get_xop(inst));
683 emulated = EMULATE_FAIL; 292 emulated = EMULATE_FAIL;
684 break;
685 } 293 }
686 break; 294 break;
687 295
@@ -764,12 +372,19 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
764 break; 372 break;
765 373
766 default: 374 default:
767 printk("unknown op %d\n", get_op(inst));
768 emulated = EMULATE_FAIL; 375 emulated = EMULATE_FAIL;
769 break;
770 } 376 }
771 377
772 KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit); 378 if (emulated == EMULATE_FAIL) {
379 emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance);
380 if (emulated == EMULATE_FAIL) {
381 advance = 0;
382 printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
383 "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
384 }
385 }
386
387 KVMTRACE_3D(PPC_INSTR, vcpu, inst, (int)vcpu->arch.pc, emulated, entryexit);
773 388
774 if (advance) 389 if (advance)
775 vcpu->arch.pc += 4; /* Advance past emulated instruction. */ 390 vcpu->arch.pc += 4; /* Advance past emulated instruction. */
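
The net effect of this emulate.c rewrite is a two-level dispatch: the generic switch keeps only the instructions common across Book E cores, and anything it does not recognise (rfi/mtmsr and friends, the 44x SPRs, tlbwe/tlbsx, DCR accesses) now falls through to kvmppc_core_emulate_op(), kvmppc_core_emulate_mtspr() and kvmppc_core_emulate_mfspr(), whose 440 versions live in the new 44x_emulate.c; only if both levels return EMULATE_FAIL is the instruction reported as unemulatable. A condensed model of that fallback chain (toy code, not the kernel interfaces):

#include <stdio.h>

enum { EMULATE_DONE, EMULATE_FAIL };

static int generic_emulate(unsigned int key)
{
	return (key == 31 || key == 32) ? EMULATE_DONE : EMULATE_FAIL;	/* shared ops */
}

static int core_emulate(unsigned int key)
{
	return (key == 978) ? EMULATE_DONE : EMULATE_FAIL;	/* e.g. 44x tlbwe (xop 978) */
}

static void emulate(unsigned int key)
{
	int r = generic_emulate(key);

	if (r == EMULATE_FAIL)
		r = core_emulate(key);		/* second chance: core-specific handler */

	printf("key %u: %s\n", key, r == EMULATE_FAIL ? "could not emulate" : "emulated");
}

int main(void)
{
	emulate(32);	/* handled by the generic level */
	emulate(978);	/* handled by the core-specific level */
	emulate(999);	/* neither: reported as a failure */
	return 0;
}
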
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 8bef0efcdfe1..2822c8ccfaaf 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -28,9 +28,9 @@
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29#include <asm/kvm_ppc.h> 29#include <asm/kvm_ppc.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31#include "timing.h"
31#include "../mm/mmu_decl.h" 32#include "../mm/mmu_decl.h"
32 33
33
34gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 34gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
35{ 35{
36 return gfn; 36 return gfn;
@@ -99,14 +99,7 @@ void kvm_arch_hardware_unsetup(void)
99 99
100void kvm_arch_check_processor_compat(void *rtn) 100void kvm_arch_check_processor_compat(void *rtn)
101{ 101{
102 int r; 102 *(int *)rtn = kvmppc_core_check_processor_compat();
103
104 if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
105 r = 0;
106 else
107 r = -ENOTSUPP;
108
109 *(int *)rtn = r;
110} 103}
111 104
112struct kvm *kvm_arch_create_vm(void) 105struct kvm *kvm_arch_create_vm(void)
@@ -144,9 +137,6 @@ int kvm_dev_ioctl_check_extension(long ext)
144 int r; 137 int r;
145 138
146 switch (ext) { 139 switch (ext) {
147 case KVM_CAP_USER_MEMORY:
148 r = 1;
149 break;
150 case KVM_CAP_COALESCED_MMIO: 140 case KVM_CAP_COALESCED_MMIO:
151 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 141 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
152 break; 142 break;
@@ -179,30 +169,15 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
179struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) 169struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
180{ 170{
181 struct kvm_vcpu *vcpu; 171 struct kvm_vcpu *vcpu;
182 int err; 172 vcpu = kvmppc_core_vcpu_create(kvm, id);
183 173 kvmppc_create_vcpu_debugfs(vcpu, id);
184 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
185 if (!vcpu) {
186 err = -ENOMEM;
187 goto out;
188 }
189
190 err = kvm_vcpu_init(vcpu, kvm, id);
191 if (err)
192 goto free_vcpu;
193
194 return vcpu; 174 return vcpu;
195
196free_vcpu:
197 kmem_cache_free(kvm_vcpu_cache, vcpu);
198out:
199 return ERR_PTR(err);
200} 175}
201 176
202void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 177void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
203{ 178{
204 kvm_vcpu_uninit(vcpu); 179 kvmppc_remove_vcpu_debugfs(vcpu);
205 kmem_cache_free(kvm_vcpu_cache, vcpu); 180 kvmppc_core_vcpu_free(vcpu);
206} 181}
207 182
208void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 183void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -212,16 +187,14 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
212 187
213int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 188int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
214{ 189{
215 unsigned int priority = exception_priority[BOOKE_INTERRUPT_DECREMENTER]; 190 return kvmppc_core_pending_dec(vcpu);
216
217 return test_bit(priority, &vcpu->arch.pending_exceptions);
218} 191}
219 192
220static void kvmppc_decrementer_func(unsigned long data) 193static void kvmppc_decrementer_func(unsigned long data)
221{ 194{
222 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; 195 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
223 196
224 kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER); 197 kvmppc_core_queue_dec(vcpu);
225 198
226 if (waitqueue_active(&vcpu->wq)) { 199 if (waitqueue_active(&vcpu->wq)) {
227 wake_up_interruptible(&vcpu->wq); 200 wake_up_interruptible(&vcpu->wq);
@@ -242,96 +215,25 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
242 kvmppc_core_destroy_mmu(vcpu); 215 kvmppc_core_destroy_mmu(vcpu);
243} 216}
244 217
245/* Note: clearing MSR[DE] just means that the debug interrupt will not be
246 * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
247 * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
248 * will be delivered as an "imprecise debug event" (which is indicated by
249 * DBSR[IDE]).
250 */
251static void kvmppc_disable_debug_interrupts(void)
252{
253 mtmsr(mfmsr() & ~MSR_DE);
254}
255
256static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
257{
258 kvmppc_disable_debug_interrupts();
259
260 mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
261 mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
262 mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
263 mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
264 mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
265 mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
266 mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
267 mtmsr(vcpu->arch.host_msr);
268}
269
270static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
271{
272 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
273 u32 dbcr0 = 0;
274
275 vcpu->arch.host_msr = mfmsr();
276 kvmppc_disable_debug_interrupts();
277
278 /* Save host debug register state. */
279 vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
280 vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
281 vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
282 vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
283 vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
284 vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
285 vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
286
287 /* set registers up for guest */
288
289 if (dbg->bp[0]) {
290 mtspr(SPRN_IAC1, dbg->bp[0]);
291 dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
292 }
293 if (dbg->bp[1]) {
294 mtspr(SPRN_IAC2, dbg->bp[1]);
295 dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
296 }
297 if (dbg->bp[2]) {
298 mtspr(SPRN_IAC3, dbg->bp[2]);
299 dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
300 }
301 if (dbg->bp[3]) {
302 mtspr(SPRN_IAC4, dbg->bp[3]);
303 dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
304 }
305
306 mtspr(SPRN_DBCR0, dbcr0);
307 mtspr(SPRN_DBCR1, 0);
308 mtspr(SPRN_DBCR2, 0);
309}
310
311void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 218void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
312{ 219{
313 int i;
314
315 if (vcpu->guest_debug.enabled) 220 if (vcpu->guest_debug.enabled)
316 kvmppc_load_guest_debug_registers(vcpu); 221 kvmppc_core_load_guest_debugstate(vcpu);
317 222
318 /* Mark every guest entry in the shadow TLB entry modified, so that they 223 kvmppc_core_vcpu_load(vcpu, cpu);
319 * will all be reloaded on the next vcpu run (instead of being
320 * demand-faulted). */
321 for (i = 0; i <= tlb_44x_hwater; i++)
322 kvmppc_tlbe_set_modified(vcpu, i);
323} 224}
324 225
325void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 226void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
326{ 227{
327 if (vcpu->guest_debug.enabled) 228 if (vcpu->guest_debug.enabled)
328 kvmppc_restore_host_debug_state(vcpu); 229 kvmppc_core_load_host_debugstate(vcpu);
329 230
330 /* Don't leave guest TLB entries resident when being de-scheduled. */ 231 /* Don't leave guest TLB entries resident when being de-scheduled. */
331 /* XXX It would be nice to differentiate between heavyweight exit and 232 /* XXX It would be nice to differentiate between heavyweight exit and
332 * sched_out here, since we could avoid the TLB flush for heavyweight 233 * sched_out here, since we could avoid the TLB flush for heavyweight
333 * exits. */ 234 * exits. */
334 _tlbil_all(); 235 _tlbil_all();
236 kvmppc_core_vcpu_put(vcpu);
335} 237}
336 238
337int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, 239int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
@@ -355,14 +257,14 @@ int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
355static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, 257static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
356 struct kvm_run *run) 258 struct kvm_run *run)
357{ 259{
358 u32 *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr]; 260 ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
359 *gpr = run->dcr.data; 261 *gpr = run->dcr.data;
360} 262}
361 263
362static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, 264static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
363 struct kvm_run *run) 265 struct kvm_run *run)
364{ 266{
365 u32 *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr]; 267 ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
366 268
367 if (run->mmio.len > sizeof(*gpr)) { 269 if (run->mmio.len > sizeof(*gpr)) {
368 printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); 270 printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
@@ -460,7 +362,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
460 vcpu->arch.dcr_needed = 0; 362 vcpu->arch.dcr_needed = 0;
461 } 363 }
462 364
463 kvmppc_check_and_deliver_interrupts(vcpu); 365 kvmppc_core_deliver_interrupts(vcpu);
464 366
465 local_irq_disable(); 367 local_irq_disable();
466 kvm_guest_enter(); 368 kvm_guest_enter();
@@ -478,7 +380,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
478 380
479int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) 381int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
480{ 382{
481 kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL); 383 kvmppc_core_queue_external(vcpu, irq);
482 384
483 if (waitqueue_active(&vcpu->wq)) { 385 if (waitqueue_active(&vcpu->wq)) {
484 wake_up_interruptible(&vcpu->wq); 386 wake_up_interruptible(&vcpu->wq);
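
With powerpc.c reduced to forwarding into per-core hooks, the ppc440 platform test that used to sit in kvm_arch_check_processor_compat() becomes one of those hooks. A sketch of what kvmppc_core_check_processor_compat() presumably looks like under the new contract (0 for a supported core, -ENOTSUPP otherwise), mirroring the check deleted above:

#include <linux/string.h>
#include <linux/errno.h>
#include <asm/cputable.h>

/* Illustrative sketch: per-core hook now called from
 * kvm_arch_check_processor_compat(); mirrors the ppc440 check that the
 * hunk above removes from the generic code. */
int kvmppc_core_check_processor_compat(void)
{
	if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
		return 0;

	return -ENOTSUPP;
}
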
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
new file mode 100644
index 000000000000..47ee603f558e
--- /dev/null
+++ b/arch/powerpc/kvm/timing.c
@@ -0,0 +1,239 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright IBM Corp. 2008
16 *
17 * Authors: Hollis Blanchard <hollisb@us.ibm.com>
18 * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
19 */
20
21#include <linux/kvm_host.h>
22#include <linux/fs.h>
23#include <linux/seq_file.h>
24#include <linux/debugfs.h>
25#include <linux/uaccess.h>
26
27#include <asm/time.h>
28#include <asm-generic/div64.h>
29
30#include "timing.h"
31
32void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
33{
34 int i;
35
36 /* pause guest execution to avoid concurrent updates */
37 local_irq_disable();
38 mutex_lock(&vcpu->mutex);
39
40 vcpu->arch.last_exit_type = 0xDEAD;
41 for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
42 vcpu->arch.timing_count_type[i] = 0;
43 vcpu->arch.timing_max_duration[i] = 0;
44 vcpu->arch.timing_min_duration[i] = 0xFFFFFFFF;
45 vcpu->arch.timing_sum_duration[i] = 0;
46 vcpu->arch.timing_sum_quad_duration[i] = 0;
47 }
48 vcpu->arch.timing_last_exit = 0;
49 vcpu->arch.timing_exit.tv64 = 0;
50 vcpu->arch.timing_last_enter.tv64 = 0;
51
52 mutex_unlock(&vcpu->mutex);
53 local_irq_enable();
54}
55
56static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
57{
58 u64 old;
59
60 do_div(duration, tb_ticks_per_usec);
61 if (unlikely(duration > 0xFFFFFFFF)) {
62 printk(KERN_ERR"%s - duration too big -> overflow"
63 " duration %lld type %d exit #%d\n",
64 __func__, duration, type,
65 vcpu->arch.timing_count_type[type]);
66 return;
67 }
68
69 vcpu->arch.timing_count_type[type]++;
70
71 /* sum */
72 old = vcpu->arch.timing_sum_duration[type];
73 vcpu->arch.timing_sum_duration[type] += duration;
74 if (unlikely(old > vcpu->arch.timing_sum_duration[type])) {
75 printk(KERN_ERR"%s - wrap adding sum of durations"
76 " old %lld new %lld type %d exit # of type %d\n",
77 __func__, old, vcpu->arch.timing_sum_duration[type],
78 type, vcpu->arch.timing_count_type[type]);
79 }
80
81 /* square sum */
82 old = vcpu->arch.timing_sum_quad_duration[type];
83 vcpu->arch.timing_sum_quad_duration[type] += (duration*duration);
84 if (unlikely(old > vcpu->arch.timing_sum_quad_duration[type])) {
85 printk(KERN_ERR"%s - wrap adding sum of squared durations"
86 " old %lld new %lld type %d exit # of type %d\n",
87 __func__, old,
88 vcpu->arch.timing_sum_quad_duration[type],
89 type, vcpu->arch.timing_count_type[type]);
90 }
91
92 /* set min/max */
93 if (unlikely(duration < vcpu->arch.timing_min_duration[type]))
94 vcpu->arch.timing_min_duration[type] = duration;
95 if (unlikely(duration > vcpu->arch.timing_max_duration[type]))
96 vcpu->arch.timing_max_duration[type] = duration;
97}
98
99void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu)
100{
101 u64 exit = vcpu->arch.timing_last_exit;
102 u64 enter = vcpu->arch.timing_last_enter.tv64;
103
104 /* save exit time, used next exit when the reenter time is known */
105 vcpu->arch.timing_last_exit = vcpu->arch.timing_exit.tv64;
106
107 if (unlikely(vcpu->arch.last_exit_type == 0xDEAD || exit == 0))
108 return; /* skip incomplete cycle (e.g. after reset) */
109
110 /* update statistics for average and standard deviation */
111 add_exit_timing(vcpu, (enter - exit), vcpu->arch.last_exit_type);
112 /* enter -> timing_last_exit is time spent in guest - log this too */
113 add_exit_timing(vcpu, (vcpu->arch.timing_last_exit - enter),
114 TIMEINGUEST);
115}
116
117static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = {
118 [MMIO_EXITS] = "MMIO",
119 [DCR_EXITS] = "DCR",
120 [SIGNAL_EXITS] = "SIGNAL",
121 [ITLB_REAL_MISS_EXITS] = "ITLBREAL",
122 [ITLB_VIRT_MISS_EXITS] = "ITLBVIRT",
123 [DTLB_REAL_MISS_EXITS] = "DTLBREAL",
124 [DTLB_VIRT_MISS_EXITS] = "DTLBVIRT",
125 [SYSCALL_EXITS] = "SYSCALL",
126 [ISI_EXITS] = "ISI",
127 [DSI_EXITS] = "DSI",
128 [EMULATED_INST_EXITS] = "EMULINST",
129 [EMULATED_MTMSRWE_EXITS] = "EMUL_WAIT",
130 [EMULATED_WRTEE_EXITS] = "EMUL_WRTEE",
131 [EMULATED_MTSPR_EXITS] = "EMUL_MTSPR",
132 [EMULATED_MFSPR_EXITS] = "EMUL_MFSPR",
133 [EMULATED_MTMSR_EXITS] = "EMUL_MTMSR",
134 [EMULATED_MFMSR_EXITS] = "EMUL_MFMSR",
135 [EMULATED_TLBSX_EXITS] = "EMUL_TLBSX",
136 [EMULATED_TLBWE_EXITS] = "EMUL_TLBWE",
137 [EMULATED_RFI_EXITS] = "EMUL_RFI",
138 [DEC_EXITS] = "DEC",
139 [EXT_INTR_EXITS] = "EXTINT",
140 [HALT_WAKEUP] = "HALT",
141 [USR_PR_INST] = "USR_PR_INST",
142 [FP_UNAVAIL] = "FP_UNAVAIL",
143 [DEBUG_EXITS] = "DEBUG",
144 [TIMEINGUEST] = "TIMEINGUEST"
145};
146
147static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
148{
149 struct kvm_vcpu *vcpu = m->private;
150 int i;
151
152 seq_printf(m, "%s", "type count min max sum sum_squared\n");
153
154 for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
155 seq_printf(m, "%12s %10d %10lld %10lld %20lld %20lld\n",
156 kvm_exit_names[i],
157 vcpu->arch.timing_count_type[i],
158 vcpu->arch.timing_min_duration[i],
159 vcpu->arch.timing_max_duration[i],
160 vcpu->arch.timing_sum_duration[i],
161 vcpu->arch.timing_sum_quad_duration[i]);
162 }
163 return 0;
164}
165
166/* Write 'c' to clear the timing statistics. */
167static ssize_t kvmppc_exit_timing_write(struct file *file,
168 const char __user *user_buf,
169 size_t count, loff_t *ppos)
170{
171 int err = -EINVAL;
172 char c;
173
174 if (count > 1) {
175 goto done;
176 }
177
178 if (get_user(c, user_buf)) {
179 err = -EFAULT;
180 goto done;
181 }
182
183 if (c == 'c') {
184 struct seq_file *seqf = (struct seq_file *)file->private_data;
185 struct kvm_vcpu *vcpu = seqf->private;
186 /* Write does not affect our buffers previously generated with
187 * show. seq_file is locked here to prevent races of init with
188 * a show call */
189 mutex_lock(&seqf->lock);
190 kvmppc_init_timing_stats(vcpu);
191 mutex_unlock(&seqf->lock);
192 err = count;
193 }
194
195done:
196 return err;
197}
198
199static int kvmppc_exit_timing_open(struct inode *inode, struct file *file)
200{
201 return single_open(file, kvmppc_exit_timing_show, inode->i_private);
202}
203
204static struct file_operations kvmppc_exit_timing_fops = {
205 .owner = THIS_MODULE,
206 .open = kvmppc_exit_timing_open,
207 .read = seq_read,
208 .write = kvmppc_exit_timing_write,
209 .llseek = seq_lseek,
210 .release = single_release,
211};
212
213void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id)
214{
215 static char dbg_fname[50];
216 struct dentry *debugfs_file;
217
218 snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%u_timing",
219 current->pid, id);
220 debugfs_file = debugfs_create_file(dbg_fname, 0666,
221 kvm_debugfs_dir, vcpu,
222 &kvmppc_exit_timing_fops);
223
224 if (!debugfs_file) {
225 printk(KERN_ERR"%s: error creating debugfs file %s\n",
226 __func__, dbg_fname);
227 return;
228 }
229
230 vcpu->arch.debugfs_exit_timing = debugfs_file;
231}
232
233void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu)
234{
235 if (vcpu->arch.debugfs_exit_timing) {
236 debugfs_remove(vcpu->arch.debugfs_exit_timing);
237 vcpu->arch.debugfs_exit_timing = NULL;
238 }
239}
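
The statistics above are only exported through the per-vcpu debugfs file; nothing in the kernel reads them back. A hedged userspace sketch that dumps the table and then clears it by writing a single 'c', as kvmppc_exit_timing_write() expects; the path layout is an assumption (debugfs mounted at /sys/kernel/debug with kvm_debugfs_dir being the usual "kvm" directory):

#include <stdio.h>

/* Illustrative only: pass the full path of the vm<pid>_vcpu<id>_timing file. */
static int dump_and_clear(const char *path)
{
	char line[512];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return -1;
	}
	while (fgets(line, sizeof(line), f))	/* print the statistics table */
		fputs(line, stdout);
	fclose(f);

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fputc('c', f);				/* a single 'c' resets all counters */
	fclose(f);
	return 0;
}

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <timing-debugfs-file>\n", argv[0]);
		return 1;
	}
	return dump_and_clear(argv[1]) ? 1 : 0;
}
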
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
new file mode 100644
index 000000000000..bb13b1f3cd5a
--- /dev/null
+++ b/arch/powerpc/kvm/timing.h
@@ -0,0 +1,102 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright IBM Corp. 2008
16 *
17 * Authors: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
18 */
19
20#ifndef __POWERPC_KVM_EXITTIMING_H__
21#define __POWERPC_KVM_EXITTIMING_H__
22
23#include <linux/kvm_host.h>
24#include <asm/kvm_host.h>
25
26#ifdef CONFIG_KVM_EXIT_TIMING
27void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu);
28void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu);
29void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id);
30void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu);
31
32static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type)
33{
34 vcpu->arch.last_exit_type = type;
35}
36
37#else
38/* if exit timing is not configured there is no need to build the c file */
39static inline void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) {}
40static inline void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) {}
41static inline void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu,
42 unsigned int id) {}
43static inline void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
44static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {}
45#endif /* CONFIG_KVM_EXIT_TIMING */
46
47/* account the exit in kvm_stats */
48static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
49{
50 /* type has to be known at build time for optimization */
51 BUILD_BUG_ON(__builtin_constant_p(type));
52 switch (type) {
53 case EXT_INTR_EXITS:
54 vcpu->stat.ext_intr_exits++;
55 break;
56 case DEC_EXITS:
57 vcpu->stat.dec_exits++;
58 break;
59 case EMULATED_INST_EXITS:
60 vcpu->stat.emulated_inst_exits++;
61 break;
62 case DCR_EXITS:
63 vcpu->stat.dcr_exits++;
64 break;
65 case DSI_EXITS:
66 vcpu->stat.dsi_exits++;
67 break;
68 case ISI_EXITS:
69 vcpu->stat.isi_exits++;
70 break;
71 case SYSCALL_EXITS:
72 vcpu->stat.syscall_exits++;
73 break;
74 case DTLB_REAL_MISS_EXITS:
75 vcpu->stat.dtlb_real_miss_exits++;
76 break;
77 case DTLB_VIRT_MISS_EXITS:
78 vcpu->stat.dtlb_virt_miss_exits++;
79 break;
80 case MMIO_EXITS:
81 vcpu->stat.mmio_exits++;
82 break;
83 case ITLB_REAL_MISS_EXITS:
84 vcpu->stat.itlb_real_miss_exits++;
85 break;
86 case ITLB_VIRT_MISS_EXITS:
87 vcpu->stat.itlb_virt_miss_exits++;
88 break;
89 case SIGNAL_EXITS:
90 vcpu->stat.signal_exits++;
91 break;
92 }
93}
94
95/* wrapper to set exit time and account for it in kvm_stats */
96static inline void kvmppc_account_exit(struct kvm_vcpu *vcpu, int type)
97{
98 kvmppc_set_exit_type(vcpu, type);
99 kvmppc_account_exit_stat(vcpu, type);
100}
101
102#endif /* __POWERPC_KVM_EXITTIMING_H__ */
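
kvmppc_account_exit() is meant to be invoked with a compile-time-constant exit type so that, after inlining, the switch in kvmppc_account_exit_stat() collapses to a single vcpu->stat increment, and the timing bookkeeping compiles away entirely when CONFIG_KVM_EXIT_TIMING is off. A minimal sketch of the expected call pattern; the handler itself is illustrative (the real callers are the BookE exit handlers elsewhere in this series):

#include <linux/kvm_host.h>
#include <asm/kvm_ppc.h>
#include "timing.h"

/* Illustrative only: an exit handler accounts its own exit with a constant
 * type, so kvmppc_account_exit_stat() reduces to one counter increment. */
static int handle_guest_dcr(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
	run->exit_reason = KVM_EXIT_DCR;
	kvmppc_account_exit(vcpu, DCR_EXITS);
	return RESUME_HOST;
}
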
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8b00eb2ddf57..be8497186b96 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -113,8 +113,6 @@ long kvm_arch_dev_ioctl(struct file *filp,
113int kvm_dev_ioctl_check_extension(long ext) 113int kvm_dev_ioctl_check_extension(long ext)
114{ 114{
115 switch (ext) { 115 switch (ext) {
116 case KVM_CAP_USER_MEMORY:
117 return 1;
118 default: 116 default:
119 return 0; 117 return 0;
120 } 118 }
@@ -185,8 +183,6 @@ struct kvm *kvm_arch_create_vm(void)
185 debug_register_view(kvm->arch.dbf, &debug_sprintf_view); 183 debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
186 VM_EVENT(kvm, 3, "%s", "vm created"); 184 VM_EVENT(kvm, 3, "%s", "vm created");
187 185
188 try_module_get(THIS_MODULE);
189
190 return kvm; 186 return kvm;
191out_nodbf: 187out_nodbf:
192 free_page((unsigned long)(kvm->arch.sca)); 188 free_page((unsigned long)(kvm->arch.sca));
@@ -196,13 +192,33 @@ out_nokvm:
196 return ERR_PTR(rc); 192 return ERR_PTR(rc);
197} 193}
198 194
195void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
196{
197 VCPU_EVENT(vcpu, 3, "%s", "free cpu");
198 free_page((unsigned long)(vcpu->arch.sie_block));
199 kvm_vcpu_uninit(vcpu);
200 kfree(vcpu);
201}
202
203static void kvm_free_vcpus(struct kvm *kvm)
204{
205 unsigned int i;
206
207 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
208 if (kvm->vcpus[i]) {
209 kvm_arch_vcpu_destroy(kvm->vcpus[i]);
210 kvm->vcpus[i] = NULL;
211 }
212 }
213}
214
199void kvm_arch_destroy_vm(struct kvm *kvm) 215void kvm_arch_destroy_vm(struct kvm *kvm)
200{ 216{
201 debug_unregister(kvm->arch.dbf); 217 kvm_free_vcpus(kvm);
202 kvm_free_physmem(kvm); 218 kvm_free_physmem(kvm);
203 free_page((unsigned long)(kvm->arch.sca)); 219 free_page((unsigned long)(kvm->arch.sca));
220 debug_unregister(kvm->arch.dbf);
204 kfree(kvm); 221 kfree(kvm);
205 module_put(THIS_MODULE);
206} 222}
207 223
208/* Section: vcpu related */ 224/* Section: vcpu related */
@@ -213,8 +229,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
213 229
214void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 230void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
215{ 231{
216 /* kvm common code refers to this, but doesn't call it */ 232 /* Nothing to do */
217 BUG();
218} 233}
219 234
220void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 235void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -308,8 +323,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
308 VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu, 323 VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
309 vcpu->arch.sie_block); 324 vcpu->arch.sie_block);
310 325
311 try_module_get(THIS_MODULE);
312
313 return vcpu; 326 return vcpu;
314out_free_cpu: 327out_free_cpu:
315 kfree(vcpu); 328 kfree(vcpu);
@@ -317,14 +330,6 @@ out_nomem:
317 return ERR_PTR(rc); 330 return ERR_PTR(rc);
318} 331}
319 332
320void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
321{
322 VCPU_EVENT(vcpu, 3, "%s", "destroy cpu");
323 free_page((unsigned long)(vcpu->arch.sie_block));
324 kfree(vcpu);
325 module_put(THIS_MODULE);
326}
327
328int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 333int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
329{ 334{
330 /* kvm common code refers to this, but never calls it */ 335 /* kvm common code refers to this, but never calls it */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8346be87cfa1..97215a458e5f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -21,6 +21,7 @@
21 21
22#include <asm/pvclock-abi.h> 22#include <asm/pvclock-abi.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/mtrr.h>
24 25
25#define KVM_MAX_VCPUS 16 26#define KVM_MAX_VCPUS 16
26#define KVM_MEMORY_SLOTS 32 27#define KVM_MEMORY_SLOTS 32
@@ -86,6 +87,7 @@
86#define KVM_MIN_FREE_MMU_PAGES 5 87#define KVM_MIN_FREE_MMU_PAGES 5
87#define KVM_REFILL_PAGES 25 88#define KVM_REFILL_PAGES 25
88#define KVM_MAX_CPUID_ENTRIES 40 89#define KVM_MAX_CPUID_ENTRIES 40
90#define KVM_NR_FIXED_MTRR_REGION 88
89#define KVM_NR_VAR_MTRR 8 91#define KVM_NR_VAR_MTRR 8
90 92
91extern spinlock_t kvm_lock; 93extern spinlock_t kvm_lock;
@@ -180,6 +182,8 @@ struct kvm_mmu_page {
180 struct list_head link; 182 struct list_head link;
181 struct hlist_node hash_link; 183 struct hlist_node hash_link;
182 184
185 struct list_head oos_link;
186
183 /* 187 /*
184 * The following two entries are used to key the shadow page in the 188 * The following two entries are used to key the shadow page in the
185 * hash table. 189 * hash table.
@@ -190,13 +194,16 @@ struct kvm_mmu_page {
190 u64 *spt; 194 u64 *spt;
191 /* hold the gfn of each spte inside spt */ 195 /* hold the gfn of each spte inside spt */
192 gfn_t *gfns; 196 gfn_t *gfns;
193 unsigned long slot_bitmap; /* One bit set per slot which has memory 197 /*
194 * in this shadow page. 198 * One bit set per slot which has memory
195 */ 199 * in this shadow page.
200 */
201 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
196 int multimapped; /* More than one parent_pte? */ 202 int multimapped; /* More than one parent_pte? */
197 int root_count; /* Currently serving as active root */ 203 int root_count; /* Currently serving as active root */
198 bool unsync; 204 bool unsync;
199 bool unsync_children; 205 bool global;
206 unsigned int unsync_children;
200 union { 207 union {
201 u64 *parent_pte; /* !multimapped */ 208 u64 *parent_pte; /* !multimapped */
202 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ 209 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -327,8 +334,10 @@ struct kvm_vcpu_arch {
327 334
328 bool nmi_pending; 335 bool nmi_pending;
329 bool nmi_injected; 336 bool nmi_injected;
337 bool nmi_window_open;
330 338
331 u64 mtrr[0x100]; 339 struct mtrr_state_type mtrr_state;
340 u32 pat;
332}; 341};
333 342
334struct kvm_mem_alias { 343struct kvm_mem_alias {
@@ -350,11 +359,13 @@ struct kvm_arch{
350 */ 359 */
351 struct list_head active_mmu_pages; 360 struct list_head active_mmu_pages;
352 struct list_head assigned_dev_head; 361 struct list_head assigned_dev_head;
362 struct list_head oos_global_pages;
353 struct dmar_domain *intel_iommu_domain; 363 struct dmar_domain *intel_iommu_domain;
354 struct kvm_pic *vpic; 364 struct kvm_pic *vpic;
355 struct kvm_ioapic *vioapic; 365 struct kvm_ioapic *vioapic;
356 struct kvm_pit *vpit; 366 struct kvm_pit *vpit;
357 struct hlist_head irq_ack_notifier_list; 367 struct hlist_head irq_ack_notifier_list;
368 int vapics_in_nmi_mode;
358 369
359 int round_robin_prev_vcpu; 370 int round_robin_prev_vcpu;
360 unsigned int tss_addr; 371 unsigned int tss_addr;
@@ -378,6 +389,7 @@ struct kvm_vm_stat {
378 u32 mmu_recycled; 389 u32 mmu_recycled;
379 u32 mmu_cache_miss; 390 u32 mmu_cache_miss;
380 u32 mmu_unsync; 391 u32 mmu_unsync;
392 u32 mmu_unsync_global;
381 u32 remote_tlb_flush; 393 u32 remote_tlb_flush;
382 u32 lpages; 394 u32 lpages;
383}; 395};
@@ -397,6 +409,7 @@ struct kvm_vcpu_stat {
397 u32 halt_exits; 409 u32 halt_exits;
398 u32 halt_wakeup; 410 u32 halt_wakeup;
399 u32 request_irq_exits; 411 u32 request_irq_exits;
412 u32 request_nmi_exits;
400 u32 irq_exits; 413 u32 irq_exits;
401 u32 host_state_reload; 414 u32 host_state_reload;
402 u32 efer_reload; 415 u32 efer_reload;
@@ -405,6 +418,7 @@ struct kvm_vcpu_stat {
405 u32 insn_emulation_fail; 418 u32 insn_emulation_fail;
406 u32 hypercalls; 419 u32 hypercalls;
407 u32 irq_injections; 420 u32 irq_injections;
421 u32 nmi_injections;
408}; 422};
409 423
410struct descriptor_table { 424struct descriptor_table {
@@ -477,6 +491,7 @@ struct kvm_x86_ops {
477 491
478 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 492 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
479 int (*get_tdp_level)(void); 493 int (*get_tdp_level)(void);
494 int (*get_mt_mask_shift)(void);
480}; 495};
481 496
482extern struct kvm_x86_ops *kvm_x86_ops; 497extern struct kvm_x86_ops *kvm_x86_ops;
@@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
490void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 505void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
491void kvm_mmu_set_base_ptes(u64 base_pte); 506void kvm_mmu_set_base_ptes(u64 base_pte);
492void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 507void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
493 u64 dirty_mask, u64 nx_mask, u64 x_mask); 508 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
494 509
495int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 510int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
496void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 511void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector);
587 602
588void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 603void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
589void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 604void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
590 const u8 *new, int bytes); 605 const u8 *new, int bytes,
606 bool guest_initiated);
591int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 607int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
592void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 608void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
593int kvm_mmu_load(struct kvm_vcpu *vcpu); 609int kvm_mmu_load(struct kvm_vcpu *vcpu);
594void kvm_mmu_unload(struct kvm_vcpu *vcpu); 610void kvm_mmu_unload(struct kvm_vcpu *vcpu);
595void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 611void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
612void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
596 613
597int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 614int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
598 615
@@ -607,6 +624,8 @@ void kvm_disable_tdp(void);
607int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 624int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
608int complete_pio(struct kvm_vcpu *vcpu); 625int complete_pio(struct kvm_vcpu *vcpu);
609 626
627struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
628
610static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 629static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
611{ 630{
612 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); 631 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
702 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 721 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
703} 722}
704 723
705#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
706#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
707#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
708#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
709#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
710#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
711#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
712#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
713#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
714#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
715#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
716
717#define MSR_IA32_TIME_STAMP_COUNTER 0x010 724#define MSR_IA32_TIME_STAMP_COUNTER 0x010
718 725
719#define TSS_IOPB_BASE_OFFSET 0x66 726#define TSS_IOPB_BASE_OFFSET 0x66
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 25179a29f208..6a159732881a 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -123,6 +123,7 @@ struct decode_cache {
123 u8 ad_bytes; 123 u8 ad_bytes;
124 u8 rex_prefix; 124 u8 rex_prefix;
125 struct operand src; 125 struct operand src;
126 struct operand src2;
126 struct operand dst; 127 struct operand dst;
127 bool has_seg_override; 128 bool has_seg_override;
128 u8 seg_override; 129 u8 seg_override;
@@ -146,22 +147,18 @@ struct x86_emulate_ctxt {
146 /* Register state before/after emulation. */ 147 /* Register state before/after emulation. */
147 struct kvm_vcpu *vcpu; 148 struct kvm_vcpu *vcpu;
148 149
149 /* Linear faulting address (if emulating a page-faulting instruction) */
150 unsigned long eflags; 150 unsigned long eflags;
151
152 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 151 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
153 int mode; 152 int mode;
154
155 u32 cs_base; 153 u32 cs_base;
156 154
157 /* decode cache */ 155 /* decode cache */
158
159 struct decode_cache decode; 156 struct decode_cache decode;
160}; 157};
161 158
162/* Repeat String Operation Prefix */ 159/* Repeat String Operation Prefix */
163#define REPE_PREFIX 1 160#define REPE_PREFIX 1
164#define REPNE_PREFIX 2 161#define REPNE_PREFIX 2
165 162
166/* Execution mode, passed to the emulator. */ 163/* Execution mode, passed to the emulator. */
167#define X86EMUL_MODE_REAL 0 /* Real mode. */ 164#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -170,7 +167,7 @@ struct x86_emulate_ctxt {
170#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 167#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
171 168
172/* Host execution mode. */ 169/* Host execution mode. */
173#if defined(__i386__) 170#if defined(CONFIG_X86_32)
174#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 171#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
175#elif defined(CONFIG_X86_64) 172#elif defined(CONFIG_X86_64)
176#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 173#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 7c1e4258b31e..cb988aab716d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -57,6 +57,31 @@ struct mtrr_gentry {
57}; 57};
58#endif /* !__i386__ */ 58#endif /* !__i386__ */
59 59
60struct mtrr_var_range {
61 u32 base_lo;
62 u32 base_hi;
63 u32 mask_lo;
64 u32 mask_hi;
65};
66
67/* In the Intel processor's MTRR interface, the MTRR type is always held in
68 an 8 bit field: */
69typedef u8 mtrr_type;
70
71#define MTRR_NUM_FIXED_RANGES 88
72#define MTRR_MAX_VAR_RANGES 256
73
74struct mtrr_state_type {
75 struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
76 mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
77 unsigned char enabled;
78 unsigned char have_fixed;
79 mtrr_type def_type;
80};
81
82#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
83#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
84
60/* These are the various ioctls */ 85/* These are the various ioctls */
61#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) 86#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
62#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) 87#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e869..1b8afa78e869 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/include/asm/svm.h
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644
index 000000000000..593636275238
--- /dev/null
+++ b/arch/x86/include/asm/virtext.h
@@ -0,0 +1,132 @@
1/* CPU virtualization extensions handling
2 *
3 * This should carry the code for handling CPU virtualization extensions
4 * that needs to live in the kernel core.
5 *
6 * Author: Eduardo Habkost <ehabkost@redhat.com>
7 *
8 * Copyright (C) 2008, Red Hat Inc.
9 *
10 * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 */
15#ifndef _ASM_X86_VIRTEX_H
16#define _ASM_X86_VIRTEX_H
17
18#include <asm/processor.h>
19#include <asm/system.h>
20
21#include <asm/vmx.h>
22#include <asm/svm.h>
23
24/*
25 * VMX functions:
26 */
27
28static inline int cpu_has_vmx(void)
29{
30 unsigned long ecx = cpuid_ecx(1);
31 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
32}
33
34
35/** Disable VMX on the current CPU
36 *
37 * vmxoff causes an undefined-opcode exception if vmxon was not run
38 * on the CPU previously. Only call this function if you know VMX
39 * is enabled.
40 */
41static inline void cpu_vmxoff(void)
42{
43 asm volatile (ASM_VMX_VMXOFF : : : "cc");
44 write_cr4(read_cr4() & ~X86_CR4_VMXE);
45}
46
47static inline int cpu_vmx_enabled(void)
48{
49 return read_cr4() & X86_CR4_VMXE;
50}
51
52/** Disable VMX if it is enabled on the current CPU
53 *
54 * You shouldn't call this if cpu_has_vmx() returns 0.
55 */
56static inline void __cpu_emergency_vmxoff(void)
57{
58 if (cpu_vmx_enabled())
59 cpu_vmxoff();
60}
61
62/** Disable VMX if it is supported and enabled on the current CPU
63 */
64static inline void cpu_emergency_vmxoff(void)
65{
66 if (cpu_has_vmx())
67 __cpu_emergency_vmxoff();
68}
69
70
71
72
73/*
74 * SVM functions:
75 */
76
77/** Check if the CPU has SVM support
78 *
79 * You can use the 'msg' arg to get a message describing the problem,
80 * if the function returns zero. Simply pass NULL if you are not interested
81 * in the messages; gcc should take care of not generating code for
82 * the messages in this case.
83 */
84static inline int cpu_has_svm(const char **msg)
85{
86 uint32_t eax, ebx, ecx, edx;
87
88 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
89 if (msg)
90 *msg = "not amd";
91 return 0;
92 }
93
94 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
95 if (eax < SVM_CPUID_FUNC) {
96 if (msg)
97 *msg = "can't execute cpuid_8000000a";
98 return 0;
99 }
100
101 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
102 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
103 if (msg)
104 *msg = "svm not available";
105 return 0;
106 }
107 return 1;
108}
109
110
111/** Disable SVM on the current CPU
112 *
113 * You should call this only if cpu_has_svm() returned true.
114 */
115static inline void cpu_svm_disable(void)
116{
117 uint64_t efer;
118
119 wrmsrl(MSR_VM_HSAVE_PA, 0);
120 rdmsrl(MSR_EFER, efer);
121 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
122}
123
124/** Makes sure SVM is disabled, if it is supported on the CPU
125 */
126static inline void cpu_emergency_svm_disable(void)
127{
128 if (cpu_has_svm(NULL))
129 cpu_svm_disable();
130}
131
132#endif /* _ASM_X86_VIRTEX_H */
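
All of these helpers are designed so that emergency code can call them unconditionally: the cpu_emergency_*() wrappers probe for the extension first and only then touch CR4.VMXE or EFER.SVME. A minimal sketch of the intended call sequence on a CPU that is about to kexec or reset (illustrative; the real call sites added by this series are the crash and reboot paths further below):

#include <asm/virtext.h>

/* Illustrative only: both helpers are safe to call whether or not the
 * extension is present; they check before touching any control register. */
static void disable_virt_on_this_cpu(void)
{
	cpu_emergency_vmxoff();		/* VMXOFF + clear CR4.VMXE only if enabled */
	cpu_emergency_svm_disable();	/* clear EFER.SVME only if SVM is present */
}
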
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h
index ec5edc339da6..d0238e6151d8 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -63,10 +63,13 @@
63 63
64#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 64#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
65#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 65#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
66#define VM_EXIT_SAVE_IA32_PAT 0x00040000
67#define VM_EXIT_LOAD_IA32_PAT 0x00080000
66 68
67#define VM_ENTRY_IA32E_MODE 0x00000200 69#define VM_ENTRY_IA32E_MODE 0x00000200
68#define VM_ENTRY_SMM 0x00000400 70#define VM_ENTRY_SMM 0x00000400
69#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 71#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
72#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
70 73
71/* VMCS Encodings */ 74/* VMCS Encodings */
72enum vmcs_field { 75enum vmcs_field {
@@ -112,6 +115,8 @@ enum vmcs_field {
112 VMCS_LINK_POINTER_HIGH = 0x00002801, 115 VMCS_LINK_POINTER_HIGH = 0x00002801,
113 GUEST_IA32_DEBUGCTL = 0x00002802, 116 GUEST_IA32_DEBUGCTL = 0x00002802,
114 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, 117 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
118 GUEST_IA32_PAT = 0x00002804,
119 GUEST_IA32_PAT_HIGH = 0x00002805,
115 GUEST_PDPTR0 = 0x0000280a, 120 GUEST_PDPTR0 = 0x0000280a,
116 GUEST_PDPTR0_HIGH = 0x0000280b, 121 GUEST_PDPTR0_HIGH = 0x0000280b,
117 GUEST_PDPTR1 = 0x0000280c, 122 GUEST_PDPTR1 = 0x0000280c,
@@ -120,6 +125,8 @@ enum vmcs_field {
120 GUEST_PDPTR2_HIGH = 0x0000280f, 125 GUEST_PDPTR2_HIGH = 0x0000280f,
121 GUEST_PDPTR3 = 0x00002810, 126 GUEST_PDPTR3 = 0x00002810,
122 GUEST_PDPTR3_HIGH = 0x00002811, 127 GUEST_PDPTR3_HIGH = 0x00002811,
128 HOST_IA32_PAT = 0x00002c00,
129 HOST_IA32_PAT_HIGH = 0x00002c01,
123 PIN_BASED_VM_EXEC_CONTROL = 0x00004000, 130 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
124 CPU_BASED_VM_EXEC_CONTROL = 0x00004002, 131 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
125 EXCEPTION_BITMAP = 0x00004004, 132 EXCEPTION_BITMAP = 0x00004004,
@@ -331,8 +338,9 @@ enum vmcs_field {
331 338
332#define AR_RESERVD_MASK 0xfffe0f00 339#define AR_RESERVD_MASK 0xfffe0f00
333 340
334#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 341#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
335#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 342#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
343#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
336 344
337#define VMX_NR_VPIDS (1 << 16) 345#define VMX_NR_VPIDS (1 << 16)
338#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 346#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
@@ -356,4 +364,19 @@ enum vmcs_field {
356 364
357#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 365#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
358 366
367
368#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
369#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
370#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
371#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
372#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
373#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
374#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
375#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
376#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
377#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
378#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
379
380
381
359#endif 382#endif
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01eeb..b59ddcc88cd8 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
14#include <asm/pat.h> 14#include <asm/pat.h>
15#include "mtrr.h" 15#include "mtrr.h"
16 16
17struct mtrr_state {
18 struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
19 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
20 unsigned char enabled;
21 unsigned char have_fixed;
22 mtrr_type def_type;
23};
24
25struct fixed_range_block { 17struct fixed_range_block {
26 int base_msr; /* start address of an MTRR block */ 18 int base_msr; /* start address of an MTRR block */
27 int ranges; /* number of MTRRs in this block */ 19 int ranges; /* number of MTRRs in this block */
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = {
35}; 27};
36 28
37static unsigned long smp_changes_mask; 29static unsigned long smp_changes_mask;
38static struct mtrr_state mtrr_state = {};
39static int mtrr_state_set; 30static int mtrr_state_set;
40u64 mtrr_tom2; 31u64 mtrr_tom2;
41 32
33struct mtrr_state_type mtrr_state = {};
34EXPORT_SYMBOL_GPL(mtrr_state);
35
42#undef MODULE_PARAM_PREFIX 36#undef MODULE_PARAM_PREFIX
43#define MODULE_PARAM_PREFIX "mtrr." 37#define MODULE_PARAM_PREFIX "mtrr."
44 38
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 1159e269e596..d6ec7ec30274 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
49 49
50u32 num_var_ranges = 0; 50u32 num_var_ranges = 0;
51 51
52unsigned int mtrr_usage_table[MAX_VAR_RANGES]; 52unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
53static DEFINE_MUTEX(mtrr_mutex); 53static DEFINE_MUTEX(mtrr_mutex);
54 54
55u64 size_or_mask, size_and_mask; 55u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
574 unsigned long lsize; 574 unsigned long lsize;
575}; 575};
576 576
577static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; 577static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
578 578
579static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 579static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
580{ 580{
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b23..ffd60409cc6d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,11 +8,6 @@
8#define MTRRcap_MSR 0x0fe 8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff 9#define MTRRdefType_MSR 0x2ff
10 10
11#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
12#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
13
14#define NUM_FIXED_RANGES 88
15#define MAX_VAR_RANGES 256
16#define MTRRfix64K_00000_MSR 0x250 11#define MTRRfix64K_00000_MSR 0x250
17#define MTRRfix16K_80000_MSR 0x258 12#define MTRRfix16K_80000_MSR 0x258
18#define MTRRfix16K_A0000_MSR 0x259 13#define MTRRfix16K_A0000_MSR 0x259
@@ -29,11 +24,7 @@
29#define MTRR_CHANGE_MASK_VARIABLE 0x02 24#define MTRR_CHANGE_MASK_VARIABLE 0x02
30#define MTRR_CHANGE_MASK_DEFTYPE 0x04 25#define MTRR_CHANGE_MASK_DEFTYPE 0x04
31 26
32/* In the Intel processor's MTRR interface, the MTRR type is always held in 27extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
33 an 8 bit field: */
34typedef u8 mtrr_type;
35
36extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
37 28
38struct mtrr_ops { 29struct mtrr_ops {
39 u32 vendor; 30 u32 vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
70 u32 ccr3; 61 u32 ccr3;
71}; 62};
72 63
73struct mtrr_var_range {
74 u32 base_lo;
75 u32 base_hi;
76 u32 mask_lo;
77 u32 mask_hi;
78};
79
80void set_mtrr_done(struct set_mtrr_context *ctxt); 64void set_mtrr_done(struct set_mtrr_context *ctxt);
81void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); 65void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
82void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); 66void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index d84a852e4cd7..c689d19e35ab 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,6 +26,7 @@
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/smp.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h>
29 30
30#include <mach_ipi.h> 31#include <mach_ipi.h>
31 32
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
49#endif 50#endif
50 crash_save_cpu(regs, cpu); 51 crash_save_cpu(regs, cpu);
51 52
53 /* Disable VMX or SVM if needed.
54 *
55 * We need to disable virtualization on all CPUs.
56 * Having VMX or SVM enabled on any CPU may break rebooting
57 * after the kdump kernel has finished its task.
58 */
59 cpu_emergency_vmxoff();
60 cpu_emergency_svm_disable();
61
52 disable_local_APIC(); 62 disable_local_APIC();
53} 63}
54 64
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
80 local_irq_disable(); 90 local_irq_disable();
81 91
82 kdump_nmi_shootdown_cpus(); 92 kdump_nmi_shootdown_cpus();
93
94 /* Booting kdump kernel with VMX or SVM enabled won't work,
95 * because (among other limitations) we can't disable paging
96 * with the virt flags.
97 */
98 cpu_emergency_vmxoff();
99 cpu_emergency_svm_disable();
100
83 lapic_shutdown(); 101 lapic_shutdown();
84#if defined(CONFIG_X86_IO_APIC) 102#if defined(CONFIG_X86_IO_APIC)
85 disable_IO_APIC(); 103 disable_IO_APIC();
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e169ae9b6a62..652fce6d2cce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
89 */ 89 */
90static unsigned long kvm_get_tsc_khz(void) 90static unsigned long kvm_get_tsc_khz(void)
91{ 91{
92 return preset_lpj; 92 struct pvclock_vcpu_time_info *src;
93 src = &per_cpu(hv_clock, 0);
94 return pvclock_tsc_khz(src);
93} 95}
94 96
95static void kvm_get_preset_lpj(void) 97static void kvm_get_preset_lpj(void)
96{ 98{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz; 99 unsigned long khz;
99 u64 lpj; 100 u64 lpj;
100 101
101 src = &per_cpu(hv_clock, 0); 102 khz = kvm_get_tsc_khz();
102 khz = pvclock_tsc_khz(src);
103 103
104 lpj = ((u64)khz * 1000); 104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ); 105 do_div(lpj, HZ);
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
194#endif 194#endif
195 kvm_get_preset_lpj(); 195 kvm_get_preset_lpj();
196 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
197 pv_info.paravirt_enabled = 1;
198 pv_info.name = "KVM";
197 } 199 }
198} 200}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 61f718df6eec..72e0e4e712d6 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/reboot.h> 14#include <asm/reboot.h>
15#include <asm/virtext.h>
15 16
16#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
17# include <linux/dmi.h> 18# include <linux/dmi.h>
@@ -39,6 +40,12 @@ int reboot_force;
39static int reboot_cpu = -1; 40static int reboot_cpu = -1;
40#endif 41#endif
41 42
43/* This is set if we need to go through the 'emergency' path.
44 * When machine_emergency_restart() is called, we may be in
45 * an inconsistent state and won't be able to do a clean cleanup
46 */
47static int reboot_emergency;
48
42/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ 49/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
43bool port_cf9_safe = false; 50bool port_cf9_safe = false;
44 51
@@ -368,6 +375,48 @@ static inline void kb_wait(void)
368 } 375 }
369} 376}
370 377
378static void vmxoff_nmi(int cpu, struct die_args *args)
379{
380 cpu_emergency_vmxoff();
381}
382
383/* Use NMIs as IPIs to tell all CPUs to disable virtualization
384 */
385static void emergency_vmx_disable_all(void)
386{
387 /* Just make sure we won't change CPUs while doing this */
388 local_irq_disable();
389
390 /* We need to disable VMX on all CPUs before rebooting, otherwise
391 * we risk hanging up the machine, because the CPU ignores INIT
392 * signals when VMX is enabled.
393 *
394 * We can't take any locks and we may be in an inconsistent
395 * state, so we use NMIs as IPIs to tell the other CPUs to disable
396 * VMX and halt.
397 *
398 * For safety, we will avoid running the nmi_shootdown_cpus()
399 * stuff unnecessarily, but we don't have a way to check
400 * if other CPUs have VMX enabled. So we will call it only if the
401 * CPU we are running on has VMX enabled.
402 *
403 * We will miss cases where VMX is not enabled on all CPUs. This
404 * shouldn't do much harm because KVM always enables VMX on all
405 * CPUs anyway. But we can miss it in the small window where KVM
406 * is still enabling VMX.
407 */
408 if (cpu_has_vmx() && cpu_vmx_enabled()) {
409 /* Disable VMX on this CPU.
410 */
411 cpu_vmxoff();
412
413 /* Halt and disable VMX on the other CPUs */
414 nmi_shootdown_cpus(vmxoff_nmi);
415
416 }
417}
418
419
371void __attribute__((weak)) mach_reboot_fixups(void) 420void __attribute__((weak)) mach_reboot_fixups(void)
372{ 421{
373} 422}
@@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void)
376{ 425{
377 int i; 426 int i;
378 427
428 if (reboot_emergency)
429 emergency_vmx_disable_all();
430
379 /* Tell the BIOS if we want cold or warm reboot */ 431 /* Tell the BIOS if we want cold or warm reboot */
380 *((unsigned short *)__va(0x472)) = reboot_mode; 432 *((unsigned short *)__va(0x472)) = reboot_mode;
381 433
@@ -482,13 +534,19 @@ void native_machine_shutdown(void)
482#endif 534#endif
483} 535}
484 536
537static void __machine_emergency_restart(int emergency)
538{
539 reboot_emergency = emergency;
540 machine_ops.emergency_restart();
541}
542
485static void native_machine_restart(char *__unused) 543static void native_machine_restart(char *__unused)
486{ 544{
487 printk("machine restart\n"); 545 printk("machine restart\n");
488 546
489 if (!reboot_force) 547 if (!reboot_force)
490 machine_shutdown(); 548 machine_shutdown();
491 machine_emergency_restart(); 549 __machine_emergency_restart(0);
492} 550}
493 551
494static void native_machine_halt(void) 552static void native_machine_halt(void)
@@ -532,7 +590,7 @@ void machine_shutdown(void)
532 590
533void machine_emergency_restart(void) 591void machine_emergency_restart(void)
534{ 592{
535 machine_ops.emergency_restart(); 593 __machine_emergency_restart(1);
536} 594}
537 595
538void machine_restart(char *cmd) 596void machine_restart(char *cmd)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59ebd37ad79e..e665d1c623ca 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm)
603 603
604static void __inject_pit_timer_intr(struct kvm *kvm) 604static void __inject_pit_timer_intr(struct kvm *kvm)
605{ 605{
606 struct kvm_vcpu *vcpu;
607 int i;
608
606 mutex_lock(&kvm->lock); 609 mutex_lock(&kvm->lock);
607 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 610 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
608 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 611 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
609 mutex_unlock(&kvm->lock); 612 mutex_unlock(&kvm->lock);
613
614 /*
615 * Provides NMI watchdog support via Virtual Wire mode.
616 * The route is: PIT -> PIC -> LVT0 in NMI mode.
617 *
618 * Note: Our Virtual Wire implementation is simplified, only
619 * propagating PIT interrupts to all VCPUs when they have set
620 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
621 * VCPU0, and only if its LVT0 is in EXTINT mode.
622 */
623 if (kvm->arch.vapics_in_nmi_mode > 0)
624 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
625 vcpu = kvm->vcpus[i];
626 if (vcpu)
627 kvm_apic_nmi_wd_deliver(vcpu);
628 }
610} 629}
611 630
612void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) 631void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 17e41e165f1a..179dcb0103fd 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,10 +26,40 @@
26 * Port from Qemu. 26 * Port from Qemu.
27 */ 27 */
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/bitops.h>
29#include "irq.h" 30#include "irq.h"
30 31
31#include <linux/kvm_host.h> 32#include <linux/kvm_host.h>
32 33
34static void pic_lock(struct kvm_pic *s)
35{
36 spin_lock(&s->lock);
37}
38
39static void pic_unlock(struct kvm_pic *s)
40{
41 struct kvm *kvm = s->kvm;
42 unsigned acks = s->pending_acks;
43 bool wakeup = s->wakeup_needed;
44 struct kvm_vcpu *vcpu;
45
46 s->pending_acks = 0;
47 s->wakeup_needed = false;
48
49 spin_unlock(&s->lock);
50
51 while (acks) {
52 kvm_notify_acked_irq(kvm, __ffs(acks));
53 acks &= acks - 1;
54 }
55
56 if (wakeup) {
57 vcpu = s->kvm->vcpus[0];
58 if (vcpu)
59 kvm_vcpu_kick(vcpu);
60 }
61}
62
33static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 63static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
34{ 64{
35 s->isr &= ~(1 << irq); 65 s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
136 166
137void kvm_pic_update_irq(struct kvm_pic *s) 167void kvm_pic_update_irq(struct kvm_pic *s)
138{ 168{
169 pic_lock(s);
139 pic_update_irq(s); 170 pic_update_irq(s);
171 pic_unlock(s);
140} 172}
141 173
142void kvm_pic_set_irq(void *opaque, int irq, int level) 174void kvm_pic_set_irq(void *opaque, int irq, int level)
143{ 175{
144 struct kvm_pic *s = opaque; 176 struct kvm_pic *s = opaque;
145 177
178 pic_lock(s);
146 if (irq >= 0 && irq < PIC_NUM_PINS) { 179 if (irq >= 0 && irq < PIC_NUM_PINS) {
147 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 180 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
148 pic_update_irq(s); 181 pic_update_irq(s);
149 } 182 }
183 pic_unlock(s);
150} 184}
151 185
152/* 186/*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
172 int irq, irq2, intno; 206 int irq, irq2, intno;
173 struct kvm_pic *s = pic_irqchip(kvm); 207 struct kvm_pic *s = pic_irqchip(kvm);
174 208
209 pic_lock(s);
175 irq = pic_get_irq(&s->pics[0]); 210 irq = pic_get_irq(&s->pics[0]);
176 if (irq >= 0) { 211 if (irq >= 0) {
177 pic_intack(&s->pics[0], irq); 212 pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
196 intno = s->pics[0].irq_base + irq; 231 intno = s->pics[0].irq_base + irq;
197 } 232 }
198 pic_update_irq(s); 233 pic_update_irq(s);
234 pic_unlock(s);
199 kvm_notify_acked_irq(kvm, irq); 235 kvm_notify_acked_irq(kvm, irq);
200 236
201 return intno; 237 return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
203 239
204void kvm_pic_reset(struct kvm_kpic_state *s) 240void kvm_pic_reset(struct kvm_kpic_state *s)
205{ 241{
206 int irq, irqbase; 242 int irq, irqbase, n;
207 struct kvm *kvm = s->pics_state->irq_request_opaque; 243 struct kvm *kvm = s->pics_state->irq_request_opaque;
208 struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; 244 struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
209 245
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
214 250
215 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { 251 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
216 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) 252 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
217 if (s->irr & (1 << irq) || s->isr & (1 << irq)) 253 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
218 kvm_notify_acked_irq(kvm, irq+irqbase); 254 n = irq + irqbase;
255 s->pics_state->pending_acks |= 1 << n;
256 }
219 } 257 }
220 s->last_irr = 0; 258 s->last_irr = 0;
221 s->irr = 0; 259 s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
406 printk(KERN_ERR "PIC: non byte write\n"); 444 printk(KERN_ERR "PIC: non byte write\n");
407 return; 445 return;
408 } 446 }
447 pic_lock(s);
409 switch (addr) { 448 switch (addr) {
410 case 0x20: 449 case 0x20:
411 case 0x21: 450 case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
418 elcr_ioport_write(&s->pics[addr & 1], addr, data); 457 elcr_ioport_write(&s->pics[addr & 1], addr, data);
419 break; 458 break;
420 } 459 }
460 pic_unlock(s);
421} 461}
422 462
423static void picdev_read(struct kvm_io_device *this, 463static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
431 printk(KERN_ERR "PIC: non byte read\n"); 471 printk(KERN_ERR "PIC: non byte read\n");
432 return; 472 return;
433 } 473 }
474 pic_lock(s);
434 switch (addr) { 475 switch (addr) {
435 case 0x20: 476 case 0x20:
436 case 0x21: 477 case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
444 break; 485 break;
445 } 486 }
446 *(unsigned char *)val = data; 487 *(unsigned char *)val = data;
488 pic_unlock(s);
447} 489}
448 490
449/* 491/*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
459 s->output = level; 501 s->output = level;
460 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 502 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
461 s->pics[0].isr_ack &= ~(1 << irq); 503 s->pics[0].isr_ack &= ~(1 << irq);
462 kvm_vcpu_kick(vcpu); 504 s->wakeup_needed = true;
463 } 505 }
464} 506}
465 507
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
469 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 511 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
470 if (!s) 512 if (!s)
471 return NULL; 513 return NULL;
514 spin_lock_init(&s->lock);
515 s->kvm = kvm;
472 s->pics[0].elcr_mask = 0xf8; 516 s->pics[0].elcr_mask = 0xf8;
473 s->pics[1].elcr_mask = 0xde; 517 s->pics[1].elcr_mask = 0xde;
474 s->irq_request = pic_irq_request; 518 s->irq_request = pic_irq_request;
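The i8259.c hunks above put a new per-PIC spinlock around every register access and defer both the IRQ-ack notifications and the vcpu wakeup to pic_unlock(), so neither callback ever runs with the lock held. A minimal sketch of that record-under-the-lock, act-after-unlock pattern, assuming kernel headers; pic_state, notify_acked() and wake_vcpu() are illustrative stand-ins for kvm_pic, kvm_notify_acked_irq() and kvm_vcpu_kick(), not real API:

#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/types.h>

void notify_acked(int irq);             /* hypothetical ack callback */
void wake_vcpu(void);                   /* hypothetical wakeup callback */

struct pic_state {
        spinlock_t lock;
        unsigned int pending_acks;      /* bitmask of IRQs to ack once unlocked */
        bool wakeup_needed;
};

static void pic_state_unlock(struct pic_state *s)
{
        unsigned int acks = s->pending_acks;    /* snapshot while still locked */
        bool wakeup = s->wakeup_needed;

        s->pending_acks = 0;
        s->wakeup_needed = false;
        spin_unlock(&s->lock);

        /* The callbacks may take other locks, so run them lock-free. */
        while (acks) {
                notify_acked(__ffs(acks));
                acks &= acks - 1;               /* clear the lowest set bit */
        }
        if (wakeup)
                wake_vcpu();
}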
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f17c8f5bbf31..2bf32a03ceec 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -25,6 +25,7 @@
25#include <linux/mm_types.h> 25#include <linux/mm_types.h>
26#include <linux/hrtimer.h> 26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
28#include <linux/spinlock.h>
28 29
29#include "iodev.h" 30#include "iodev.h"
30#include "ioapic.h" 31#include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
59}; 60};
60 61
61struct kvm_pic { 62struct kvm_pic {
63 spinlock_t lock;
64 bool wakeup_needed;
65 unsigned pending_acks;
66 struct kvm *kvm;
62 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
63 irq_request_func *irq_request; 68 irq_request_func *irq_request;
64 void *irq_request_opaque; 69 void *irq_request_opaque;
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
87void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); 92void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
88void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); 93void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
89void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); 94void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
95void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
90void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); 96void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
91void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); 97void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
92void __kvm_migrate_timers(struct kvm_vcpu *vcpu); 98void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 65ef0fc2c036..8e5ee99551f6 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -7,7 +7,7 @@
7#include <linux/kvm_host.h> 7#include <linux/kvm_host.h>
8#include <asm/msr.h> 8#include <asm/msr.h>
9 9
10#include "svm.h" 10#include <asm/svm.h>
11 11
12static const u32 host_save_user_msrs[] = { 12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64 13#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fc3cab48943..afac68c0815c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
130 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; 130 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
131} 131}
132 132
133static inline int apic_lvt_nmi_mode(u32 lvt_val)
134{
135 return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
136}
137
133static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { 138static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
134 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ 139 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
135 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 140 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
354 359
355 case APIC_DM_NMI: 360 case APIC_DM_NMI:
356 kvm_inject_nmi(vcpu); 361 kvm_inject_nmi(vcpu);
362 kvm_vcpu_kick(vcpu);
357 break; 363 break;
358 364
359 case APIC_DM_INIT: 365 case APIC_DM_INIT:
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
380 } 386 }
381 break; 387 break;
382 388
389 case APIC_DM_EXTINT:
390 /*
391 * Should only be called by kvm_apic_local_deliver() with LVT0,
392 * before NMI watchdog was enabled. Already handled by
393 * kvm_apic_accept_pic_intr().
394 */
395 break;
396
383 default: 397 default:
384 printk(KERN_ERR "TODO: unsupported delivery mode %x\n", 398 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
385 delivery_mode); 399 delivery_mode);
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
663 apic->timer.period))); 677 apic->timer.period)));
664} 678}
665 679
680static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
681{
682 int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
683
684 if (apic_lvt_nmi_mode(lvt0_val)) {
685 if (!nmi_wd_enabled) {
686 apic_debug("Receive NMI setting on APIC_LVT0 "
687 "for cpu %d\n", apic->vcpu->vcpu_id);
688 apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
689 }
690 } else if (nmi_wd_enabled)
691 apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
692}
693
666static void apic_mmio_write(struct kvm_io_device *this, 694static void apic_mmio_write(struct kvm_io_device *this,
667 gpa_t address, int len, const void *data) 695 gpa_t address, int len, const void *data)
668{ 696{
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this,
743 apic_set_reg(apic, APIC_ICR2, val & 0xff000000); 771 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
744 break; 772 break;
745 773
774 case APIC_LVT0:
775 apic_manage_nmi_watchdog(apic, val);
746 case APIC_LVTT: 776 case APIC_LVTT:
747 case APIC_LVTTHMR: 777 case APIC_LVTTHMR:
748 case APIC_LVTPC: 778 case APIC_LVTPC:
749 case APIC_LVT0:
750 case APIC_LVT1: 779 case APIC_LVT1:
751 case APIC_LVTERR: 780 case APIC_LVTERR:
752 /* TODO: Check vector */ 781 /* TODO: Check vector */
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
961 return 0; 990 return 0;
962} 991}
963 992
964static int __inject_apic_timer_irq(struct kvm_lapic *apic) 993static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
994{
995 u32 reg = apic_get_reg(apic, lvt_type);
996 int vector, mode, trig_mode;
997
998 if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
999 vector = reg & APIC_VECTOR_MASK;
1000 mode = reg & APIC_MODE_MASK;
1001 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
1002 return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
1003 }
1004 return 0;
1005}
1006
1007void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
965{ 1008{
966 int vector; 1009 struct kvm_lapic *apic = vcpu->arch.apic;
967 1010
968 vector = apic_lvt_vector(apic, APIC_LVTT); 1011 if (apic)
969 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); 1012 kvm_apic_local_deliver(apic, APIC_LVT0);
970} 1013}
971 1014
972static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) 1015static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1061{ 1104{
1062 struct kvm_lapic *apic = vcpu->arch.apic; 1105 struct kvm_lapic *apic = vcpu->arch.apic;
1063 1106
1064 if (apic && apic_lvt_enabled(apic, APIC_LVTT) && 1107 if (apic && atomic_read(&apic->timer.pending) > 0) {
1065 atomic_read(&apic->timer.pending) > 0) { 1108 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1066 if (__inject_apic_timer_irq(apic))
1067 atomic_dec(&apic->timer.pending); 1109 atomic_dec(&apic->timer.pending);
1068 } 1110 }
1069} 1111}
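The new apic_lvt_nmi_mode()/apic_manage_nmi_watchdog() pair above maintains a per-VM count (arch.vapics_in_nmi_mode) of vcpus whose LVT0 entry is programmed as an unmasked NMI source. The predicate is a single mask-and-compare on the LVT register: the delivery-mode field must read NMI and the mask bit must be clear. A stand-alone sketch; the field values are written out here per the local APIC register layout rather than taken from kernel headers:

#include <stdint.h>
#include <stdio.h>

#define LVT_DELIVERY_MODE       0x00000700u     /* bits 10:8 */
#define LVT_MASKED              0x00010000u     /* bit 16 */
#define DELIVERY_MODE_NMI       0x00000400u     /* 100b in bits 10:8 */

/* True when the LVT entry delivers NMI and is not masked. */
static int lvt_is_nmi_watchdog(uint32_t lvt)
{
        return (lvt & (LVT_DELIVERY_MODE | LVT_MASKED)) == DELIVERY_MODE_NMI;
}

int main(void)
{
        printf("%d\n", lvt_is_nmi_watchdog(0x00400));   /* 1: NMI, unmasked */
        printf("%d\n", lvt_is_nmi_watchdog(0x10400));   /* 0: NMI but masked */
        printf("%d\n", lvt_is_nmi_watchdog(0x000fe));   /* 0: fixed vector 0xfe */
        return 0;
}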
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 410ddbc1aa2e..83f11c7474a1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -17,7 +17,6 @@
17 * 17 *
18 */ 18 */
19 19
20#include "vmx.h"
21#include "mmu.h" 20#include "mmu.h"
22 21
23#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
33#include <asm/page.h> 32#include <asm/page.h>
34#include <asm/cmpxchg.h> 33#include <asm/cmpxchg.h>
35#include <asm/io.h> 34#include <asm/io.h>
35#include <asm/vmx.h>
36 36
37/* 37/*
38 * When setting this variable to true it enables Two-Dimensional-Paging 38 * When setting this variable to true it enables Two-Dimensional-Paging
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
168static u64 __read_mostly shadow_user_mask; 168static u64 __read_mostly shadow_user_mask;
169static u64 __read_mostly shadow_accessed_mask; 169static u64 __read_mostly shadow_accessed_mask;
170static u64 __read_mostly shadow_dirty_mask; 170static u64 __read_mostly shadow_dirty_mask;
171static u64 __read_mostly shadow_mt_mask;
171 172
172void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 173void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
173{ 174{
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
183EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); 184EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
184 185
185void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 186void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
186 u64 dirty_mask, u64 nx_mask, u64 x_mask) 187 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
187{ 188{
188 shadow_user_mask = user_mask; 189 shadow_user_mask = user_mask;
189 shadow_accessed_mask = accessed_mask; 190 shadow_accessed_mask = accessed_mask;
190 shadow_dirty_mask = dirty_mask; 191 shadow_dirty_mask = dirty_mask;
191 shadow_nx_mask = nx_mask; 192 shadow_nx_mask = nx_mask;
192 shadow_x_mask = x_mask; 193 shadow_x_mask = x_mask;
194 shadow_mt_mask = mt_mask;
193} 195}
194EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 196EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
195 197
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
384{ 386{
385 int *write_count; 387 int *write_count;
386 388
387 write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); 389 gfn = unalias_gfn(kvm, gfn);
390 write_count = slot_largepage_idx(gfn,
391 gfn_to_memslot_unaliased(kvm, gfn));
388 *write_count += 1; 392 *write_count += 1;
389} 393}
390 394
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
392{ 396{
393 int *write_count; 397 int *write_count;
394 398
395 write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); 399 gfn = unalias_gfn(kvm, gfn);
400 write_count = slot_largepage_idx(gfn,
401 gfn_to_memslot_unaliased(kvm, gfn));
396 *write_count -= 1; 402 *write_count -= 1;
397 WARN_ON(*write_count < 0); 403 WARN_ON(*write_count < 0);
398} 404}
399 405
400static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) 406static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
401{ 407{
402 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 408 struct kvm_memory_slot *slot;
403 int *largepage_idx; 409 int *largepage_idx;
404 410
411 gfn = unalias_gfn(kvm, gfn);
412 slot = gfn_to_memslot_unaliased(kvm, gfn);
405 if (slot) { 413 if (slot) {
406 largepage_idx = slot_largepage_idx(gfn, slot); 414 largepage_idx = slot_largepage_idx(gfn, slot);
407 return *largepage_idx; 415 return *largepage_idx;
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
613 return NULL; 621 return NULL;
614} 622}
615 623
616static void rmap_write_protect(struct kvm *kvm, u64 gfn) 624static int rmap_write_protect(struct kvm *kvm, u64 gfn)
617{ 625{
618 unsigned long *rmapp; 626 unsigned long *rmapp;
619 u64 *spte; 627 u64 *spte;
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
659 spte = rmap_next(kvm, rmapp, spte); 667 spte = rmap_next(kvm, rmapp, spte);
660 } 668 }
661 669
662 if (write_protected) 670 return write_protected;
663 kvm_flush_remote_tlbs(kvm);
664} 671}
665 672
666static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) 673static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
786 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 793 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
787 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 794 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
788 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 795 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
796 INIT_LIST_HEAD(&sp->oos_link);
789 ASSERT(is_empty_shadow_page(sp->spt)); 797 ASSERT(is_empty_shadow_page(sp->spt));
790 sp->slot_bitmap = 0; 798 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
791 sp->multimapped = 0; 799 sp->multimapped = 0;
800 sp->global = 1;
792 sp->parent_pte = parent_pte; 801 sp->parent_pte = parent_pte;
793 --vcpu->kvm->arch.n_free_mmu_pages; 802 --vcpu->kvm->arch.n_free_mmu_pages;
794 return sp; 803 return sp;
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
900 struct kvm_mmu_page *sp = page_header(__pa(spte)); 909 struct kvm_mmu_page *sp = page_header(__pa(spte));
901 910
902 index = spte - sp->spt; 911 index = spte - sp->spt;
903 __set_bit(index, sp->unsync_child_bitmap); 912 if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
904 sp->unsync_children = 1; 913 sp->unsync_children++;
914 WARN_ON(!sp->unsync_children);
905} 915}
906 916
907static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) 917static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
928 938
929static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 939static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
930{ 940{
931 sp->unsync_children = 1;
932 kvm_mmu_update_parents_unsync(sp); 941 kvm_mmu_update_parents_unsync(sp);
933 return 1; 942 return 1;
934} 943}
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
959{ 968{
960} 969}
961 970
971#define KVM_PAGE_ARRAY_NR 16
972
973struct kvm_mmu_pages {
974 struct mmu_page_and_offset {
975 struct kvm_mmu_page *sp;
976 unsigned int idx;
977 } page[KVM_PAGE_ARRAY_NR];
978 unsigned int nr;
979};
980
962#define for_each_unsync_children(bitmap, idx) \ 981#define for_each_unsync_children(bitmap, idx) \
963 for (idx = find_first_bit(bitmap, 512); \ 982 for (idx = find_first_bit(bitmap, 512); \
964 idx < 512; \ 983 idx < 512; \
965 idx = find_next_bit(bitmap, 512, idx+1)) 984 idx = find_next_bit(bitmap, 512, idx+1))
966 985
967static int mmu_unsync_walk(struct kvm_mmu_page *sp, 986int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
968 struct kvm_unsync_walk *walker) 987 int idx)
969{ 988{
970 int i, ret; 989 int i;
971 990
972 if (!sp->unsync_children) 991 if (sp->unsync)
973 return 0; 992 for (i=0; i < pvec->nr; i++)
993 if (pvec->page[i].sp == sp)
994 return 0;
995
996 pvec->page[pvec->nr].sp = sp;
997 pvec->page[pvec->nr].idx = idx;
998 pvec->nr++;
999 return (pvec->nr == KVM_PAGE_ARRAY_NR);
1000}
1001
1002static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1003 struct kvm_mmu_pages *pvec)
1004{
1005 int i, ret, nr_unsync_leaf = 0;
974 1006
975 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1007 for_each_unsync_children(sp->unsync_child_bitmap, i) {
976 u64 ent = sp->spt[i]; 1008 u64 ent = sp->spt[i];
977 1009
978 if (is_shadow_present_pte(ent)) { 1010 if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
979 struct kvm_mmu_page *child; 1011 struct kvm_mmu_page *child;
980 child = page_header(ent & PT64_BASE_ADDR_MASK); 1012 child = page_header(ent & PT64_BASE_ADDR_MASK);
981 1013
982 if (child->unsync_children) { 1014 if (child->unsync_children) {
983 ret = mmu_unsync_walk(child, walker); 1015 if (mmu_pages_add(pvec, child, i))
984 if (ret) 1016 return -ENOSPC;
1017
1018 ret = __mmu_unsync_walk(child, pvec);
1019 if (!ret)
1020 __clear_bit(i, sp->unsync_child_bitmap);
1021 else if (ret > 0)
1022 nr_unsync_leaf += ret;
1023 else
985 return ret; 1024 return ret;
986 __clear_bit(i, sp->unsync_child_bitmap);
987 } 1025 }
988 1026
989 if (child->unsync) { 1027 if (child->unsync) {
990 ret = walker->entry(child, walker); 1028 nr_unsync_leaf++;
991 __clear_bit(i, sp->unsync_child_bitmap); 1029 if (mmu_pages_add(pvec, child, i))
992 if (ret) 1030 return -ENOSPC;
993 return ret;
994 } 1031 }
995 } 1032 }
996 } 1033 }
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
998 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) 1035 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
999 sp->unsync_children = 0; 1036 sp->unsync_children = 0;
1000 1037
1001 return 0; 1038 return nr_unsync_leaf;
1039}
1040
1041static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1042 struct kvm_mmu_pages *pvec)
1043{
1044 if (!sp->unsync_children)
1045 return 0;
1046
1047 mmu_pages_add(pvec, sp, 0);
1048 return __mmu_unsync_walk(sp, pvec);
1002} 1049}
1003 1050
1004static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 1051static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1021 return NULL; 1068 return NULL;
1022} 1069}
1023 1070
1071static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
1072{
1073 list_del(&sp->oos_link);
1074 --kvm->stat.mmu_unsync_global;
1075}
1076
1024static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1077static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1025{ 1078{
1026 WARN_ON(!sp->unsync); 1079 WARN_ON(!sp->unsync);
1027 sp->unsync = 0; 1080 sp->unsync = 0;
1081 if (sp->global)
1082 kvm_unlink_unsync_global(kvm, sp);
1028 --kvm->stat.mmu_unsync; 1083 --kvm->stat.mmu_unsync;
1029} 1084}
1030 1085
@@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1037 return 1; 1092 return 1;
1038 } 1093 }
1039 1094
1040 rmap_write_protect(vcpu->kvm, sp->gfn); 1095 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1096 kvm_flush_remote_tlbs(vcpu->kvm);
1041 kvm_unlink_unsync_page(vcpu->kvm, sp); 1097 kvm_unlink_unsync_page(vcpu->kvm, sp);
1042 if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1098 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1043 kvm_mmu_zap_page(vcpu->kvm, sp); 1099 kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1048 return 0; 1104 return 0;
1049} 1105}
1050 1106
1051struct sync_walker { 1107struct mmu_page_path {
1052 struct kvm_vcpu *vcpu; 1108 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1053 struct kvm_unsync_walk walker; 1109 unsigned int idx[PT64_ROOT_LEVEL-1];
1054}; 1110};
1055 1111
1056static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) 1112#define for_each_sp(pvec, sp, parents, i) \
1113 for (i = mmu_pages_next(&pvec, &parents, -1), \
1114 sp = pvec.page[i].sp; \
1115 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
1116 i = mmu_pages_next(&pvec, &parents, i))
1117
1118int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
1119 int i)
1057{ 1120{
1058 struct sync_walker *sync_walk = container_of(walk, struct sync_walker, 1121 int n;
1059 walker);
1060 struct kvm_vcpu *vcpu = sync_walk->vcpu;
1061 1122
1062 kvm_sync_page(vcpu, sp); 1123 for (n = i+1; n < pvec->nr; n++) {
1063 return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); 1124 struct kvm_mmu_page *sp = pvec->page[n].sp;
1125
1126 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1127 parents->idx[0] = pvec->page[n].idx;
1128 return n;
1129 }
1130
1131 parents->parent[sp->role.level-2] = sp;
1132 parents->idx[sp->role.level-1] = pvec->page[n].idx;
1133 }
1134
1135 return n;
1064} 1136}
1065 1137
1066static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1138void mmu_pages_clear_parents(struct mmu_page_path *parents)
1067{ 1139{
1068 struct sync_walker walker = { 1140 struct kvm_mmu_page *sp;
1069 .walker = { .entry = mmu_sync_fn, }, 1141 unsigned int level = 0;
1070 .vcpu = vcpu, 1142
1071 }; 1143 do {
1144 unsigned int idx = parents->idx[level];
1145
1146 sp = parents->parent[level];
1147 if (!sp)
1148 return;
1149
1150 --sp->unsync_children;
1151 WARN_ON((int)sp->unsync_children < 0);
1152 __clear_bit(idx, sp->unsync_child_bitmap);
1153 level++;
1154 } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1155}
1156
1157static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1158 struct mmu_page_path *parents,
1159 struct kvm_mmu_pages *pvec)
1160{
1161 parents->parent[parent->role.level-1] = NULL;
1162 pvec->nr = 0;
1163}
1164
1165static void mmu_sync_children(struct kvm_vcpu *vcpu,
1166 struct kvm_mmu_page *parent)
1167{
1168 int i;
1169 struct kvm_mmu_page *sp;
1170 struct mmu_page_path parents;
1171 struct kvm_mmu_pages pages;
1172
1173 kvm_mmu_pages_init(parent, &parents, &pages);
1174 while (mmu_unsync_walk(parent, &pages)) {
1175 int protected = 0;
1072 1176
1073 while (mmu_unsync_walk(sp, &walker.walker)) 1177 for_each_sp(pages, sp, parents, i)
1178 protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1179
1180 if (protected)
1181 kvm_flush_remote_tlbs(vcpu->kvm);
1182
1183 for_each_sp(pages, sp, parents, i) {
1184 kvm_sync_page(vcpu, sp);
1185 mmu_pages_clear_parents(&parents);
1186 }
1074 cond_resched_lock(&vcpu->kvm->mmu_lock); 1187 cond_resched_lock(&vcpu->kvm->mmu_lock);
1188 kvm_mmu_pages_init(parent, &parents, &pages);
1189 }
1075} 1190}
1076 1191
1077static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1192static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1129 sp->role = role; 1244 sp->role = role;
1130 hlist_add_head(&sp->hash_link, bucket); 1245 hlist_add_head(&sp->hash_link, bucket);
1131 if (!metaphysical) { 1246 if (!metaphysical) {
1132 rmap_write_protect(vcpu->kvm, gfn); 1247 if (rmap_write_protect(vcpu->kvm, gfn))
1248 kvm_flush_remote_tlbs(vcpu->kvm);
1133 account_shadowed(vcpu->kvm, gfn); 1249 account_shadowed(vcpu->kvm, gfn);
1134 } 1250 }
1135 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1251 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
1153 if (level == PT32E_ROOT_LEVEL) { 1269 if (level == PT32E_ROOT_LEVEL) {
1154 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1270 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1155 shadow_addr &= PT64_BASE_ADDR_MASK; 1271 shadow_addr &= PT64_BASE_ADDR_MASK;
1272 if (!shadow_addr)
1273 return 1;
1156 --level; 1274 --level;
1157 } 1275 }
1158 1276
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1237 } 1355 }
1238} 1356}
1239 1357
1240struct zap_walker { 1358static int mmu_zap_unsync_children(struct kvm *kvm,
1241 struct kvm_unsync_walk walker; 1359 struct kvm_mmu_page *parent)
1242 struct kvm *kvm;
1243 int zapped;
1244};
1245
1246static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1247{ 1360{
1248 struct zap_walker *zap_walk = container_of(walk, struct zap_walker, 1361 int i, zapped = 0;
1249 walker); 1362 struct mmu_page_path parents;
1250 kvm_mmu_zap_page(zap_walk->kvm, sp); 1363 struct kvm_mmu_pages pages;
1251 zap_walk->zapped = 1;
1252 return 0;
1253}
1254 1364
1255static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) 1365 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1256{
1257 struct zap_walker walker = {
1258 .walker = { .entry = mmu_zap_fn, },
1259 .kvm = kvm,
1260 .zapped = 0,
1261 };
1262
1263 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1264 return 0; 1366 return 0;
1265 mmu_unsync_walk(sp, &walker.walker); 1367
1266 return walker.zapped; 1368 kvm_mmu_pages_init(parent, &parents, &pages);
1369 while (mmu_unsync_walk(parent, &pages)) {
1370 struct kvm_mmu_page *sp;
1371
1372 for_each_sp(pages, sp, parents, i) {
1373 kvm_mmu_zap_page(kvm, sp);
1374 mmu_pages_clear_parents(&parents);
1375 }
1376 zapped += pages.nr;
1377 kvm_mmu_pages_init(parent, &parents, &pages);
1378 }
1379
1380 return zapped;
1267} 1381}
1268 1382
1269static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1383static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1362 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); 1476 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
1363 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1477 struct kvm_mmu_page *sp = page_header(__pa(pte));
1364 1478
1365 __set_bit(slot, &sp->slot_bitmap); 1479 __set_bit(slot, sp->slot_bitmap);
1366} 1480}
1367 1481
1368static void mmu_convert_notrap(struct kvm_mmu_page *sp) 1482static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1393 return page; 1507 return page;
1394} 1508}
1395 1509
1510/*
1511 * The function is based on mtrr_type_lookup() in
1512 * arch/x86/kernel/cpu/mtrr/generic.c
1513 */
1514static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1515 u64 start, u64 end)
1516{
1517 int i;
1518 u64 base, mask;
1519 u8 prev_match, curr_match;
1520 int num_var_ranges = KVM_NR_VAR_MTRR;
1521
1522 if (!mtrr_state->enabled)
1523 return 0xFF;
1524
 1525 /* Make end inclusive, instead of exclusive */
1526 end--;
1527
1528 /* Look in fixed ranges. Just return the type as per start */
1529 if (mtrr_state->have_fixed && (start < 0x100000)) {
1530 int idx;
1531
1532 if (start < 0x80000) {
1533 idx = 0;
1534 idx += (start >> 16);
1535 return mtrr_state->fixed_ranges[idx];
1536 } else if (start < 0xC0000) {
1537 idx = 1 * 8;
1538 idx += ((start - 0x80000) >> 14);
1539 return mtrr_state->fixed_ranges[idx];
1540 } else if (start < 0x1000000) {
1541 idx = 3 * 8;
1542 idx += ((start - 0xC0000) >> 12);
1543 return mtrr_state->fixed_ranges[idx];
1544 }
1545 }
1546
1547 /*
1548 * Look in variable ranges
 1549 * Look for multiple ranges matching this address and pick the type
1550 * as per MTRR precedence
1551 */
1552 if (!(mtrr_state->enabled & 2))
1553 return mtrr_state->def_type;
1554
1555 prev_match = 0xFF;
1556 for (i = 0; i < num_var_ranges; ++i) {
1557 unsigned short start_state, end_state;
1558
1559 if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1560 continue;
1561
1562 base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1563 (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1564 mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1565 (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1566
1567 start_state = ((start & mask) == (base & mask));
1568 end_state = ((end & mask) == (base & mask));
1569 if (start_state != end_state)
1570 return 0xFE;
1571
1572 if ((start & mask) != (base & mask))
1573 continue;
1574
1575 curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1576 if (prev_match == 0xFF) {
1577 prev_match = curr_match;
1578 continue;
1579 }
1580
1581 if (prev_match == MTRR_TYPE_UNCACHABLE ||
1582 curr_match == MTRR_TYPE_UNCACHABLE)
1583 return MTRR_TYPE_UNCACHABLE;
1584
1585 if ((prev_match == MTRR_TYPE_WRBACK &&
1586 curr_match == MTRR_TYPE_WRTHROUGH) ||
1587 (prev_match == MTRR_TYPE_WRTHROUGH &&
1588 curr_match == MTRR_TYPE_WRBACK)) {
1589 prev_match = MTRR_TYPE_WRTHROUGH;
1590 curr_match = MTRR_TYPE_WRTHROUGH;
1591 }
1592
1593 if (prev_match != curr_match)
1594 return MTRR_TYPE_UNCACHABLE;
1595 }
1596
1597 if (prev_match != 0xFF)
1598 return prev_match;
1599
1600 return mtrr_state->def_type;
1601}
1602
1603static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1604{
1605 u8 mtrr;
1606
1607 mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
1608 (gfn << PAGE_SHIFT) + PAGE_SIZE);
1609 if (mtrr == 0xfe || mtrr == 0xff)
1610 mtrr = MTRR_TYPE_WRBACK;
1611 return mtrr;
1612}
1613
1396static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1614static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1397{ 1615{
1398 unsigned index; 1616 unsigned index;
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1409 if (s->role.word != sp->role.word) 1627 if (s->role.word != sp->role.word)
1410 return 1; 1628 return 1;
1411 } 1629 }
1412 kvm_mmu_mark_parents_unsync(vcpu, sp);
1413 ++vcpu->kvm->stat.mmu_unsync; 1630 ++vcpu->kvm->stat.mmu_unsync;
1414 sp->unsync = 1; 1631 sp->unsync = 1;
1632
1633 if (sp->global) {
1634 list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
1635 ++vcpu->kvm->stat.mmu_unsync_global;
1636 } else
1637 kvm_mmu_mark_parents_unsync(vcpu, sp);
1638
1415 mmu_convert_notrap(sp); 1639 mmu_convert_notrap(sp);
1416 return 0; 1640 return 0;
1417} 1641}
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1437static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1661static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1438 unsigned pte_access, int user_fault, 1662 unsigned pte_access, int user_fault,
1439 int write_fault, int dirty, int largepage, 1663 int write_fault, int dirty, int largepage,
1440 gfn_t gfn, pfn_t pfn, bool speculative, 1664 int global, gfn_t gfn, pfn_t pfn, bool speculative,
1441 bool can_unsync) 1665 bool can_unsync)
1442{ 1666{
1443 u64 spte; 1667 u64 spte;
1444 int ret = 0; 1668 int ret = 0;
1669 u64 mt_mask = shadow_mt_mask;
1670 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
1671
1672 if (!(vcpu->arch.cr4 & X86_CR4_PGE))
1673 global = 0;
1674 if (!global && sp->global) {
1675 sp->global = 0;
1676 if (sp->unsync) {
1677 kvm_unlink_unsync_global(vcpu->kvm, sp);
1678 kvm_mmu_mark_parents_unsync(vcpu, sp);
1679 }
1680 }
1681
1445 /* 1682 /*
1446 * We don't set the accessed bit, since we sometimes want to see 1683 * We don't set the accessed bit, since we sometimes want to see
1447 * whether the guest actually used the pte (in order to detect 1684 * whether the guest actually used the pte (in order to detect
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1460 spte |= shadow_user_mask; 1697 spte |= shadow_user_mask;
1461 if (largepage) 1698 if (largepage)
1462 spte |= PT_PAGE_SIZE_MASK; 1699 spte |= PT_PAGE_SIZE_MASK;
1700 if (mt_mask) {
1701 mt_mask = get_memory_type(vcpu, gfn) <<
1702 kvm_x86_ops->get_mt_mask_shift();
1703 spte |= mt_mask;
1704 }
1463 1705
1464 spte |= (u64)pfn << PAGE_SHIFT; 1706 spte |= (u64)pfn << PAGE_SHIFT;
1465 1707
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1474 1716
1475 spte |= PT_WRITABLE_MASK; 1717 spte |= PT_WRITABLE_MASK;
1476 1718
1719 /*
1720 * Optimization: for pte sync, if spte was writable the hash
1721 * lookup is unnecessary (and expensive). Write protection
 1722 * is the responsibility of mmu_get_page / kvm_sync_page.
1723 * Same reasoning can be applied to dirty page accounting.
1724 */
1725 if (!can_unsync && is_writeble_pte(*shadow_pte))
1726 goto set_pte;
1727
1477 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1728 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1478 pgprintk("%s: found shadow page for %lx, marking ro\n", 1729 pgprintk("%s: found shadow page for %lx, marking ro\n",
1479 __func__, gfn); 1730 __func__, gfn);
@@ -1495,8 +1746,8 @@ set_pte:
1495static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1746static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1496 unsigned pt_access, unsigned pte_access, 1747 unsigned pt_access, unsigned pte_access,
1497 int user_fault, int write_fault, int dirty, 1748 int user_fault, int write_fault, int dirty,
1498 int *ptwrite, int largepage, gfn_t gfn, 1749 int *ptwrite, int largepage, int global,
1499 pfn_t pfn, bool speculative) 1750 gfn_t gfn, pfn_t pfn, bool speculative)
1500{ 1751{
1501 int was_rmapped = 0; 1752 int was_rmapped = 0;
1502 int was_writeble = is_writeble_pte(*shadow_pte); 1753 int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1529 } 1780 }
1530 } 1781 }
1531 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1782 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1532 dirty, largepage, gfn, pfn, speculative, true)) { 1783 dirty, largepage, global, gfn, pfn, speculative, true)) {
1533 if (write_fault) 1784 if (write_fault)
1534 *ptwrite = 1; 1785 *ptwrite = 1;
1535 kvm_x86_ops->tlb_flush(vcpu); 1786 kvm_x86_ops->tlb_flush(vcpu);
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
1586 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { 1837 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1587 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, 1838 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1588 0, walk->write, 1, &walk->pt_write, 1839 0, walk->write, 1, &walk->pt_write,
1589 walk->largepage, gfn, walk->pfn, false); 1840 walk->largepage, 0, gfn, walk->pfn, false);
1590 ++vcpu->stat.pf_fixed; 1841 ++vcpu->stat.pf_fixed;
1591 return 1; 1842 return 1;
1592 } 1843 }
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1773 } 2024 }
1774} 2025}
1775 2026
2027static void mmu_sync_global(struct kvm_vcpu *vcpu)
2028{
2029 struct kvm *kvm = vcpu->kvm;
2030 struct kvm_mmu_page *sp, *n;
2031
2032 list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
2033 kvm_sync_page(vcpu, sp);
2034}
2035
1776void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2036void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1777{ 2037{
1778 spin_lock(&vcpu->kvm->mmu_lock); 2038 spin_lock(&vcpu->kvm->mmu_lock);
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1780 spin_unlock(&vcpu->kvm->mmu_lock); 2040 spin_unlock(&vcpu->kvm->mmu_lock);
1781} 2041}
1782 2042
2043void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
2044{
2045 spin_lock(&vcpu->kvm->mmu_lock);
2046 mmu_sync_global(vcpu);
2047 spin_unlock(&vcpu->kvm->mmu_lock);
2048}
2049
1783static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2050static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1784{ 2051{
1785 return vaddr; 2052 return vaddr;
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2178} 2445}
2179 2446
2180void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2447void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2181 const u8 *new, int bytes) 2448 const u8 *new, int bytes,
2449 bool guest_initiated)
2182{ 2450{
2183 gfn_t gfn = gpa >> PAGE_SHIFT; 2451 gfn_t gfn = gpa >> PAGE_SHIFT;
2184 struct kvm_mmu_page *sp; 2452 struct kvm_mmu_page *sp;
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2204 kvm_mmu_free_some_pages(vcpu); 2472 kvm_mmu_free_some_pages(vcpu);
2205 ++vcpu->kvm->stat.mmu_pte_write; 2473 ++vcpu->kvm->stat.mmu_pte_write;
2206 kvm_mmu_audit(vcpu, "pre pte write"); 2474 kvm_mmu_audit(vcpu, "pre pte write");
2207 if (gfn == vcpu->arch.last_pt_write_gfn 2475 if (guest_initiated) {
2208 && !last_updated_pte_accessed(vcpu)) { 2476 if (gfn == vcpu->arch.last_pt_write_gfn
2209 ++vcpu->arch.last_pt_write_count; 2477 && !last_updated_pte_accessed(vcpu)) {
2210 if (vcpu->arch.last_pt_write_count >= 3) 2478 ++vcpu->arch.last_pt_write_count;
2211 flooded = 1; 2479 if (vcpu->arch.last_pt_write_count >= 3)
2212 } else { 2480 flooded = 1;
2213 vcpu->arch.last_pt_write_gfn = gfn; 2481 } else {
2214 vcpu->arch.last_pt_write_count = 1; 2482 vcpu->arch.last_pt_write_gfn = gfn;
2215 vcpu->arch.last_pte_updated = NULL; 2483 vcpu->arch.last_pt_write_count = 1;
2484 vcpu->arch.last_pte_updated = NULL;
2485 }
2216 } 2486 }
2217 index = kvm_page_table_hashfn(gfn); 2487 index = kvm_page_table_hashfn(gfn);
2218 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2488 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
2352 2622
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 2623void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{ 2624{
2355 spin_lock(&vcpu->kvm->mmu_lock);
2356 vcpu->arch.mmu.invlpg(vcpu, gva); 2625 vcpu->arch.mmu.invlpg(vcpu, gva);
2357 spin_unlock(&vcpu->kvm->mmu_lock);
2358 kvm_mmu_flush_tlb(vcpu); 2626 kvm_mmu_flush_tlb(vcpu);
2359 ++vcpu->stat.invlpg; 2627 ++vcpu->stat.invlpg;
2360} 2628}
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2451 int i; 2719 int i;
2452 u64 *pt; 2720 u64 *pt;
2453 2721
2454 if (!test_bit(slot, &sp->slot_bitmap)) 2722 if (!test_bit(slot, sp->slot_bitmap))
2455 continue; 2723 continue;
2456 2724
2457 pt = sp->spt; 2725 pt = sp->spt;
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
2860 if (sp->role.metaphysical) 3128 if (sp->role.metaphysical)
2861 continue; 3129 continue;
2862 3130
2863 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
2864 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3131 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
3132 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
2865 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3133 rmapp = &slot->rmap[gfn - slot->base_gfn];
2866 if (*rmapp) 3134 if (*rmapp)
2867 printk(KERN_ERR "%s: (%s) shadow page has writable" 3135 printk(KERN_ERR "%s: (%s) shadow page has writable"
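get_mtrr_type() above mirrors mtrr_type_lookup(): for guest physical addresses below 1MB it indexes the fixed-range MTRRs, which cover 0-0x7FFFF in eight 64KB slots, 0x80000-0xBFFFF in sixteen 16KB slots and 0xC0000-0xFFFFF in sixty-four 4KB slots; higher addresses go through the variable ranges and the MTRR precedence rules. A small worked sketch of just the fixed-range index arithmetic, assuming the address is below 1MB (it reproduces the computation only, not the kernel structures):

#include <stdio.h>
#include <stdint.h>

/* Index into the 88-entry fixed-range MTRR array for an address below 1MB. */
static int fixed_mtrr_index(uint64_t addr)
{
        if (addr < 0x80000)                             /* 8 x 64KB slots  */
                return (int)(addr >> 16);
        if (addr < 0xC0000)                             /* 16 x 16KB slots */
                return (int)(8 + ((addr - 0x80000) >> 14));
        return (int)(24 + ((addr - 0xC0000) >> 12));    /* 64 x 4KB slots  */
}

int main(void)
{
        /* 0xB8000 (VGA text buffer): 8 + (0x38000 >> 14) = slot 22. */
        printf("%d\n", fixed_mtrr_index(0xB8000));
        return 0;
}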
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 84eee43bbe74..9fd78b6e17ad 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
82 int *ptwrite; 82 int *ptwrite;
83 pfn_t pfn; 83 pfn_t pfn;
84 u64 *sptep; 84 u64 *sptep;
85 gpa_t pte_gpa;
85}; 86};
86 87
87static gfn_t gpte_to_gfn(pt_element_t gpte) 88static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
222 if (ret) 223 if (ret)
223 goto walk; 224 goto walk;
224 pte |= PT_DIRTY_MASK; 225 pte |= PT_DIRTY_MASK;
225 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); 226 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
226 walker->ptes[walker->level - 1] = pte; 227 walker->ptes[walker->level - 1] = pte;
227 } 228 }
228 229
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
274 return; 275 return;
275 kvm_get_pfn(pfn); 276 kvm_get_pfn(pfn);
276 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 277 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
277 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), 278 gpte & PT_DIRTY_MASK, NULL, largepage,
279 gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
278 pfn, true); 280 pfn, true);
279} 281}
280 282
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
301 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, 303 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
302 sw->user_fault, sw->write_fault, 304 sw->user_fault, sw->write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 305 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, 306 sw->ptwrite, sw->largepage,
305 false); 307 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
308 gw->gfn, sw->pfn, false);
306 sw->sptep = sptep; 309 sw->sptep = sptep;
307 return 1; 310 return 1;
308 } 311 }
@@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
466 struct kvm_vcpu *vcpu, u64 addr, 469 struct kvm_vcpu *vcpu, u64 addr,
467 u64 *sptep, int level) 470 u64 *sptep, int level)
468{ 471{
472 struct shadow_walker *sw =
473 container_of(_sw, struct shadow_walker, walker);
469 474
470 if (level == PT_PAGE_TABLE_LEVEL) { 475 /* FIXME: properly handle invlpg on large guest pages */
471 if (is_shadow_present_pte(*sptep)) 476 if (level == PT_PAGE_TABLE_LEVEL ||
477 ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
478 struct kvm_mmu_page *sp = page_header(__pa(sptep));
479
480 sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
481 sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
482
483 if (is_shadow_present_pte(*sptep)) {
472 rmap_remove(vcpu->kvm, sptep); 484 rmap_remove(vcpu->kvm, sptep);
485 if (is_large_pte(*sptep))
486 --vcpu->kvm->stat.lpages;
487 }
473 set_shadow_pte(sptep, shadow_trap_nonpresent_pte); 488 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
474 return 1; 489 return 1;
475 } 490 }
@@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
480 495
481static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 496static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
482{ 497{
498 pt_element_t gpte;
483 struct shadow_walker walker = { 499 struct shadow_walker walker = {
484 .walker = { .entry = FNAME(shadow_invlpg_entry), }, 500 .walker = { .entry = FNAME(shadow_invlpg_entry), },
501 .pte_gpa = -1,
485 }; 502 };
486 503
504 spin_lock(&vcpu->kvm->mmu_lock);
487 walk_shadow(&walker.walker, vcpu, gva); 505 walk_shadow(&walker.walker, vcpu, gva);
506 spin_unlock(&vcpu->kvm->mmu_lock);
507 if (walker.pte_gpa == -1)
508 return;
509 if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
510 sizeof(pt_element_t)))
511 return;
512 if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
513 if (mmu_topup_memory_caches(vcpu))
514 return;
515 kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
516 sizeof(pt_element_t), 0);
517 }
488} 518}
489 519
490static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 520static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
580 nr_present++; 610 nr_present++;
581 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 611 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
582 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 612 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
583 is_dirty_pte(gpte), 0, gfn, 613 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
584 spte_to_pfn(sp->spt[i]), true, false); 614 spte_to_pfn(sp->spt[i]), true, false);
585 } 615 }
586 616
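The reworked FNAME(invlpg) above records, under the mmu lock, the guest physical address of the PTE backing the invalidated shadow entry, then re-reads and prefetches it after the lock is dropped if it is still present and accessed. Recovering that address is plain arithmetic: the shadow page's gfn names the guest page-table page, and the spte's index within the shadow table, scaled by the guest PTE size, is the byte offset inside it. A tiny stand-alone sketch of the computation, assuming 64-bit guest PTEs and illustrative names:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT      12

/* table_gfn: gfn of the guest page-table page; index: slot of the entry. */
static uint64_t guest_pte_gpa(uint64_t table_gfn, unsigned int index)
{
        return (table_gfn << PAGE_SHIFT) + index * sizeof(uint64_t);
}

int main(void)
{
        /* Entry 3 of the guest page table at gfn 0x1234 sits at 0x1234018. */
        printf("0x%llx\n", (unsigned long long)guest_pte_gpa(0x1234, 3));
        return 0;
}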
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c4ce657d963..1452851ae258 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,8 @@
28 28
29#include <asm/desc.h> 29#include <asm/desc.h>
30 30
31#include <asm/virtext.h>
32
31#define __ex(x) __kvm_handle_fault_on_reboot(x) 33#define __ex(x) __kvm_handle_fault_on_reboot(x)
32 34
33MODULE_AUTHOR("Qumranet"); 35MODULE_AUTHOR("Qumranet");
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
245 247
246static int has_svm(void) 248static int has_svm(void)
247{ 249{
248 uint32_t eax, ebx, ecx, edx; 250 const char *msg;
249
250 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
251 printk(KERN_INFO "has_svm: not amd\n");
252 return 0;
253 }
254 251
255 cpuid(0x80000000, &eax, &ebx, &ecx, &edx); 252 if (!cpu_has_svm(&msg)) {
 256 if (eax < SVM_CPUID_FUNC) { 253 printk(KERN_INFO "has_svm: %s\n", msg);
257 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
258 return 0; 254 return 0;
259 } 255 }
260 256
261 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
262 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
263 printk(KERN_DEBUG "has_svm: svm not available\n");
264 return 0;
265 }
266 return 1; 257 return 1;
267} 258}
268 259
269static void svm_hardware_disable(void *garbage) 260static void svm_hardware_disable(void *garbage)
270{ 261{
271 uint64_t efer; 262 cpu_svm_disable();
272
273 wrmsrl(MSR_VM_HSAVE_PA, 0);
274 rdmsrl(MSR_EFER, efer);
275 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
276} 263}
277 264
278static void svm_hardware_enable(void *garbage) 265static void svm_hardware_enable(void *garbage)
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
772 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; 759 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
773 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 760 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
774 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 761 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
762
763 /*
764 * SVM always stores 0 for the 'G' bit in the CS selector in
765 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
766 * Intel's VMENTRY has a check on the 'G' bit.
767 */
768 if (seg == VCPU_SREG_CS)
769 var->g = s->limit > 0xfffff;
770
771 /*
772 * Work around a bug where the busy flag in the tr selector
773 * isn't exposed
774 */
775 if (seg == VCPU_SREG_TR)
776 var->type |= 0x2;
777
775 var->unusable = !var->present; 778 var->unusable = !var->present;
776} 779}
777 780
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1099 rep = (io_info & SVM_IOIO_REP_MASK) != 0; 1102 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1100 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; 1103 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1101 1104
1105 skip_emulated_instruction(&svm->vcpu);
1102 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1106 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1103} 1107}
1104 1108
@@ -1912,6 +1916,11 @@ static int get_npt_level(void)
1912#endif 1916#endif
1913} 1917}
1914 1918
1919static int svm_get_mt_mask_shift(void)
1920{
1921 return 0;
1922}
1923
1915static struct kvm_x86_ops svm_x86_ops = { 1924static struct kvm_x86_ops svm_x86_ops = {
1916 .cpu_has_kvm_support = has_svm, 1925 .cpu_has_kvm_support = has_svm,
1917 .disabled_by_bios = is_disabled, 1926 .disabled_by_bios = is_disabled,
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
1967 1976
1968 .set_tss_addr = svm_set_tss_addr, 1977 .set_tss_addr = svm_set_tss_addr,
1969 .get_tdp_level = get_npt_level, 1978 .get_tdp_level = get_npt_level,
1979 .get_mt_mask_shift = svm_get_mt_mask_shift,
1970}; 1980};
1971 1981
1972static int __init svm_init(void) 1982static int __init svm_init(void)
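The svm_get_segment() workaround above reconstructs the CS granularity bit that SVM reports as 0 on #VMEXIT: a byte-granular descriptor cannot produce an expanded limit above 0xfffff, so any larger limit implies page granularity. A tiny user-space sketch of that inference (struct and field names are illustrative, not the VMCB layout):

#include <stdio.h>
#include <stdint.h>

struct seg {
        uint32_t limit;         /* expanded segment limit read back from the save area */
        unsigned int g;         /* granularity: 0 = bytes, 1 = 4KB pages */
};

/* Recover 'G' for CS from the limit alone. */
static void fixup_cs_granularity(struct seg *cs)
{
        cs->g = cs->limit > 0xfffff;
}

int main(void)
{
        struct seg cs = { .limit = 0xffffffffu, .g = 0 };

        fixup_cs_granularity(&cs);
        printf("g=%u\n", cs.g); /* 1: a 4GB limit requires page granularity */
        return 0;
}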
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a4018b01e1f9..6259d7467648 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -16,7 +16,6 @@
16 */ 16 */
17 17
18#include "irq.h" 18#include "irq.h"
19#include "vmx.h"
20#include "mmu.h" 19#include "mmu.h"
21 20
22#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
31 30
32#include <asm/io.h> 31#include <asm/io.h>
33#include <asm/desc.h> 32#include <asm/desc.h>
33#include <asm/vmx.h>
34#include <asm/virtext.h>
34 35
35#define __ex(x) __kvm_handle_fault_on_reboot(x) 36#define __ex(x) __kvm_handle_fault_on_reboot(x)
36 37
@@ -90,6 +91,11 @@ struct vcpu_vmx {
90 } rmode; 91 } rmode;
91 int vpid; 92 int vpid;
92 bool emulation_required; 93 bool emulation_required;
94
95 /* Support for vnmi-less CPUs */
96 int soft_vnmi_blocked;
97 ktime_t entry_time;
98 s64 vnmi_blocked_time;
93}; 99};
94 100
95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 101static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -122,7 +128,7 @@ static struct vmcs_config {
122 u32 vmentry_ctrl; 128 u32 vmentry_ctrl;
123} vmcs_config; 129} vmcs_config;
124 130
125struct vmx_capability { 131static struct vmx_capability {
126 u32 ept; 132 u32 ept;
127 u32 vpid; 133 u32 vpid;
128} vmx_capability; 134} vmx_capability;
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
957 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); 963 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
958 964
959 break; 965 break;
966 case MSR_IA32_CR_PAT:
967 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
968 vmcs_write64(GUEST_IA32_PAT, data);
969 vcpu->arch.pat = data;
970 break;
971 }
972 /* Otherwise falls through to kvm_set_msr_common */
960 default: 973 default:
961 vmx_load_host_state(vmx); 974 vmx_load_host_state(vmx);
962 msr = find_msr_entry(vmx, msr_index); 975 msr = find_msr_entry(vmx, msr_index);
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
1032 1045
1033static __init int cpu_has_kvm_support(void) 1046static __init int cpu_has_kvm_support(void)
1034{ 1047{
1035 unsigned long ecx = cpuid_ecx(1); 1048 return cpu_has_vmx();
1036 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
1037} 1049}
1038 1050
1039static __init int vmx_disabled_by_bios(void) 1051static __init int vmx_disabled_by_bios(void)
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void)
1079 __vcpu_clear(vmx); 1091 __vcpu_clear(vmx);
1080} 1092}
1081 1093
1082static void hardware_disable(void *garbage) 1094
1095/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
1096 * tricks.
1097 */
1098static void kvm_cpu_vmxoff(void)
1083{ 1099{
1084 vmclear_local_vcpus();
1085 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1100 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1086 write_cr4(read_cr4() & ~X86_CR4_VMXE); 1101 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1087} 1102}
1088 1103
1104static void hardware_disable(void *garbage)
1105{
1106 vmclear_local_vcpus();
1107 kvm_cpu_vmxoff();
1108}
1109
1089static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 1110static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1090 u32 msr, u32 *result) 1111 u32 msr, u32 *result)
1091{ 1112{
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1176#ifdef CONFIG_X86_64 1197#ifdef CONFIG_X86_64
1177 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 1198 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
1178#endif 1199#endif
1179 opt = 0; 1200 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
1180 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 1201 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
1181 &_vmexit_control) < 0) 1202 &_vmexit_control) < 0)
1182 return -EIO; 1203 return -EIO;
1183 1204
1184 min = opt = 0; 1205 min = 0;
1206 opt = VM_ENTRY_LOAD_IA32_PAT;
1185 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 1207 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1186 &_vmentry_control) < 0) 1208 &_vmentry_control) < 0)
1187 return -EIO; 1209 return -EIO;
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
2087 */ 2109 */
2088static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 2110static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2089{ 2111{
2090 u32 host_sysenter_cs; 2112 u32 host_sysenter_cs, msr_low, msr_high;
2091 u32 junk; 2113 u32 junk;
2114 u64 host_pat;
2092 unsigned long a; 2115 unsigned long a;
2093 struct descriptor_table dt; 2116 struct descriptor_table dt;
2094 int i; 2117 int i;
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2176 rdmsrl(MSR_IA32_SYSENTER_EIP, a); 2199 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2177 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ 2200 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
2178 2201
2202 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2203 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2204 host_pat = msr_low | ((u64) msr_high << 32);
2205 vmcs_write64(HOST_IA32_PAT, host_pat);
2206 }
2207 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2208 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2209 host_pat = msr_low | ((u64) msr_high << 32);
 2210 /* Write the default value, following the host PAT */
2211 vmcs_write64(GUEST_IA32_PAT, host_pat);
 2212 /* Keep arch.pat in sync with GUEST_IA32_PAT */
2213 vmx->vcpu.arch.pat = host_pat;
2214 }
2215
2179 for (i = 0; i < NR_VMX_MSR; ++i) { 2216 for (i = 0; i < NR_VMX_MSR; ++i) {
2180 u32 index = vmx_msr_index[i]; 2217 u32 index = vmx_msr_index[i];
2181 u32 data_low, data_high; 2218 u32 data_low, data_high;
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2230 2267
2231 vmx->vcpu.arch.rmode.active = 0; 2268 vmx->vcpu.arch.rmode.active = 0;
2232 2269
2270 vmx->soft_vnmi_blocked = 0;
2271
2233 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 2272 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2234 kvm_set_cr8(&vmx->vcpu, 0); 2273 kvm_set_cr8(&vmx->vcpu, 0);
2235 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 2274 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2374,29 @@ out:
2335 return ret; 2374 return ret;
2336} 2375}
2337 2376
2377static void enable_irq_window(struct kvm_vcpu *vcpu)
2378{
2379 u32 cpu_based_vm_exec_control;
2380
2381 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2382 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2383 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2384}
2385
2386static void enable_nmi_window(struct kvm_vcpu *vcpu)
2387{
2388 u32 cpu_based_vm_exec_control;
2389
2390 if (!cpu_has_virtual_nmis()) {
2391 enable_irq_window(vcpu);
2392 return;
2393 }
2394
2395 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2396 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2397 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2398}
2399
2338static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 2400static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2339{ 2401{
2340 struct vcpu_vmx *vmx = to_vmx(vcpu); 2402 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2358 2420
2359static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2421static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2360{ 2422{
2423 struct vcpu_vmx *vmx = to_vmx(vcpu);
2424
2425 if (!cpu_has_virtual_nmis()) {
2426 /*
2427 * Tracking the NMI-blocked state in software is built upon
2428 * finding the next open IRQ window. This, in turn, depends on
2429 * well-behaving guests: They have to keep IRQs disabled at
2430 * least as long as the NMI handler runs. Otherwise we may
2431 * cause NMI nesting, maybe breaking the guest. But as this is
2432 * highly unlikely, we can live with the residual risk.
2433 */
2434 vmx->soft_vnmi_blocked = 1;
2435 vmx->vnmi_blocked_time = 0;
2436 }
2437
2438 ++vcpu->stat.nmi_injections;
2439 if (vcpu->arch.rmode.active) {
2440 vmx->rmode.irq.pending = true;
2441 vmx->rmode.irq.vector = NMI_VECTOR;
2442 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2443 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2444 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2445 INTR_INFO_VALID_MASK);
2446 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2447 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2448 return;
2449 }
2361 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2450 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2362 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2451 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2363} 2452}
2364 2453
2454static void vmx_update_window_states(struct kvm_vcpu *vcpu)
2455{
2456 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2457
2458 vcpu->arch.nmi_window_open =
2459 !(guest_intr & (GUEST_INTR_STATE_STI |
2460 GUEST_INTR_STATE_MOV_SS |
2461 GUEST_INTR_STATE_NMI));
2462 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2463 vcpu->arch.nmi_window_open = 0;
2464
2465 vcpu->arch.interrupt_window_open =
2466 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2467 !(guest_intr & (GUEST_INTR_STATE_STI |
2468 GUEST_INTR_STATE_MOV_SS)));
2469}
2470
2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2471static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2366{ 2472{
2367 int word_index = __ffs(vcpu->arch.irq_summary); 2473 int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2374 kvm_queue_interrupt(vcpu, irq); 2480 kvm_queue_interrupt(vcpu, irq);
2375} 2481}
2376 2482
2377
2378static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2483static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2379 struct kvm_run *kvm_run) 2484 struct kvm_run *kvm_run)
2380{ 2485{
2381 u32 cpu_based_vm_exec_control; 2486 vmx_update_window_states(vcpu);
2382
2383 vcpu->arch.interrupt_window_open =
2384 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2385 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2386 2487
2387 if (vcpu->arch.interrupt_window_open && 2488 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2388 vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) 2489 if (vcpu->arch.interrupt.pending) {
2389 kvm_do_inject_irq(vcpu); 2490 enable_nmi_window(vcpu);
2491 } else if (vcpu->arch.nmi_window_open) {
2492 vcpu->arch.nmi_pending = false;
2493 vcpu->arch.nmi_injected = true;
2494 } else {
2495 enable_nmi_window(vcpu);
2496 return;
2497 }
2498 }
2499 if (vcpu->arch.nmi_injected) {
2500 vmx_inject_nmi(vcpu);
2501 if (vcpu->arch.nmi_pending)
2502 enable_nmi_window(vcpu);
2503 else if (vcpu->arch.irq_summary
2504 || kvm_run->request_interrupt_window)
2505 enable_irq_window(vcpu);
2506 return;
2507 }
2390 2508
2391 if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) 2509 if (vcpu->arch.interrupt_window_open) {
2392 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); 2510 if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2511 kvm_do_inject_irq(vcpu);
2393 2512
2394 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2513 if (vcpu->arch.interrupt.pending)
2514 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2515 }
2395 if (!vcpu->arch.interrupt_window_open && 2516 if (!vcpu->arch.interrupt_window_open &&
2396 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) 2517 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
2397 /* 2518 enable_irq_window(vcpu);
2398 * Interrupts blocked. Wait for unblock.
2399 */
2400 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2401 else
2402 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2403 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2404} 2519}
2405 2520
2406static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 2521static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2407{ 2522{
2408 int ret; 2523 int ret;
2409 struct kvm_userspace_memory_region tss_mem = { 2524 struct kvm_userspace_memory_region tss_mem = {
2410 .slot = 8, 2525 .slot = TSS_PRIVATE_MEMSLOT,
2411 .guest_phys_addr = addr, 2526 .guest_phys_addr = addr,
2412 .memory_size = PAGE_SIZE * 3, 2527 .memory_size = PAGE_SIZE * 3,
2413 .flags = 0, 2528 .flags = 0,
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2492 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); 2607 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
2493 } 2608 }
2494 2609
2495 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ 2610 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2496 return 1; /* already handled by vmx_vcpu_run() */ 2611 return 1; /* already handled by vmx_vcpu_run() */
2497 2612
2498 if (is_no_device(intr_info)) { 2613 if (is_no_device(intr_info)) {
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2581 rep = (exit_qualification & 32) != 0; 2696 rep = (exit_qualification & 32) != 0;
2582 port = exit_qualification >> 16; 2697 port = exit_qualification >> 16;
2583 2698
2699 skip_emulated_instruction(vcpu);
2584 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2700 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
2585} 2701}
2586 2702
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2767 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2883 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2768 2884
2769 KVMTRACE_0D(PEND_INTR, vcpu, handler); 2885 KVMTRACE_0D(PEND_INTR, vcpu, handler);
2886 ++vcpu->stat.irq_window_exits;
2770 2887
2771 /* 2888 /*
2772 * If the user space waits to inject interrupts, exit as soon as 2889 * If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2775 if (kvm_run->request_interrupt_window && 2892 if (kvm_run->request_interrupt_window &&
2776 !vcpu->arch.irq_summary) { 2893 !vcpu->arch.irq_summary) {
2777 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2894 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2778 ++vcpu->stat.irq_window_exits;
2779 return 0; 2895 return 0;
2780 } 2896 }
2781 return 1; 2897 return 1;
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2832 2948
2833static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2949static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2834{ 2950{
2951 struct vcpu_vmx *vmx = to_vmx(vcpu);
2835 unsigned long exit_qualification; 2952 unsigned long exit_qualification;
2836 u16 tss_selector; 2953 u16 tss_selector;
2837 int reason; 2954 int reason;
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2839 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2956 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2840 2957
2841 reason = (u32)exit_qualification >> 30; 2958 reason = (u32)exit_qualification >> 30;
2959 if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
2960 (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
2961 (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
2962 == INTR_TYPE_NMI_INTR) {
2963 vcpu->arch.nmi_injected = false;
2964 if (cpu_has_virtual_nmis())
2965 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2966 GUEST_INTR_STATE_NMI);
2967 }
2842 tss_selector = exit_qualification; 2968 tss_selector = exit_qualification;
2843 2969
2844 return kvm_task_switch(vcpu, tss_selector, reason); 2970 return kvm_task_switch(vcpu, tss_selector, reason);
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2927 while (!guest_state_valid(vcpu)) { 3053 while (!guest_state_valid(vcpu)) {
2928 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3054 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929 3055
2930 switch (err) { 3056 if (err == EMULATE_DO_MMIO)
2931 case EMULATE_DONE: 3057 break;
2932 break; 3058
2933 case EMULATE_DO_MMIO: 3059 if (err != EMULATE_DONE) {
2934 kvm_report_emulation_failure(vcpu, "mmio"); 3060 kvm_report_emulation_failure(vcpu, "emulation failure");
2935 /* TODO: Handle MMIO */ 3061 return;
2936 return;
2937 default:
2938 kvm_report_emulation_failure(vcpu, "emulation failure");
2939 return;
2940 } 3062 }
2941 3063
2942 if (signal_pending(current)) 3064 if (signal_pending(current))
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2948 local_irq_disable(); 3070 local_irq_disable();
2949 preempt_disable(); 3071 preempt_disable();
2950 3072
2951 /* Guest state should be valid now, no more emulation should be needed */ 3073 /* Guest state should be valid now, unless we still need to
2952 vmx->emulation_required = 0; 3074 * emulate an MMIO access */
3075 if (guest_state_valid(vcpu))
3076 vmx->emulation_required = 0;
2953} 3077}
2954 3078
2955/* 3079/*
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2996 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3120 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2997 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); 3121 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2998 3122
3123 /* If we need to emulate an MMIO access from handle_invalid_guest_state,
3124 * we just return 0 */
3125 if (vmx->emulation_required && emulate_invalid_guest_state)
3126 return 0;
3127
2999 /* Accessing CR3 doesn't cause a VM exit in paging mode, so we need 3128 /* Accessing CR3 doesn't cause a VM exit in paging mode, so we need
3000 * to sync with the guest's real CR3. */ 3129 * to sync with the guest's real CR3. */
3001 if (vm_need_ept() && is_paging(vcpu)) { 3130 if (vm_need_ept() && is_paging(vcpu)) {
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3012 3141
3013 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 3142 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
3014 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 3143 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
3015 exit_reason != EXIT_REASON_EPT_VIOLATION)) 3144 exit_reason != EXIT_REASON_EPT_VIOLATION &&
3016 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 3145 exit_reason != EXIT_REASON_TASK_SWITCH))
3017 "exit reason is 0x%x\n", __func__, exit_reason); 3146 printk(KERN_WARNING "%s: unexpected, valid vectoring info "
3147 "(0x%x) and exit reason is 0x%x\n",
3148 __func__, vectoring_info, exit_reason);
3149
3150 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3151 if (vcpu->arch.interrupt_window_open) {
3152 vmx->soft_vnmi_blocked = 0;
3153 vcpu->arch.nmi_window_open = 1;
3154 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3155 vcpu->arch.nmi_pending) {
3156 /*
3157 * This CPU doesn't help us find the end of an
3158 * NMI-blocked window if the guest runs with IRQs
3159 * disabled. So we pull the trigger after 1 s of
3160 * futile waiting, but inform the user about this.
3161 */
3162 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
3163 "state on VCPU %d after 1 s timeout\n",
3164 __func__, vcpu->vcpu_id);
3165 vmx->soft_vnmi_blocked = 0;
3166 vmx->vcpu.arch.nmi_window_open = 1;
3167 }
3168 }
3169
3018 if (exit_reason < kvm_vmx_max_exit_handlers 3170 if (exit_reason < kvm_vmx_max_exit_handlers
3019 && kvm_vmx_exit_handlers[exit_reason]) 3171 && kvm_vmx_exit_handlers[exit_reason])
3020 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3172 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
3042 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); 3194 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
3043} 3195}
3044 3196
3045static void enable_irq_window(struct kvm_vcpu *vcpu)
3046{
3047 u32 cpu_based_vm_exec_control;
3048
3049 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3050 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
3051 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3052}
3053
3054static void enable_nmi_window(struct kvm_vcpu *vcpu)
3055{
3056 u32 cpu_based_vm_exec_control;
3057
3058 if (!cpu_has_virtual_nmis())
3059 return;
3060
3061 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3062 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
3063 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3064}
3065
3066static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
3067{
3068 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3069 return !(guest_intr & (GUEST_INTR_STATE_NMI |
3070 GUEST_INTR_STATE_MOV_SS |
3071 GUEST_INTR_STATE_STI));
3072}
3073
3074static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
3075{
3076 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3077 return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
3078 GUEST_INTR_STATE_STI)) &&
3079 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
3080}
3081
3082static void enable_intr_window(struct kvm_vcpu *vcpu)
3083{
3084 if (vcpu->arch.nmi_pending)
3085 enable_nmi_window(vcpu);
3086 else if (kvm_cpu_has_interrupt(vcpu))
3087 enable_irq_window(vcpu);
3088}
3089
3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3197static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3091{ 3198{
3092 u32 exit_intr_info; 3199 u32 exit_intr_info;
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3109 if (unblock_nmi && vector != DF_VECTOR) 3216 if (unblock_nmi && vector != DF_VECTOR)
3110 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3217 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3111 GUEST_INTR_STATE_NMI); 3218 GUEST_INTR_STATE_NMI);
3112 } 3219 } else if (unlikely(vmx->soft_vnmi_blocked))
3220 vmx->vnmi_blocked_time +=
3221 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3113 3222
3114 idt_vectoring_info = vmx->idt_vectoring_info; 3223 idt_vectoring_info = vmx->idt_vectoring_info;
3115 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3224 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
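(Reading aid, not part of the patch: vnmi_blocked_time accumulates, in nanoseconds, the guest-mode time spent while NMIs are soft-blocked; entry_time is stamped at the top of vmx_vcpu_run further down, so the 1000000000LL comparison in kvm_handle_exit above is exactly the one-second timeout announced by its warning message.)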
@@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{ 3256{
3148 update_tpr_threshold(vcpu); 3257 update_tpr_threshold(vcpu);
3149 3258
3150 if (cpu_has_virtual_nmis()) { 3259 vmx_update_window_states(vcpu);
3151 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { 3260
3152 if (vcpu->arch.interrupt.pending) { 3261 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3153 enable_nmi_window(vcpu); 3262 if (vcpu->arch.interrupt.pending) {
3154 } else if (vmx_nmi_enabled(vcpu)) { 3263 enable_nmi_window(vcpu);
3155 vcpu->arch.nmi_pending = false; 3264 } else if (vcpu->arch.nmi_window_open) {
3156 vcpu->arch.nmi_injected = true; 3265 vcpu->arch.nmi_pending = false;
3157 } else { 3266 vcpu->arch.nmi_injected = true;
3158 enable_intr_window(vcpu); 3267 } else {
3159 return; 3268 enable_nmi_window(vcpu);
3160 }
3161 }
3162 if (vcpu->arch.nmi_injected) {
3163 vmx_inject_nmi(vcpu);
3164 enable_intr_window(vcpu);
3165 return; 3269 return;
3166 } 3270 }
3167 } 3271 }
3272 if (vcpu->arch.nmi_injected) {
3273 vmx_inject_nmi(vcpu);
3274 if (vcpu->arch.nmi_pending)
3275 enable_nmi_window(vcpu);
3276 else if (kvm_cpu_has_interrupt(vcpu))
3277 enable_irq_window(vcpu);
3278 return;
3279 }
3168 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { 3280 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3169 if (vmx_irq_enabled(vcpu)) 3281 if (vcpu->arch.interrupt_window_open)
3170 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); 3282 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3171 else 3283 else
3172 enable_irq_window(vcpu); 3284 enable_irq_window(vcpu);
@@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3174 if (vcpu->arch.interrupt.pending) { 3286 if (vcpu->arch.interrupt.pending) {
3175 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); 3287 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3176 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); 3288 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
3289 if (kvm_cpu_has_interrupt(vcpu))
3290 enable_irq_window(vcpu);
3177 } 3291 }
3178} 3292}
3179 3293
@@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3213 struct vcpu_vmx *vmx = to_vmx(vcpu); 3327 struct vcpu_vmx *vmx = to_vmx(vcpu);
3214 u32 intr_info; 3328 u32 intr_info;
3215 3329
3330 /* Record the guest's net vcpu time for enforced NMI injections. */
3331 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3332 vmx->entry_time = ktime_get();
3333
3216 /* Handle invalid guest state instead of entering VMX */ 3334 /* Handle invalid guest state instead of entering VMX */
3217 if (vmx->emulation_required && emulate_invalid_guest_state) { 3335 if (vmx->emulation_required && emulate_invalid_guest_state) {
3218 handle_invalid_guest_state(vcpu, kvm_run); 3336 handle_invalid_guest_state(vcpu, kvm_run);
@@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3327 if (vmx->rmode.irq.pending) 3445 if (vmx->rmode.irq.pending)
3328 fixup_rmode_irq(vmx); 3446 fixup_rmode_irq(vmx);
3329 3447
3330 vcpu->arch.interrupt_window_open = 3448 vmx_update_window_states(vcpu);
3331 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3332 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
3333 3449
3334 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3450 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3335 vmx->launched = 1; 3451 vmx->launched = 1;
@@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3337 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3453 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3338 3454
3339 /* We need to handle NMIs before interrupts are enabled */ 3455 /* We need to handle NMIs before interrupts are enabled */
3340 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && 3456 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3341 (intr_info & INTR_INFO_VALID_MASK)) { 3457 (intr_info & INTR_INFO_VALID_MASK)) {
3342 KVMTRACE_0D(NMI, vcpu, handler); 3458 KVMTRACE_0D(NMI, vcpu, handler);
3343 asm("int $2"); 3459 asm("int $2");
@@ -3455,6 +3571,11 @@ static int get_ept_level(void)
3455 return VMX_EPT_DEFAULT_GAW + 1; 3571 return VMX_EPT_DEFAULT_GAW + 1;
3456} 3572}
3457 3573
3574static int vmx_get_mt_mask_shift(void)
3575{
3576 return VMX_EPT_MT_EPTE_SHIFT;
3577}
3578
3458static struct kvm_x86_ops vmx_x86_ops = { 3579static struct kvm_x86_ops vmx_x86_ops = {
3459 .cpu_has_kvm_support = cpu_has_kvm_support, 3580 .cpu_has_kvm_support = cpu_has_kvm_support,
3460 .disabled_by_bios = vmx_disabled_by_bios, 3581 .disabled_by_bios = vmx_disabled_by_bios,
@@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3510 3631
3511 .set_tss_addr = vmx_set_tss_addr, 3632 .set_tss_addr = vmx_set_tss_addr,
3512 .get_tdp_level = get_ept_level, 3633 .get_tdp_level = get_ept_level,
3634 .get_mt_mask_shift = vmx_get_mt_mask_shift,
3513}; 3635};
3514 3636
3515static int __init vmx_init(void) 3637static int __init vmx_init(void)
@@ -3566,10 +3688,10 @@ static int __init vmx_init(void)
3566 bypass_guest_pf = 0; 3688 bypass_guest_pf = 0;
3567 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | 3689 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3568 VMX_EPT_WRITABLE_MASK | 3690 VMX_EPT_WRITABLE_MASK |
3569 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
3570 VMX_EPT_IGMT_BIT); 3691 VMX_EPT_IGMT_BIT);
3571 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 3692 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3572 VMX_EPT_EXECUTABLE_MASK); 3693 VMX_EPT_EXECUTABLE_MASK,
3694 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3573 kvm_enable_tdp(); 3695 kvm_enable_tdp();
3574 } else 3696 } else
3575 kvm_disable_tdp(); 3697 kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1f8ff2f1fa2..0e6aa8141dcd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -39,6 +39,7 @@
39#include <asm/uaccess.h> 39#include <asm/uaccess.h>
40#include <asm/msr.h> 40#include <asm/msr.h>
41#include <asm/desc.h> 41#include <asm/desc.h>
42#include <asm/mtrr.h>
42 43
43#define MAX_IO_MSRS 256 44#define MAX_IO_MSRS 256
44#define CR0_RESERVED_BITS \ 45#define CR0_RESERVED_BITS \
@@ -86,6 +87,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
86 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 87 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
87 { "hypercalls", VCPU_STAT(hypercalls) }, 88 { "hypercalls", VCPU_STAT(hypercalls) },
88 { "request_irq", VCPU_STAT(request_irq_exits) }, 89 { "request_irq", VCPU_STAT(request_irq_exits) },
90 { "request_nmi", VCPU_STAT(request_nmi_exits) },
89 { "irq_exits", VCPU_STAT(irq_exits) }, 91 { "irq_exits", VCPU_STAT(irq_exits) },
90 { "host_state_reload", VCPU_STAT(host_state_reload) }, 92 { "host_state_reload", VCPU_STAT(host_state_reload) },
91 { "efer_reload", VCPU_STAT(efer_reload) }, 93 { "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +95,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
93 { "insn_emulation", VCPU_STAT(insn_emulation) }, 95 { "insn_emulation", VCPU_STAT(insn_emulation) },
94 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 96 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
95 { "irq_injections", VCPU_STAT(irq_injections) }, 97 { "irq_injections", VCPU_STAT(irq_injections) },
98 { "nmi_injections", VCPU_STAT(nmi_injections) },
96 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 99 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
97 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 100 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
98 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 101 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -101,6 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
101 { "mmu_recycled", VM_STAT(mmu_recycled) }, 104 { "mmu_recycled", VM_STAT(mmu_recycled) },
102 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 105 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
103 { "mmu_unsync", VM_STAT(mmu_unsync) }, 106 { "mmu_unsync", VM_STAT(mmu_unsync) },
107 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
104 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 108 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
105 { "largepages", VM_STAT(lpages) }, 109 { "largepages", VM_STAT(lpages) },
106 { NULL } 110 { NULL }
@@ -312,6 +316,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
312 kvm_x86_ops->set_cr0(vcpu, cr0); 316 kvm_x86_ops->set_cr0(vcpu, cr0);
313 vcpu->arch.cr0 = cr0; 317 vcpu->arch.cr0 = cr0;
314 318
319 kvm_mmu_sync_global(vcpu);
315 kvm_mmu_reset_context(vcpu); 320 kvm_mmu_reset_context(vcpu);
316 return; 321 return;
317} 322}
@@ -355,6 +360,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
355 } 360 }
356 kvm_x86_ops->set_cr4(vcpu, cr4); 361 kvm_x86_ops->set_cr4(vcpu, cr4);
357 vcpu->arch.cr4 = cr4; 362 vcpu->arch.cr4 = cr4;
363 kvm_mmu_sync_global(vcpu);
358 kvm_mmu_reset_context(vcpu); 364 kvm_mmu_reset_context(vcpu);
359} 365}
360EXPORT_SYMBOL_GPL(kvm_set_cr4); 366EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -449,7 +455,7 @@ static u32 msrs_to_save[] = {
449 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 455 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
450#endif 456#endif
451 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 457 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
452 MSR_IA32_PERF_STATUS, 458 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
453}; 459};
454 460
455static unsigned num_msrs_to_save; 461static unsigned num_msrs_to_save;
@@ -648,10 +654,38 @@ static bool msr_mtrr_valid(unsigned msr)
648 654
649static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 655static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
650{ 656{
657 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
658
651 if (!msr_mtrr_valid(msr)) 659 if (!msr_mtrr_valid(msr))
652 return 1; 660 return 1;
653 661
654 vcpu->arch.mtrr[msr - 0x200] = data; 662 if (msr == MSR_MTRRdefType) {
663 vcpu->arch.mtrr_state.def_type = data;
664 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
665 } else if (msr == MSR_MTRRfix64K_00000)
666 p[0] = data;
667 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
668 p[1 + msr - MSR_MTRRfix16K_80000] = data;
669 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
670 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
671 else if (msr == MSR_IA32_CR_PAT)
672 vcpu->arch.pat = data;
673 else { /* Variable MTRRs */
674 int idx, is_mtrr_mask;
675 u64 *pt;
676
677 idx = (msr - 0x200) / 2;
678 is_mtrr_mask = msr - 0x200 - 2 * idx;
679 if (!is_mtrr_mask)
680 pt =
681 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
682 else
683 pt =
684 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
685 *pt = data;
686 }
687
688 kvm_mmu_reset_context(vcpu);
655 return 0; 689 return 0;
656} 690}
657 691
@@ -747,10 +781,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
747 781
748static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 782static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
749{ 783{
784 u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
785
750 if (!msr_mtrr_valid(msr)) 786 if (!msr_mtrr_valid(msr))
751 return 1; 787 return 1;
752 788
753 *pdata = vcpu->arch.mtrr[msr - 0x200]; 789 if (msr == MSR_MTRRdefType)
790 *pdata = vcpu->arch.mtrr_state.def_type +
791 (vcpu->arch.mtrr_state.enabled << 10);
792 else if (msr == MSR_MTRRfix64K_00000)
793 *pdata = p[0];
794 else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
795 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
796 else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
797 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
798 else if (msr == MSR_IA32_CR_PAT)
799 *pdata = vcpu->arch.pat;
800 else { /* Variable MTRRs */
801 int idx, is_mtrr_mask;
802 u64 *pt;
803
804 idx = (msr - 0x200) / 2;
805 is_mtrr_mask = msr - 0x200 - 2 * idx;
806 if (!is_mtrr_mask)
807 pt =
808 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
809 else
810 pt =
811 (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
812 *pdata = *pt;
813 }
814
754 return 0; 815 return 0;
755} 816}
756 817
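A worked example of the variable-range decode used in both hunks above (illustrative, not part of the patch): a write to MSR 0x203 gives idx = (0x203 - 0x200) / 2 = 1 and is_mtrr_mask = 3 - 2 * 1 = 1, so the value is stored as the mask of var_ranges[1] (MTRRphysMask1); MSR 0x202 gives is_mtrr_mask = 0 and selects the base of var_ranges[1] (MTRRphysBase1) instead.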
@@ -903,7 +964,6 @@ int kvm_dev_ioctl_check_extension(long ext)
903 case KVM_CAP_IRQCHIP: 964 case KVM_CAP_IRQCHIP:
904 case KVM_CAP_HLT: 965 case KVM_CAP_HLT:
905 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 966 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
906 case KVM_CAP_USER_MEMORY:
907 case KVM_CAP_SET_TSS_ADDR: 967 case KVM_CAP_SET_TSS_ADDR:
908 case KVM_CAP_EXT_CPUID: 968 case KVM_CAP_EXT_CPUID:
909 case KVM_CAP_CLOCKSOURCE: 969 case KVM_CAP_CLOCKSOURCE:
@@ -1188,6 +1248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1188 int t, times = entry->eax & 0xff; 1248 int t, times = entry->eax & 0xff;
1189 1249
1190 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1250 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1251 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1191 for (t = 1; t < times && *nent < maxnent; ++t) { 1252 for (t = 1; t < times && *nent < maxnent; ++t) {
1192 do_cpuid_1_ent(&entry[t], function, 0); 1253 do_cpuid_1_ent(&entry[t], function, 0);
1193 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; 1254 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -1218,7 +1279,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1218 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1279 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1219 /* read more entries until level_type is zero */ 1280 /* read more entries until level_type is zero */
1220 for (i = 1; *nent < maxnent; ++i) { 1281 for (i = 1; *nent < maxnent; ++i) {
1221 level_type = entry[i - 1].ecx & 0xff; 1282 level_type = entry[i - 1].ecx & 0xff00;
1222 if (!level_type) 1283 if (!level_type)
1223 break; 1284 break;
1224 do_cpuid_1_ent(&entry[i], function, i); 1285 do_cpuid_1_ent(&entry[i], function, i);
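(Reading aid, not part of the patch: this loop enumerates CPUID leaf 0x0b, where ECX bits 7:0 merely echo the requested sub-leaf number and bits 15:8 carry the level type that terminates the list. With the old 0xff mask the check saw sub-leaf 0's echoed level number, which is zero, and stopped after a single entry; 0xff00 tests the level-type field the loop is actually meant to check.)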
@@ -1318,6 +1379,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1318 return 0; 1379 return 0;
1319} 1380}
1320 1381
1382static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1383{
1384 vcpu_load(vcpu);
1385 kvm_inject_nmi(vcpu);
1386 vcpu_put(vcpu);
1387
1388 return 0;
1389}
1390
1321static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, 1391static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1322 struct kvm_tpr_access_ctl *tac) 1392 struct kvm_tpr_access_ctl *tac)
1323{ 1393{
@@ -1377,6 +1447,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1377 r = 0; 1447 r = 0;
1378 break; 1448 break;
1379 } 1449 }
1450 case KVM_NMI: {
1451 r = kvm_vcpu_ioctl_nmi(vcpu);
1452 if (r)
1453 goto out;
1454 r = 0;
1455 break;
1456 }
1380 case KVM_SET_CPUID: { 1457 case KVM_SET_CPUID: {
1381 struct kvm_cpuid __user *cpuid_arg = argp; 1458 struct kvm_cpuid __user *cpuid_arg = argp;
1382 struct kvm_cpuid cpuid; 1459 struct kvm_cpuid cpuid;
@@ -1968,7 +2045,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1968 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 2045 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1969 if (ret < 0) 2046 if (ret < 0)
1970 return 0; 2047 return 0;
1971 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 2048 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
1972 return 1; 2049 return 1;
1973} 2050}
1974 2051
@@ -2404,8 +2481,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2404 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2481 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2405 memcpy(vcpu->arch.pio_data, &val, 4); 2482 memcpy(vcpu->arch.pio_data, &val, 4);
2406 2483
2407 kvm_x86_ops->skip_emulated_instruction(vcpu);
2408
2409 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2484 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2410 if (pio_dev) { 2485 if (pio_dev) {
2411 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2486 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -2541,7 +2616,7 @@ int kvm_arch_init(void *opaque)
2541 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2616 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2542 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2617 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2543 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2618 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2544 PT_DIRTY_MASK, PT64_NX_MASK, 0); 2619 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
2545 return 0; 2620 return 0;
2546 2621
2547out: 2622out:
@@ -2729,7 +2804,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2729 2804
2730 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; 2805 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2731 /* when no next entry is found, the current entry[i] is reselected */ 2806 /* when no next entry is found, the current entry[i] is reselected */
2732 for (j = i + 1; j == i; j = (j + 1) % nent) { 2807 for (j = i + 1; ; j = (j + 1) % nent) {
2733 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; 2808 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2734 if (ej->function == e->function) { 2809 if (ej->function == e->function) {
2735 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; 2810 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
@@ -2973,7 +3048,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2973 pr_debug("vcpu %d received sipi with vector # %x\n", 3048 pr_debug("vcpu %d received sipi with vector # %x\n",
2974 vcpu->vcpu_id, vcpu->arch.sipi_vector); 3049 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2975 kvm_lapic_reset(vcpu); 3050 kvm_lapic_reset(vcpu);
2976 r = kvm_x86_ops->vcpu_reset(vcpu); 3051 r = kvm_arch_vcpu_reset(vcpu);
2977 if (r) 3052 if (r)
2978 return r; 3053 return r;
2979 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 3054 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3275,9 +3350,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3275 kvm_desct->padding = 0; 3350 kvm_desct->padding = 0;
3276} 3351}
3277 3352
3278static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, 3353static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3279 u16 selector, 3354 u16 selector,
3280 struct descriptor_table *dtable) 3355 struct descriptor_table *dtable)
3281{ 3356{
3282 if (selector & 1 << 2) { 3357 if (selector & 1 << 2) {
3283 struct kvm_segment kvm_seg; 3358 struct kvm_segment kvm_seg;
@@ -3302,7 +3377,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3302 struct descriptor_table dtable; 3377 struct descriptor_table dtable;
3303 u16 index = selector >> 3; 3378 u16 index = selector >> 3;
3304 3379
3305 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3380 get_segment_descriptor_dtable(vcpu, selector, &dtable);
3306 3381
3307 if (dtable.limit < index * 8 + 7) { 3382 if (dtable.limit < index * 8 + 7) {
3308 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3383 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3321,7 +3396,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3321 struct descriptor_table dtable; 3396 struct descriptor_table dtable;
3322 u16 index = selector >> 3; 3397 u16 index = selector >> 3;
3323 3398
3324 get_segment_descritptor_dtable(vcpu, selector, &dtable); 3399 get_segment_descriptor_dtable(vcpu, selector, &dtable);
3325 3400
3326 if (dtable.limit < index * 8 + 7) 3401 if (dtable.limit < index * 8 + 7)
3327 return 1; 3402 return 1;
@@ -3900,6 +3975,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3900 /* We do fxsave: this must be aligned. */ 3975 /* We do fxsave: this must be aligned. */
3901 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 3976 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3902 3977
3978 vcpu->arch.mtrr_state.have_fixed = 1;
3903 vcpu_load(vcpu); 3979 vcpu_load(vcpu);
3904 r = kvm_arch_vcpu_reset(vcpu); 3980 r = kvm_arch_vcpu_reset(vcpu);
3905 if (r == 0) 3981 if (r == 0)
@@ -3925,6 +4001,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3925 4001
3926int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 4002int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3927{ 4003{
4004 vcpu->arch.nmi_pending = false;
4005 vcpu->arch.nmi_injected = false;
4006
3928 return kvm_x86_ops->vcpu_reset(vcpu); 4007 return kvm_x86_ops->vcpu_reset(vcpu);
3929} 4008}
3930 4009
@@ -4012,6 +4091,7 @@ struct kvm *kvm_arch_create_vm(void)
4012 return ERR_PTR(-ENOMEM); 4091 return ERR_PTR(-ENOMEM);
4013 4092
4014 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4093 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4094 INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4015 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4095 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4016 4096
4017 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4097 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4048,8 +4128,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
4048 4128
4049void kvm_arch_destroy_vm(struct kvm *kvm) 4129void kvm_arch_destroy_vm(struct kvm *kvm)
4050{ 4130{
4051 kvm_iommu_unmap_guest(kvm);
4052 kvm_free_all_assigned_devices(kvm); 4131 kvm_free_all_assigned_devices(kvm);
4132 kvm_iommu_unmap_guest(kvm);
4053 kvm_free_pit(kvm); 4133 kvm_free_pit(kvm);
4054 kfree(kvm->arch.vpic); 4134 kfree(kvm->arch.vpic);
4055 kfree(kvm->arch.vioapic); 4135 kfree(kvm->arch.vioapic);
@@ -4127,7 +4207,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
4127int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4207int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4128{ 4208{
4129 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4209 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4130 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; 4210 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4211 || vcpu->arch.nmi_pending;
4131} 4212}
4132 4213
4133static void vcpu_kick_intr(void *info) 4214static void vcpu_kick_intr(void *info)
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ea051173b0da..d174db7a3370 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -58,6 +58,7 @@
58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ 58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
59#define SrcImm (5<<4) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */
61#define SrcMask (7<<4) 62#define SrcMask (7<<4)
62/* Generic ModRM decode. */ 63/* Generic ModRM decode. */
63#define ModRM (1<<7) 64#define ModRM (1<<7)
@@ -70,17 +71,23 @@
70#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 71#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
71#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 72#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
72#define GroupMask 0xff /* Group number stored in bits 0:7 */ 73#define GroupMask 0xff /* Group number stored in bits 0:7 */
74/* Source 2 operand type */
75#define Src2None (0<<29)
76#define Src2CL (1<<29)
77#define Src2ImmByte (2<<29)
78#define Src2One (3<<29)
79#define Src2Mask (7<<29)
73 80
74enum { 81enum {
75 Group1_80, Group1_81, Group1_82, Group1_83, 82 Group1_80, Group1_81, Group1_82, Group1_83,
76 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 83 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
77}; 84};
78 85
79static u16 opcode_table[256] = { 86static u32 opcode_table[256] = {
80 /* 0x00 - 0x07 */ 87 /* 0x00 - 0x07 */
81 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
82 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
83 0, 0, 0, 0, 90 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
84 /* 0x08 - 0x0F */ 91 /* 0x08 - 0x0F */
85 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
86 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = {
195 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, 202 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
196}; 203};
197 204
198static u16 twobyte_table[256] = { 205static u32 twobyte_table[256] = {
199 /* 0x00 - 0x0F */ 206 /* 0x00 - 0x0F */
200 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, 207 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
201 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 208 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = {
230 /* 0x90 - 0x9F */ 237 /* 0x90 - 0x9F */
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0xA0 - 0xA7 */ 239 /* 0xA0 - 0xA7 */
233 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 240 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
241 DstMem | SrcReg | Src2ImmByte | ModRM,
242 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
234 /* 0xA8 - 0xAF */ 243 /* 0xA8 - 0xAF */
235 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, 244 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
245 DstMem | SrcReg | Src2ImmByte | ModRM,
246 DstMem | SrcReg | Src2CL | ModRM,
247 ModRM, 0,
236 /* 0xB0 - 0xB7 */ 248 /* 0xB0 - 0xB7 */
237 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 249 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
238 DstMem | SrcReg | ModRM | BitOp, 250 DstMem | SrcReg | ModRM | BitOp,
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = {
253 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254}; 266};
255 267
256static u16 group_table[] = { 268static u32 group_table[] = {
257 [Group1_80*8] = 269 [Group1_80*8] =
258 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 270 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
259 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 271 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,9 +309,9 @@ static u16 group_table[] = {
297 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, 309 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
298}; 310};
299 311
300static u16 group2_table[] = { 312static u32 group2_table[] = {
301 [Group7*8] = 313 [Group7*8] =
302 SrcNone | ModRM, 0, 0, 0, 314 SrcNone | ModRM, 0, 0, SrcNone | ModRM,
303 SrcNone | ModRM | DstMem | Mov, 0, 315 SrcNone | ModRM | DstMem | Mov, 0,
304 SrcMem16 | ModRM | Mov, 0, 316 SrcMem16 | ModRM | Mov, 0,
305}; 317};
@@ -359,49 +371,48 @@ static u16 group2_table[] = {
359 "andl %"_msk",%"_LO32 _tmp"; " \ 371 "andl %"_msk",%"_LO32 _tmp"; " \
360 "orl %"_LO32 _tmp",%"_sav"; " 372 "orl %"_LO32 _tmp",%"_sav"; "
361 373
374#ifdef CONFIG_X86_64
375#define ON64(x) x
376#else
377#define ON64(x)
378#endif
379
380#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
381 do { \
382 __asm__ __volatile__ ( \
383 _PRE_EFLAGS("0", "4", "2") \
384 _op _suffix " %"_x"3,%1; " \
385 _POST_EFLAGS("0", "4", "2") \
386 : "=m" (_eflags), "=m" ((_dst).val), \
387 "=&r" (_tmp) \
388 : _y ((_src).val), "i" (EFLAGS_MASK)); \
389 } while (0)
390
391
362/* Raw emulation: instruction has two explicit operands. */ 392/* Raw emulation: instruction has two explicit operands. */
363#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ 393#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
364 do { \ 394 do { \
365 unsigned long _tmp; \ 395 unsigned long _tmp; \
366 \ 396 \
367 switch ((_dst).bytes) { \ 397 switch ((_dst).bytes) { \
368 case 2: \ 398 case 2: \
369 __asm__ __volatile__ ( \ 399 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
370 _PRE_EFLAGS("0", "4", "2") \ 400 break; \
371 _op"w %"_wx"3,%1; " \ 401 case 4: \
372 _POST_EFLAGS("0", "4", "2") \ 402 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
373 : "=m" (_eflags), "=m" ((_dst).val), \ 403 break; \
374 "=&r" (_tmp) \ 404 case 8: \
375 : _wy ((_src).val), "i" (EFLAGS_MASK)); \ 405 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
376 break; \ 406 break; \
377 case 4: \ 407 } \
378 __asm__ __volatile__ ( \
379 _PRE_EFLAGS("0", "4", "2") \
380 _op"l %"_lx"3,%1; " \
381 _POST_EFLAGS("0", "4", "2") \
382 : "=m" (_eflags), "=m" ((_dst).val), \
383 "=&r" (_tmp) \
384 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
385 break; \
386 case 8: \
387 __emulate_2op_8byte(_op, _src, _dst, \
388 _eflags, _qx, _qy); \
389 break; \
390 } \
391 } while (0) 408 } while (0)
392 409
393#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ 410#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
394 do { \ 411 do { \
395 unsigned long __tmp; \ 412 unsigned long _tmp; \
396 switch ((_dst).bytes) { \ 413 switch ((_dst).bytes) { \
397 case 1: \ 414 case 1: \
398 __asm__ __volatile__ ( \ 415 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
399 _PRE_EFLAGS("0", "4", "2") \
400 _op"b %"_bx"3,%1; " \
401 _POST_EFLAGS("0", "4", "2") \
402 : "=m" (_eflags), "=m" ((_dst).val), \
403 "=&r" (__tmp) \
404 : _by ((_src).val), "i" (EFLAGS_MASK)); \
405 break; \ 416 break; \
406 default: \ 417 default: \
407 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 418 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -425,71 +436,68 @@ static u16 group2_table[] = {
425 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 436 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
426 "w", "r", _LO32, "r", "", "r") 437 "w", "r", _LO32, "r", "", "r")
427 438
428/* Instruction has only one explicit operand (no source operand). */ 439/* Instruction has three operands and one operand is stored in ECX register */
429#define emulate_1op(_op, _dst, _eflags) \ 440#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
430 do { \ 441 do { \
431 unsigned long _tmp; \ 442 unsigned long _tmp; \
432 \ 443 _type _clv = (_cl).val; \
433 switch ((_dst).bytes) { \ 444 _type _srcv = (_src).val; \
434 case 1: \ 445 _type _dstv = (_dst).val; \
435 __asm__ __volatile__ ( \ 446 \
436 _PRE_EFLAGS("0", "3", "2") \ 447 __asm__ __volatile__ ( \
437 _op"b %1; " \ 448 _PRE_EFLAGS("0", "5", "2") \
438 _POST_EFLAGS("0", "3", "2") \ 449 _op _suffix " %4,%1 \n" \
439 : "=m" (_eflags), "=m" ((_dst).val), \ 450 _POST_EFLAGS("0", "5", "2") \
440 "=&r" (_tmp) \ 451 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
441 : "i" (EFLAGS_MASK)); \ 452 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
442 break; \ 453 ); \
443 case 2: \ 454 \
444 __asm__ __volatile__ ( \ 455 (_cl).val = (unsigned long) _clv; \
445 _PRE_EFLAGS("0", "3", "2") \ 456 (_src).val = (unsigned long) _srcv; \
446 _op"w %1; " \ 457 (_dst).val = (unsigned long) _dstv; \
447 _POST_EFLAGS("0", "3", "2") \
448 : "=m" (_eflags), "=m" ((_dst).val), \
449 "=&r" (_tmp) \
450 : "i" (EFLAGS_MASK)); \
451 break; \
452 case 4: \
453 __asm__ __volatile__ ( \
454 _PRE_EFLAGS("0", "3", "2") \
455 _op"l %1; " \
456 _POST_EFLAGS("0", "3", "2") \
457 : "=m" (_eflags), "=m" ((_dst).val), \
458 "=&r" (_tmp) \
459 : "i" (EFLAGS_MASK)); \
460 break; \
461 case 8: \
462 __emulate_1op_8byte(_op, _dst, _eflags); \
463 break; \
464 } \
465 } while (0) 458 } while (0)
466 459
467/* Emulate an instruction with quadword operands (x86/64 only). */ 460#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
468#if defined(CONFIG_X86_64) 461 do { \
469#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ 462 switch ((_dst).bytes) { \
470 do { \ 463 case 2: \
471 __asm__ __volatile__ ( \ 464 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
472 _PRE_EFLAGS("0", "4", "2") \ 465 "w", unsigned short); \
473 _op"q %"_qx"3,%1; " \ 466 break; \
474 _POST_EFLAGS("0", "4", "2") \ 467 case 4: \
475 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ 468 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
476 : _qy ((_src).val), "i" (EFLAGS_MASK)); \ 469 "l", unsigned int); \
470 break; \
471 case 8: \
472 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
473 "q", unsigned long)); \
474 break; \
475 } \
477 } while (0) 476 } while (0)
478 477
479#define __emulate_1op_8byte(_op, _dst, _eflags) \ 478#define __emulate_1op(_op, _dst, _eflags, _suffix) \
480 do { \ 479 do { \
481 __asm__ __volatile__ ( \ 480 unsigned long _tmp; \
482 _PRE_EFLAGS("0", "3", "2") \ 481 \
483 _op"q %1; " \ 482 __asm__ __volatile__ ( \
484 _POST_EFLAGS("0", "3", "2") \ 483 _PRE_EFLAGS("0", "3", "2") \
485 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ 484 _op _suffix " %1; " \
486 : "i" (EFLAGS_MASK)); \ 485 _POST_EFLAGS("0", "3", "2") \
486 : "=m" (_eflags), "+m" ((_dst).val), \
487 "=&r" (_tmp) \
488 : "i" (EFLAGS_MASK)); \
487 } while (0) 489 } while (0)
488 490
489#elif defined(__i386__) 491/* Instruction has only one explicit operand (no source operand). */
490#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) 492#define emulate_1op(_op, _dst, _eflags) \
491#define __emulate_1op_8byte(_op, _dst, _eflags) 493 do { \
492#endif /* __i386__ */ 494 switch ((_dst).bytes) { \
495 case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
496 case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
497 case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
498 case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
499 } \
500 } while (0)
493 501
494/* Fetch next part of the instruction being emulated. */ 502/* Fetch next part of the instruction being emulated. */
495#define insn_fetch(_type, _size, _eip) \ 503#define insn_fetch(_type, _size, _eip) \
@@ -1041,6 +1049,33 @@ done_prefixes:
1041 c->src.bytes = 1; 1049 c->src.bytes = 1;
1042 c->src.val = insn_fetch(s8, 1, c->eip); 1050 c->src.val = insn_fetch(s8, 1, c->eip);
1043 break; 1051 break;
1052 case SrcOne:
1053 c->src.bytes = 1;
1054 c->src.val = 1;
1055 break;
1056 }
1057
1058 /*
1059 * Decode and fetch the second source operand: register, memory
1060 * or immediate.
1061 */
1062 switch (c->d & Src2Mask) {
1063 case Src2None:
1064 break;
1065 case Src2CL:
1066 c->src2.bytes = 1;
1067 c->src2.val = c->regs[VCPU_REGS_RCX] & 0xff; /* the count comes from CL */
1068 break;
1069 case Src2ImmByte:
1070 c->src2.type = OP_IMM;
1071 c->src2.ptr = (unsigned long *)c->eip;
1072 c->src2.bytes = 1;
1073 c->src2.val = insn_fetch(u8, 1, c->eip);
1074 break;
1075 case Src2One:
1076 c->src2.bytes = 1;
1077 c->src2.val = 1;
1078 break;
1044 } 1079 }
1045 1080
1046 /* Decode and fetch the destination operand: register or memory. */ 1081 /* Decode and fetch the destination operand: register or memory. */
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1100 c->regs[VCPU_REGS_RSP]); 1135 c->regs[VCPU_REGS_RSP]);
1101} 1136}
1102 1137
1103static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1138static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1104 struct x86_emulate_ops *ops) 1139 struct x86_emulate_ops *ops)
1105{ 1140{
1106 struct decode_cache *c = &ctxt->decode; 1141 struct decode_cache *c = &ctxt->decode;
1107 int rc; 1142 int rc;
1108 1143
1109 rc = ops->read_std(register_address(c, ss_base(ctxt), 1144 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1110 c->regs[VCPU_REGS_RSP]), 1145 c->regs[VCPU_REGS_RSP]),
1111 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1146 &c->src.val, c->src.bytes, ctxt->vcpu);
1112 if (rc != 0) 1147 if (rc != 0)
1113 return rc; 1148 return rc;
1114 1149
1115 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); 1150 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
1151 return rc;
1152}
1153
1154static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1155 struct x86_emulate_ops *ops)
1156{
1157 struct decode_cache *c = &ctxt->decode;
1158 int rc;
1116 1159
1160 c->src.bytes = c->dst.bytes;
1161 rc = emulate_pop(ctxt, ops);
1162 if (rc != 0)
1163 return rc;
1164 c->dst.val = c->src.val;
1117 return 0; 1165 return 0;
1118} 1166}
1119 1167
@@ -1415,24 +1463,15 @@ special_insn:
1415 emulate_1op("dec", c->dst, ctxt->eflags); 1463 emulate_1op("dec", c->dst, ctxt->eflags);
1416 break; 1464 break;
1417 case 0x50 ... 0x57: /* push reg */ 1465 case 0x50 ... 0x57: /* push reg */
1418 c->dst.type = OP_MEM; 1466 emulate_push(ctxt);
1419 c->dst.bytes = c->op_bytes;
1420 c->dst.val = c->src.val;
1421 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1422 -c->op_bytes);
1423 c->dst.ptr = (void *) register_address(
1424 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1425 break; 1467 break;
1426 case 0x58 ... 0x5f: /* pop reg */ 1468 case 0x58 ... 0x5f: /* pop reg */
1427 pop_instruction: 1469 pop_instruction:
1428 if ((rc = ops->read_std(register_address(c, ss_base(ctxt), 1470 c->src.bytes = c->op_bytes;
1429 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1471 rc = emulate_pop(ctxt, ops);
1430 c->op_bytes, ctxt->vcpu)) != 0) 1472 if (rc != 0)
1431 goto done; 1473 goto done;
1432 1474 c->dst.val = c->src.val;
1433 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1434 c->op_bytes);
1435 c->dst.type = OP_NONE; /* Disable writeback. */
1436 break; 1475 break;
1437 case 0x63: /* movsxd */ 1476 case 0x63: /* movsxd */
1438 if (ctxt->mode != X86EMUL_MODE_PROT64) 1477 if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1591,7 +1630,9 @@ special_insn:
1591 emulate_push(ctxt); 1630 emulate_push(ctxt);
1592 break; 1631 break;
1593 case 0x9d: /* popf */ 1632 case 0x9d: /* popf */
1633 c->dst.type = OP_REG;
1594 c->dst.ptr = (unsigned long *) &ctxt->eflags; 1634 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1635 c->dst.bytes = c->op_bytes;
1595 goto pop_instruction; 1636 goto pop_instruction;
1596 case 0xa0 ... 0xa1: /* mov */ 1637 case 0xa0 ... 0xa1: /* mov */
1597 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1638 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
@@ -1689,7 +1730,9 @@ special_insn:
1689 emulate_grp2(ctxt); 1730 emulate_grp2(ctxt);
1690 break; 1731 break;
1691 case 0xc3: /* ret */ 1732 case 0xc3: /* ret */
1733 c->dst.type = OP_REG;
1692 c->dst.ptr = &c->eip; 1734 c->dst.ptr = &c->eip;
1735 c->dst.bytes = c->op_bytes;
1693 goto pop_instruction; 1736 goto pop_instruction;
1694 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 1737 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1695 mov: 1738 mov:
@@ -1778,7 +1821,7 @@ special_insn:
1778 c->eip = saved_eip; 1821 c->eip = saved_eip;
1779 goto cannot_emulate; 1822 goto cannot_emulate;
1780 } 1823 }
1781 return 0; 1824 break;
1782 case 0xf4: /* hlt */ 1825 case 0xf4: /* hlt */
1783 ctxt->vcpu->arch.halt_request = 1; 1826 ctxt->vcpu->arch.halt_request = 1;
1784 break; 1827 break;
@@ -1999,12 +2042,20 @@ twobyte_insn:
1999 c->src.val &= (c->dst.bytes << 3) - 1; 2042 c->src.val &= (c->dst.bytes << 3) - 1;
2000 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); 2043 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
2001 break; 2044 break;
2045 case 0xa4: /* shld imm8, r, r/m */
2046 case 0xa5: /* shld cl, r, r/m */
2047 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2048 break;
2002 case 0xab: 2049 case 0xab:
2003 bts: /* bts */ 2050 bts: /* bts */
2004 /* only subword offset */ 2051 /* only subword offset */
2005 c->src.val &= (c->dst.bytes << 3) - 1; 2052 c->src.val &= (c->dst.bytes << 3) - 1;
2006 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 2053 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
2007 break; 2054 break;
2055 case 0xac: /* shrd imm8, r, r/m */
2056 case 0xad: /* shrd cl, r, r/m */
2057 emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
2058 break;
2008 case 0xae: /* clflush */ 2059 case 0xae: /* clflush */
2009 break; 2060 break;
2010 case 0xb0 ... 0xb1: /* cmpxchg */ 2061 case 0xb0 ... 0xb1: /* cmpxchg */
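For orientation (illustrative, not part of the patch): the double-precision shifts added above behave as follows. shld %cl, %rbx, %rax shifts RAX left by CL bits and fills the vacated low-order bits from the high-order bits of RBX; shrd is the mirror image, shifting right and filling from RBX's low-order bits. In the decode, c->dst is the ModRM r/m operand being shifted, c->src is the register supplying the fill bits, and c->src2 carries the CL or imm8 shift count.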
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c16d9be1b017..3bbdb9d02376 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
79 if (IS_ERR(anon_inode_inode)) 79 if (IS_ERR(anon_inode_inode))
80 return -ENODEV; 80 return -ENODEV;
81 81
82 if (fops->owner && !try_module_get(fops->owner))
83 return -ENOENT;
84
82 error = get_unused_fd_flags(flags); 85 error = get_unused_fd_flags(flags);
83 if (error < 0) 86 if (error < 0)
84 return error; 87 goto err_module;
85 fd = error; 88 fd = error;
86 89
87 /* 90 /*
@@ -128,6 +131,8 @@ err_dput:
128 dput(dentry); 131 dput(dentry);
129err_put_unused_fd: 132err_put_unused_fd:
130 put_unused_fd(fd); 133 put_unused_fd(fd);
134err_module:
135 module_put(fops->owner);
131 return error; 136 return error;
132} 137}
133EXPORT_SYMBOL_GPL(anon_inode_getfd); 138EXPORT_SYMBOL_GPL(anon_inode_getfd);
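A minimal sketch of the calling convention this change protects (illustrative only; foo_fops, foo_release and foo_create_fd are made-up names): any driver that hands out anonymous-inode fds and sets fops->owner now keeps its module pinned for as long as those fds are open.

#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/module.h>

static int foo_release(struct inode *inode, struct file *filp)
{
        /* drop whatever filp->private_data references here */
        return 0;
}

static const struct file_operations foo_fops = {
        .owner   = THIS_MODULE,
        .release = foo_release,
};

int foo_create_fd(void *priv)
{
        /* with .owner set, the returned fd holds a module reference until release */
        return anon_inode_getfd("[foo]", &foo_fops, priv, 0);
}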
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index f18b86fa8655..35525ac63337 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -83,6 +83,7 @@ struct kvm_irqchip {
83#define KVM_EXIT_S390_SIEIC 13 83#define KVM_EXIT_S390_SIEIC 13
84#define KVM_EXIT_S390_RESET 14 84#define KVM_EXIT_S390_RESET 14
85#define KVM_EXIT_DCR 15 85#define KVM_EXIT_DCR 15
86#define KVM_EXIT_NMI 16
86 87
87/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 88/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
88struct kvm_run { 89struct kvm_run {
@@ -387,6 +388,14 @@ struct kvm_trace_rec {
387#define KVM_CAP_DEVICE_ASSIGNMENT 17 388#define KVM_CAP_DEVICE_ASSIGNMENT 17
388#endif 389#endif
389#define KVM_CAP_IOMMU 18 390#define KVM_CAP_IOMMU 18
391#if defined(CONFIG_X86)
392#define KVM_CAP_DEVICE_MSI 20
393#endif
394/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
395#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
396#if defined(CONFIG_X86)
397#define KVM_CAP_USER_NMI 22
398#endif
390 399
391/* 400/*
392 * ioctls for VM fds 401 * ioctls for VM fds
@@ -458,6 +467,8 @@ struct kvm_trace_rec {
458#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97) 467#define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97)
459#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) 468#define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state)
460#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) 469#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state)
470/* Available with KVM_CAP_USER_NMI */
471#define KVM_NMI _IO(KVMIO, 0x9a)
461 472
462#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) 473#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
463#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) 474#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
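A usage sketch for the new ioctl (illustrative only; inject_guest_nmi is a made-up helper, and it assumes /dev/kvm and a vCPU fd are already open and that KVM_CAP_USER_NMI is visible to userspace despite the CONFIG_X86 guard above):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns 0 on success, -1 if the capability is missing or the ioctl fails. */
static int inject_guest_nmi(int kvm_fd, int vcpu_fd)
{
        if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_USER_NMI) <= 0)
                return -1;              /* kernel without KVM_NMI support */
        return ioctl(vcpu_fd, KVM_NMI); /* queue an NMI for this vCPU */
}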
@@ -500,10 +511,17 @@ struct kvm_assigned_irq {
500 __u32 guest_irq; 511 __u32 guest_irq;
501 __u32 flags; 512 __u32 flags;
502 union { 513 union {
514 struct {
515 __u32 addr_lo;
516 __u32 addr_hi;
517 __u32 data;
518 } guest_msi;
503 __u32 reserved[12]; 519 __u32 reserved[12];
504 }; 520 };
505}; 521};
506 522
507#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 523#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
508 524
525#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 0)
526
509#endif 527#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bb92be2153bc..eafabd5c66b2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -16,6 +16,7 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/preempt.h> 17#include <linux/preempt.h>
18#include <linux/marker.h> 18#include <linux/marker.h>
19#include <linux/msi.h>
19#include <asm/signal.h> 20#include <asm/signal.h>
20 21
21#include <linux/kvm.h> 22#include <linux/kvm.h>
@@ -306,8 +307,14 @@ struct kvm_assigned_dev_kernel {
306 int host_busnr; 307 int host_busnr;
307 int host_devfn; 308 int host_devfn;
308 int host_irq; 309 int host_irq;
310 bool host_irq_disabled;
309 int guest_irq; 311 int guest_irq;
310 int irq_requested; 312 struct msi_msg guest_msi;
313#define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0)
314#define KVM_ASSIGNED_DEV_GUEST_MSI (1 << 1)
315#define KVM_ASSIGNED_DEV_HOST_INTX (1 << 8)
316#define KVM_ASSIGNED_DEV_HOST_MSI (1 << 9)
317 unsigned long irq_requested_type;
311 int irq_source_id; 318 int irq_source_id;
312 struct pci_dev *dev; 319 struct pci_dev *dev;
313 struct kvm *kvm; 320 struct kvm *kvm;
@@ -316,8 +323,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
316void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); 323void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
317void kvm_register_irq_ack_notifier(struct kvm *kvm, 324void kvm_register_irq_ack_notifier(struct kvm *kvm,
318 struct kvm_irq_ack_notifier *kian); 325 struct kvm_irq_ack_notifier *kian);
319void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 326void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
320 struct kvm_irq_ack_notifier *kian);
321int kvm_request_irq_source_id(struct kvm *kvm); 327int kvm_request_irq_source_id(struct kvm *kvm);
322void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); 328void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
323 329
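
In kvm_host.h the single irq_requested flag becomes irq_requested_type, a small bit set that records the guest-visible interrupt style (INTx or MSI, low bits) separately from how the host IRQ is actually wired up (bits 8 and up), alongside the guest's MSI message and a host_irq_disabled marker. A stand-alone illustration of how those bits combine is below; it reuses the #defines from the hunk, but the helper names are invented for the example.

    /* irq_type_bits.c - illustration of the irq_requested_type bit layout. */
    #include <stdio.h>

    #define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0)
    #define KVM_ASSIGNED_DEV_GUEST_MSI  (1 << 1)
    #define KVM_ASSIGNED_DEV_HOST_INTX  (1 << 8)
    #define KVM_ASSIGNED_DEV_HOST_MSI   (1 << 9)

    /* Hypothetical predicates: what the dispatch and teardown paths test for. */
    static int guest_wants_msi(unsigned long type)
    {
        return (type & KVM_ASSIGNED_DEV_GUEST_MSI) != 0;
    }

    static int host_uses_msi(unsigned long type)
    {
        return (type & KVM_ASSIGNED_DEV_HOST_MSI) != 0;
    }

    int main(void)
    {
        /* msi2intx-style setup: the guest sees INTx while the host IRQ is MSI. */
        unsigned long type = KVM_ASSIGNED_DEV_GUEST_INTX | KVM_ASSIGNED_DEV_HOST_MSI;

        printf("guest msi: %d, host msi: %d\n",
               guest_wants_msi(type), host_uses_msi(type));
        return 0;
    }
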
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 53772bb46320..23b81cf242af 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -150,10 +150,11 @@ static int ioapic_inj_irq(struct kvm_ioapic *ioapic,
150static void ioapic_inj_nmi(struct kvm_vcpu *vcpu) 150static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
151{ 151{
152 kvm_inject_nmi(vcpu); 152 kvm_inject_nmi(vcpu);
153 kvm_vcpu_kick(vcpu);
153} 154}
154 155
155static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, 156u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
156 u8 dest_mode) 157 u8 dest_mode)
157{ 158{
158 u32 mask = 0; 159 u32 mask = 0;
159 int i; 160 int i;
@@ -207,7 +208,8 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
207 "vector=%x trig_mode=%x\n", 208 "vector=%x trig_mode=%x\n",
208 dest, dest_mode, delivery_mode, vector, trig_mode); 209 dest, dest_mode, delivery_mode, vector, trig_mode);
209 210
210 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); 211 deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest,
212 dest_mode);
211 if (!deliver_bitmask) { 213 if (!deliver_bitmask) {
212 ioapic_debug("no target on destination\n"); 214 ioapic_debug("no target on destination\n");
213 return 0; 215 return 0;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index cd7ae7691c9d..49c9581d2586 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -85,5 +85,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
85int kvm_ioapic_init(struct kvm *kvm); 85int kvm_ioapic_init(struct kvm *kvm);
86void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); 86void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
87void kvm_ioapic_reset(struct kvm_ioapic *ioapic); 87void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
88u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
89 u8 dest_mode);
88 90
89#endif 91#endif
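
The ioapic change renames the static ioapic_get_delivery_bitmask() to kvm_ioapic_get_delivery_bitmask() and exports it through ioapic.h so the MSI dispatch code added in kvm_main.c can reuse it. The toy model below shows the general idea of mapping a destination (dest, dest_mode) to a bitmask of target vcpus; it is a deliberately simplified user-space model (flat logical mode, 8-bit ids, invented data), not the kernel's implementation.

    /* delivery_bitmask_model.c - simplified model of building a vcpu delivery
     * bitmask from an interrupt destination, in the spirit of
     * kvm_ioapic_get_delivery_bitmask(); the APIC ids and LDRs are made up. */
    #include <stdio.h>
    #include <stdint.h>

    #define NVCPUS 4

    struct vcpu_apic {
        uint8_t id;   /* physical APIC id */
        uint8_t ldr;  /* logical destination bitmap (flat model) */
    };

    static uint32_t delivery_bitmask(const struct vcpu_apic *v, int n,
                                     uint8_t dest, int dest_mode_logical)
    {
        uint32_t mask = 0;

        for (int i = 0; i < n; i++) {
            if (dest_mode_logical) {
                if (v[i].ldr & dest)                 /* logical: bitwise match */
                    mask |= 1u << i;
            } else {
                if (dest == 0xff || v[i].id == dest) /* physical: broadcast or exact id */
                    mask |= 1u << i;
            }
        }
        return mask;
    }

    int main(void)
    {
        struct vcpu_apic vcpus[NVCPUS] = {
            { .id = 0, .ldr = 1 }, { .id = 1, .ldr = 2 },
            { .id = 2, .ldr = 4 }, { .id = 3, .ldr = 8 },
        };

        printf("physical dest 2  -> 0x%x\n", delivery_bitmask(vcpus, NVCPUS, 2, 0));
        printf("logical dest 0x3 -> 0x%x\n", delivery_bitmask(vcpus, NVCPUS, 0x3, 1));
        return 0;
    }
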
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 55ad76ee2d09..aa5d1e5c497e 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -61,10 +61,9 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
61 hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); 61 hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
62} 62}
63 63
64void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 64void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian)
65 struct kvm_irq_ack_notifier *kian)
66{ 65{
67 hlist_del(&kian->link); 66 hlist_del_init(&kian->link);
68} 67}
69 68
70/* The caller must hold kvm->lock mutex */ 69/* The caller must hold kvm->lock mutex */
@@ -73,11 +72,15 @@ int kvm_request_irq_source_id(struct kvm *kvm)
73 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; 72 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
74 int irq_source_id = find_first_zero_bit(bitmap, 73 int irq_source_id = find_first_zero_bit(bitmap,
75 sizeof(kvm->arch.irq_sources_bitmap)); 74 sizeof(kvm->arch.irq_sources_bitmap));
75
76 if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { 76 if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
77 printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); 77 printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
78 irq_source_id = -EFAULT; 78 return -EFAULT;
79 } else 79 }
80 set_bit(irq_source_id, bitmap); 80
81 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
82 set_bit(irq_source_id, bitmap);
83
81 return irq_source_id; 84 return irq_source_id;
82} 85}
83 86
@@ -85,7 +88,9 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
85{ 88{
86 int i; 89 int i;
87 90
88 if (irq_source_id <= 0 || 91 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
92
93 if (irq_source_id < 0 ||
89 irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { 94 irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
90 printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); 95 printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
91 return; 96 return;
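
The irq_comm.c hunks make kvm_request_irq_source_id() fail fast with -EFAULT when the bitmap is full and assert that the reserved user-space source id is never handed out or freed. A user-space analog of that bitmap allocator is sketched below; the names are invented, and the reserved id 0 only stands in for KVM_USERSPACE_IRQ_SOURCE_ID.

    /* irq_source_ids.c - user-space analog of bitmap-based source-id allocation,
     * mirroring the request/free pattern above (names invented for the example). */
    #include <stdio.h>

    #define MAX_IRQ_SOURCES          16
    #define USERSPACE_IRQ_SOURCE_ID  0   /* reserved, never recycled */

    static unsigned long source_bitmap = 1UL << USERSPACE_IRQ_SOURCE_ID;

    static int request_irq_source_id(void)
    {
        for (int id = 0; id < MAX_IRQ_SOURCES; id++) {
            if (!(source_bitmap & (1UL << id))) {
                source_bitmap |= 1UL << id;      /* claim the first free id */
                return id;
            }
        }
        return -1;                               /* exhausted, like the -EFAULT path */
    }

    static void free_irq_source_id(int id)
    {
        if (id < 0 || id >= MAX_IRQ_SOURCES || id == USERSPACE_IRQ_SOURCE_ID)
            return;                              /* out of range or reserved: ignore */
        source_bitmap &= ~(1UL << id);
    }

    int main(void)
    {
        int a = request_irq_source_id();
        int b = request_irq_source_id();
        printf("allocated %d and %d\n", a, b);   /* 1 and 2: id 0 stays reserved */
        free_irq_source_id(b);
        free_irq_source_id(a);
        return 0;
    }
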
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a87f45edfae8..fc6127cbea1f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,6 +47,10 @@
47#include <asm/uaccess.h> 47#include <asm/uaccess.h>
48#include <asm/pgtable.h> 48#include <asm/pgtable.h>
49 49
50#ifdef CONFIG_X86
51#include <asm/msidef.h>
52#endif
53
50#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 54#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
51#include "coalesced_mmio.h" 55#include "coalesced_mmio.h"
52#endif 56#endif
@@ -60,10 +64,13 @@
60MODULE_AUTHOR("Qumranet"); 64MODULE_AUTHOR("Qumranet");
61MODULE_LICENSE("GPL"); 65MODULE_LICENSE("GPL");
62 66
67static int msi2intx = 1;
68module_param(msi2intx, bool, 0);
69
63DEFINE_SPINLOCK(kvm_lock); 70DEFINE_SPINLOCK(kvm_lock);
64LIST_HEAD(vm_list); 71LIST_HEAD(vm_list);
65 72
66static cpumask_t cpus_hardware_enabled; 73static cpumask_var_t cpus_hardware_enabled;
67 74
68struct kmem_cache *kvm_vcpu_cache; 75struct kmem_cache *kvm_vcpu_cache;
69EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 76EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
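
This hunk adds the msi2intx module parameter (MSI on the host side, INTx toward the guest, on by default) and turns cpus_hardware_enabled from a static cpumask_t into a cpumask_var_t allocated at init time, so the mask no longer embeds space for NR_CPUS bits in the object itself. The user-space analog below shows the same idea of sizing a CPU mask at runtime; the types and helpers are invented for the example, not the kernel API.

    /* cpumask_var_demo.c - user-space analog of the cpumask_var_t pattern:
     * size the mask for the number of CPUs discovered at runtime instead of
     * embedding a fixed, potentially huge array in a global or on the stack. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define BITS_PER_WORD (8 * sizeof(unsigned long))

    typedef unsigned long *cpumask_var;

    static int alloc_cpumask(cpumask_var *mask, long ncpus)
    {
        size_t words = (ncpus + BITS_PER_WORD - 1) / BITS_PER_WORD;
        *mask = calloc(words, sizeof(unsigned long));
        return *mask != NULL;
    }

    static void set_cpu(cpumask_var mask, long cpu)
    {
        mask[cpu / BITS_PER_WORD] |= 1UL << (cpu % BITS_PER_WORD);
    }

    static int test_cpu(cpumask_var mask, long cpu)
    {
        return (mask[cpu / BITS_PER_WORD] >> (cpu % BITS_PER_WORD)) & 1;
    }

    int main(void)
    {
        long ncpus = sysconf(_SC_NPROCESSORS_CONF);
        cpumask_var enabled;

        if (ncpus < 1)
            ncpus = 1;
        if (!alloc_cpumask(&enabled, ncpus))
            return 1;
        set_cpu(enabled, 0);
        printf("cpu0: %d, cpu%ld: %d\n",
               test_cpu(enabled, 0), ncpus - 1, test_cpu(enabled, ncpus - 1));
        free(enabled);
        return 0;
    }
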
@@ -75,9 +82,60 @@ struct dentry *kvm_debugfs_dir;
75static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 82static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
76 unsigned long arg); 83 unsigned long arg);
77 84
78bool kvm_rebooting; 85static bool kvm_rebooting;
79 86
80#ifdef KVM_CAP_DEVICE_ASSIGNMENT 87#ifdef KVM_CAP_DEVICE_ASSIGNMENT
88
89#ifdef CONFIG_X86
90static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev)
91{
92 int vcpu_id;
93 struct kvm_vcpu *vcpu;
94 struct kvm_ioapic *ioapic = ioapic_irqchip(dev->kvm);
95 int dest_id = (dev->guest_msi.address_lo & MSI_ADDR_DEST_ID_MASK)
96 >> MSI_ADDR_DEST_ID_SHIFT;
97 int vector = (dev->guest_msi.data & MSI_DATA_VECTOR_MASK)
98 >> MSI_DATA_VECTOR_SHIFT;
99 int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
100 (unsigned long *)&dev->guest_msi.address_lo);
101 int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
102 (unsigned long *)&dev->guest_msi.data);
103 int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
104 (unsigned long *)&dev->guest_msi.data);
105 u32 deliver_bitmask;
106
107 BUG_ON(!ioapic);
108
109 deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
110 dest_id, dest_mode);
111 /* IOAPIC delivery mode value is the same as MSI here */
112 switch (delivery_mode) {
113 case IOAPIC_LOWEST_PRIORITY:
114 vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
115 deliver_bitmask);
116 if (vcpu != NULL)
117 kvm_apic_set_irq(vcpu, vector, trig_mode);
118 else
119 printk(KERN_INFO "kvm: null lowest priority vcpu!\n");
120 break;
121 case IOAPIC_FIXED:
122 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
123 if (!(deliver_bitmask & (1 << vcpu_id)))
124 continue;
125 deliver_bitmask &= ~(1 << vcpu_id);
126 vcpu = ioapic->kvm->vcpus[vcpu_id];
127 if (vcpu)
128 kvm_apic_set_irq(vcpu, vector, trig_mode);
129 }
130 break;
131 default:
132 printk(KERN_INFO "kvm: unsupported MSI delivery mode\n");
133 }
134}
135#else
136static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {}
137#endif
138
81static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 139static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
82 int assigned_dev_id) 140 int assigned_dev_id)
83{ 141{
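
assigned_device_msi_dispatch() above has to take the guest-programmed MSI address/data pair apart before it can reuse the ioapic delivery code: destination id and mode come out of the address, vector, delivery mode and trigger mode out of the data. The small decoder below does the same field extraction in user space using the conventional x86 MSI layout; the macro names are local to the example, not the kernel's msidef.h definitions included earlier in this file.

    /* msi_decode.c - decode an MSI address/data pair the way the dispatch path
     * needs to.  Field positions follow the usual x86 MSI message format. */
    #include <stdio.h>
    #include <stdint.h>

    #define MSI_DEST_ID(addr)    (((addr) >> 12) & 0xff)   /* address bits 19:12 */
    #define MSI_DEST_MODE(addr)  (((addr) >> 2) & 0x1)     /* 0 physical, 1 logical */
    #define MSI_VECTOR(data)     ((data) & 0xff)           /* data bits 7:0 */
    #define MSI_DELIVERY(data)   (((data) >> 8) & 0x7)     /* 0 fixed, 1 lowest prio */
    #define MSI_TRIGGER(data)    (((data) >> 15) & 0x1)    /* 0 edge, 1 level */

    int main(void)
    {
        /* Example message: 0xfee01000 targets APIC id 1 in physical mode;
         * data 0x0041 is vector 0x41, fixed delivery, edge triggered. */
        uint32_t address_lo = 0xfee01000;
        uint32_t data = 0x0041;

        printf("dest id %u, %s, vector 0x%x, delivery %u, %s-triggered\n",
               MSI_DEST_ID(address_lo),
               MSI_DEST_MODE(address_lo) ? "logical" : "physical",
               MSI_VECTOR(data),
               MSI_DELIVERY(data),
               MSI_TRIGGER(data) ? "level" : "edge");
        return 0;
    }
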
@@ -104,9 +162,16 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
104 * finer-grained lock, update this 162 * finer-grained lock, update this
105 */ 163 */
106 mutex_lock(&assigned_dev->kvm->lock); 164 mutex_lock(&assigned_dev->kvm->lock);
107 kvm_set_irq(assigned_dev->kvm, 165 if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX)
108 assigned_dev->irq_source_id, 166 kvm_set_irq(assigned_dev->kvm,
109 assigned_dev->guest_irq, 1); 167 assigned_dev->irq_source_id,
168 assigned_dev->guest_irq, 1);
169 else if (assigned_dev->irq_requested_type &
170 KVM_ASSIGNED_DEV_GUEST_MSI) {
171 assigned_device_msi_dispatch(assigned_dev);
172 enable_irq(assigned_dev->host_irq);
173 assigned_dev->host_irq_disabled = false;
174 }
110 mutex_unlock(&assigned_dev->kvm->lock); 175 mutex_unlock(&assigned_dev->kvm->lock);
111 kvm_put_kvm(assigned_dev->kvm); 176 kvm_put_kvm(assigned_dev->kvm);
112} 177}
@@ -117,8 +182,12 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
117 (struct kvm_assigned_dev_kernel *) dev_id; 182 (struct kvm_assigned_dev_kernel *) dev_id;
118 183
119 kvm_get_kvm(assigned_dev->kvm); 184 kvm_get_kvm(assigned_dev->kvm);
185
120 schedule_work(&assigned_dev->interrupt_work); 186 schedule_work(&assigned_dev->interrupt_work);
187
121 disable_irq_nosync(irq); 188 disable_irq_nosync(irq);
189 assigned_dev->host_irq_disabled = true;
190
122 return IRQ_HANDLED; 191 return IRQ_HANDLED;
123} 192}
124 193
@@ -132,19 +201,32 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
132 201
133 dev = container_of(kian, struct kvm_assigned_dev_kernel, 202 dev = container_of(kian, struct kvm_assigned_dev_kernel,
134 ack_notifier); 203 ack_notifier);
204
135 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 205 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
136 enable_irq(dev->host_irq); 206
207 /* The guest irq may be shared so this ack may be
208 * from another device.
209 */
210 if (dev->host_irq_disabled) {
211 enable_irq(dev->host_irq);
212 dev->host_irq_disabled = false;
213 }
137} 214}
138 215
139static void kvm_free_assigned_device(struct kvm *kvm, 216static void kvm_free_assigned_irq(struct kvm *kvm,
140 struct kvm_assigned_dev_kernel 217 struct kvm_assigned_dev_kernel *assigned_dev)
141 *assigned_dev)
142{ 218{
143 if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) 219 if (!irqchip_in_kernel(kvm))
144 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 220 return;
221
222 kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
145 223
146 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); 224 if (assigned_dev->irq_source_id != -1)
147 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); 225 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
226 assigned_dev->irq_source_id = -1;
227
228 if (!assigned_dev->irq_requested_type)
229 return;
148 230
149 if (cancel_work_sync(&assigned_dev->interrupt_work)) 231 if (cancel_work_sync(&assigned_dev->interrupt_work))
150 /* We had pending work. That means we will have to take 232 /* We had pending work. That means we will have to take
@@ -152,6 +234,23 @@ static void kvm_free_assigned_device(struct kvm *kvm,
152 */ 234 */
153 kvm_put_kvm(kvm); 235 kvm_put_kvm(kvm);
154 236
237 free_irq(assigned_dev->host_irq, (void *)assigned_dev);
238
239 if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
240 pci_disable_msi(assigned_dev->dev);
241
242 assigned_dev->irq_requested_type = 0;
243}
244
245
246static void kvm_free_assigned_device(struct kvm *kvm,
247 struct kvm_assigned_dev_kernel
248 *assigned_dev)
249{
250 kvm_free_assigned_irq(kvm, assigned_dev);
251
252 pci_reset_function(assigned_dev->dev);
253
155 pci_release_regions(assigned_dev->dev); 254 pci_release_regions(assigned_dev->dev);
156 pci_disable_device(assigned_dev->dev); 255 pci_disable_device(assigned_dev->dev);
157 pci_dev_put(assigned_dev->dev); 256 pci_dev_put(assigned_dev->dev);
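
The interrupt-forwarding hunks above introduce the host_irq_disabled handshake: the host handler masks the line with disable_irq_nosync() and records that it did so, and the guest-ack notifier re-enables it only if it is still marked disabled, because the guest GSI may be shared and the ack may have been triggered by another device. The single-threaded illustration below captures just that invariant; disable_line()/enable_line() and the function names are stand-ins, not kernel calls.

    /* irq_mask_handshake.c - illustration of the host_irq_disabled handshake:
     * mask on host interrupt, unmask on guest ack, but only if we were the
     * ones who masked it. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool line_masked;
    static bool host_irq_disabled;

    static void disable_line(void) { line_masked = true; }
    static void enable_line(void)  { line_masked = false; }

    static void host_interrupt(void)
    {
        disable_line();              /* throttle the line until the guest handles it */
        host_irq_disabled = true;
        /* ...work is queued here to inject the interrupt into the guest... */
    }

    static void guest_ack(void)
    {
        if (host_irq_disabled) {     /* acks from other devices sharing the GSI: no-op */
            enable_line();
            host_irq_disabled = false;
        }
    }

    int main(void)
    {
        guest_ack();                 /* spurious ack before any interrupt: nothing happens */
        host_interrupt();
        printf("after interrupt: masked=%d\n", line_masked);
        guest_ack();
        printf("after ack:       masked=%d\n", line_masked);
        return 0;
    }
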
@@ -174,6 +273,95 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
174 } 273 }
175} 274}
176 275
276static int assigned_device_update_intx(struct kvm *kvm,
277 struct kvm_assigned_dev_kernel *adev,
278 struct kvm_assigned_irq *airq)
279{
280 adev->guest_irq = airq->guest_irq;
281 adev->ack_notifier.gsi = airq->guest_irq;
282
283 if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
284 return 0;
285
286 if (irqchip_in_kernel(kvm)) {
287 if (!msi2intx &&
288 adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) {
289 free_irq(adev->host_irq, (void *)kvm);
290 pci_disable_msi(adev->dev);
291 }
292
293 if (!capable(CAP_SYS_RAWIO))
294 return -EPERM;
295
296 if (airq->host_irq)
297 adev->host_irq = airq->host_irq;
298 else
299 adev->host_irq = adev->dev->irq;
300
301 /* Even though this is PCI, we don't want to use shared
302 * interrupts. Sharing host devices with guest-assigned devices
303 * on the same interrupt line is not a happy situation: there
304 * are going to be long delays in accepting, acking, etc.
305 */
306 if (request_irq(adev->host_irq, kvm_assigned_dev_intr,
307 0, "kvm_assigned_intx_device", (void *)adev))
308 return -EIO;
309 }
310
311 adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX |
312 KVM_ASSIGNED_DEV_HOST_INTX;
313 return 0;
314}
315
316#ifdef CONFIG_X86
317static int assigned_device_update_msi(struct kvm *kvm,
318 struct kvm_assigned_dev_kernel *adev,
319 struct kvm_assigned_irq *airq)
320{
321 int r;
322
323 if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
324 /* x86 don't care upper address of guest msi message addr */
325 adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI;
326 adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX;
327 adev->guest_msi.address_lo = airq->guest_msi.addr_lo;
328 adev->guest_msi.data = airq->guest_msi.data;
329 adev->ack_notifier.gsi = -1;
330 } else if (msi2intx) {
331 adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX;
332 adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI;
333 adev->guest_irq = airq->guest_irq;
334 adev->ack_notifier.gsi = airq->guest_irq;
335 }
336
337 if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
338 return 0;
339
340 if (irqchip_in_kernel(kvm)) {
341 if (!msi2intx) {
342 if (adev->irq_requested_type &
343 KVM_ASSIGNED_DEV_HOST_INTX)
344 free_irq(adev->host_irq, (void *)adev);
345
346 r = pci_enable_msi(adev->dev);
347 if (r)
348 return r;
349 }
350
351 adev->host_irq = adev->dev->irq;
352 if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0,
353 "kvm_assigned_msi_device", (void *)adev))
354 return -EIO;
355 }
356
357 if (!msi2intx)
358 adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI;
359
360 adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI;
361 return 0;
362}
363#endif
364
177static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, 365static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
178 struct kvm_assigned_irq 366 struct kvm_assigned_irq
179 *assigned_irq) 367 *assigned_irq)
@@ -190,49 +378,68 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
190 return -EINVAL; 378 return -EINVAL;
191 } 379 }
192 380
193 if (match->irq_requested) { 381 if (!match->irq_requested_type) {
194 match->guest_irq = assigned_irq->guest_irq; 382 INIT_WORK(&match->interrupt_work,
195 match->ack_notifier.gsi = assigned_irq->guest_irq; 383 kvm_assigned_dev_interrupt_work_handler);
196 mutex_unlock(&kvm->lock); 384 if (irqchip_in_kernel(kvm)) {
197 return 0; 385 /* Register ack nofitier */
198 } 386 match->ack_notifier.gsi = -1;
387 match->ack_notifier.irq_acked =
388 kvm_assigned_dev_ack_irq;
389 kvm_register_irq_ack_notifier(kvm,
390 &match->ack_notifier);
391
392 /* Request IRQ source ID */
393 r = kvm_request_irq_source_id(kvm);
394 if (r < 0)
395 goto out_release;
396 else
397 match->irq_source_id = r;
199 398
200 INIT_WORK(&match->interrupt_work, 399#ifdef CONFIG_X86
201 kvm_assigned_dev_interrupt_work_handler); 400 /* Determine host device irq type, we can know the
401 * result from dev->msi_enabled */
402 if (msi2intx)
403 pci_enable_msi(match->dev);
404#endif
405 }
406 }
202 407
203 if (irqchip_in_kernel(kvm)) { 408 if ((!msi2intx &&
204 if (!capable(CAP_SYS_RAWIO)) { 409 (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) ||
205 r = -EPERM; 410 (msi2intx && match->dev->msi_enabled)) {
411#ifdef CONFIG_X86
412 r = assigned_device_update_msi(kvm, match, assigned_irq);
413 if (r) {
414 printk(KERN_WARNING "kvm: failed to enable "
415 "MSI device!\n");
206 goto out_release; 416 goto out_release;
207 } 417 }
208 418#else
209 if (assigned_irq->host_irq) 419 r = -ENOTTY;
210 match->host_irq = assigned_irq->host_irq; 420#endif
211 else 421 } else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) {
212 match->host_irq = match->dev->irq; 422 /* Host device IRQ 0 means don't support INTx */
213 match->guest_irq = assigned_irq->guest_irq; 423 if (!msi2intx) {
214 match->ack_notifier.gsi = assigned_irq->guest_irq; 424 printk(KERN_WARNING
215 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 425 "kvm: wait device to enable MSI!\n");
216 kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); 426 r = 0;
217 r = kvm_request_irq_source_id(kvm); 427 } else {
218 if (r < 0) 428 printk(KERN_WARNING
429 "kvm: failed to enable MSI device!\n");
430 r = -ENOTTY;
219 goto out_release; 431 goto out_release;
220 else 432 }
221 match->irq_source_id = r; 433 } else {
222 434 /* Non-sharing INTx mode */
223 /* Even though this is PCI, we don't want to use shared 435 r = assigned_device_update_intx(kvm, match, assigned_irq);
224 * interrupts. Sharing host devices with guest-assigned devices 436 if (r) {
225 * on the same interrupt line is not a happy situation: there 437 printk(KERN_WARNING "kvm: failed to enable "
226 * are going to be long delays in accepting, acking, etc. 438 "INTx device!\n");
227 */
228 if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
229 "kvm_assigned_device", (void *)match)) {
230 r = -EIO;
231 goto out_release; 439 goto out_release;
232 } 440 }
233 } 441 }
234 442
235 match->irq_requested = true;
236 mutex_unlock(&kvm->lock); 443 mutex_unlock(&kvm->lock);
237 return r; 444 return r;
238out_release: 445out_release:
@@ -283,11 +490,14 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
283 __func__); 490 __func__);
284 goto out_disable; 491 goto out_disable;
285 } 492 }
493
494 pci_reset_function(dev);
495
286 match->assigned_dev_id = assigned_dev->assigned_dev_id; 496 match->assigned_dev_id = assigned_dev->assigned_dev_id;
287 match->host_busnr = assigned_dev->busnr; 497 match->host_busnr = assigned_dev->busnr;
288 match->host_devfn = assigned_dev->devfn; 498 match->host_devfn = assigned_dev->devfn;
289 match->dev = dev; 499 match->dev = dev;
290 500 match->irq_source_id = -1;
291 match->kvm = kvm; 501 match->kvm = kvm;
292 502
293 list_add(&match->list, &kvm->arch.assigned_dev_head); 503 list_add(&match->list, &kvm->arch.assigned_dev_head);
@@ -355,57 +565,48 @@ static void ack_flush(void *_completed)
355{ 565{
356} 566}
357 567
358void kvm_flush_remote_tlbs(struct kvm *kvm) 568static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
359{ 569{
360 int i, cpu, me; 570 int i, cpu, me;
361 cpumask_t cpus; 571 cpumask_var_t cpus;
572 bool called = true;
362 struct kvm_vcpu *vcpu; 573 struct kvm_vcpu *vcpu;
363 574
575 if (alloc_cpumask_var(&cpus, GFP_ATOMIC))
576 cpumask_clear(cpus);
577
364 me = get_cpu(); 578 me = get_cpu();
365 cpus_clear(cpus);
366 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 579 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
367 vcpu = kvm->vcpus[i]; 580 vcpu = kvm->vcpus[i];
368 if (!vcpu) 581 if (!vcpu)
369 continue; 582 continue;
370 if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 583 if (test_and_set_bit(req, &vcpu->requests))
371 continue; 584 continue;
372 cpu = vcpu->cpu; 585 cpu = vcpu->cpu;
373 if (cpu != -1 && cpu != me) 586 if (cpus != NULL && cpu != -1 && cpu != me)
374 cpu_set(cpu, cpus); 587 cpumask_set_cpu(cpu, cpus);
375 } 588 }
376 if (cpus_empty(cpus)) 589 if (unlikely(cpus == NULL))
377 goto out; 590 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
378 ++kvm->stat.remote_tlb_flush; 591 else if (!cpumask_empty(cpus))
379 smp_call_function_mask(cpus, ack_flush, NULL, 1); 592 smp_call_function_many(cpus, ack_flush, NULL, 1);
380out: 593 else
594 called = false;
381 put_cpu(); 595 put_cpu();
596 free_cpumask_var(cpus);
597 return called;
382} 598}
383 599
384void kvm_reload_remote_mmus(struct kvm *kvm) 600void kvm_flush_remote_tlbs(struct kvm *kvm)
385{ 601{
386 int i, cpu, me; 602 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
387 cpumask_t cpus; 603 ++kvm->stat.remote_tlb_flush;
388 struct kvm_vcpu *vcpu;
389
390 me = get_cpu();
391 cpus_clear(cpus);
392 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
393 vcpu = kvm->vcpus[i];
394 if (!vcpu)
395 continue;
396 if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
397 continue;
398 cpu = vcpu->cpu;
399 if (cpu != -1 && cpu != me)
400 cpu_set(cpu, cpus);
401 }
402 if (cpus_empty(cpus))
403 goto out;
404 smp_call_function_mask(cpus, ack_flush, NULL, 1);
405out:
406 put_cpu();
407} 604}
408 605
606void kvm_reload_remote_mmus(struct kvm *kvm)
607{
608 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
609}
409 610
410int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 611int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
411{ 612{
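
The hunk above folds the nearly identical bodies of kvm_flush_remote_tlbs() and kvm_reload_remote_mmus() into make_all_cpus_request(): set the request bit on every vcpu with an atomic test-and-set so nobody is kicked twice, collect the CPUs that actually need an IPI in a heap-allocated cpumask, and fall back to broadcasting to all online CPUs if that allocation fails. The user-space sketch below models only the dedup-and-kick part; the vcpu structure and the printed "kick" are stand-ins for the real request bits and IPIs.

    /* make_all_requests.c - analog of make_all_cpus_request(): raise a request
     * on every vcpu exactly once and only "kick" the ones that did not already
     * have it pending. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NVCPUS        4
    #define REQ_TLB_FLUSH (1u << 0)

    struct vcpu { atomic_uint requests; };

    static bool make_all_requests(struct vcpu *vcpus, int n, unsigned int req)
    {
        bool kicked_any = false;

        for (int i = 0; i < n; i++) {
            unsigned int old = atomic_fetch_or(&vcpus[i].requests, req);
            if (old & req)
                continue;                        /* already pending: skip the kick */
            printf("kick vcpu %d\n", i);         /* stands in for the cross-CPU IPI */
            kicked_any = true;
        }
        return kicked_any;
    }

    int main(void)
    {
        struct vcpu vcpus[NVCPUS] = { 0 };

        atomic_store(&vcpus[2].requests, REQ_TLB_FLUSH);   /* one already pending */
        if (make_all_requests(vcpus, NVCPUS, REQ_TLB_FLUSH))
            printf("remote tlb flush accounted\n");
        return 0;
    }
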
@@ -710,6 +911,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
710 goto out; 911 goto out;
711 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 912 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
712 goto out; 913 goto out;
914 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
915 goto out;
713 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 916 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
714 goto out; 917 goto out;
715 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 918 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
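
The new check above makes __kvm_set_memory_region() refuse user-allocated slots whose userspace_addr is not page aligned, alongside the existing alignment and overflow checks on the guest physical range. A minimal user-space sketch is below: backing the slot with mmap()ed memory keeps the address page aligned by construction. It assumes the standard KVM_CREATE_VM / KVM_SET_USER_MEMORY_REGION interface (KVM_CAP_USER_MEMORY, which this file reports as a generic capability further down) and omits the error handling a real VMM would have.

    /* memslot_align.c - register a page-aligned user memory slot with KVM. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kvm.h>

    int main(void)
    {
        int kvm = open("/dev/kvm", O_RDWR);
        int vm = kvm >= 0 ? ioctl(kvm, KVM_CREATE_VM, 0) : -1;
        if (kvm < 0 || vm < 0) { perror("kvm"); return 1; }

        size_t size = 0x10000;   /* 64 KiB, a multiple of the page size */
        void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (mem == MAP_FAILED) { perror("mmap"); return 1; }

        struct kvm_userspace_memory_region region = {
            .slot            = 0,
            .guest_phys_addr = 0x0,
            .memory_size     = size,
            .userspace_addr  = (unsigned long)mem,   /* page aligned by mmap() */
        };
        if (ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region) < 0)
            perror("KVM_SET_USER_MEMORY_REGION");

        /* Deleting the slot later is the same call with memory_size = 0, the
         * path the KVM_CAP_DESTROY_MEMORY_REGION_WORKS fix is about. */
        return 0;
    }
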
@@ -821,7 +1024,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
821 goto out_free; 1024 goto out_free;
822 } 1025 }
823 1026
824 kvm_free_physmem_slot(&old, &new); 1027 kvm_free_physmem_slot(&old, npages ? &new : NULL);
1028 /* Slot deletion case: we have to update the current slot */
1029 if (!npages)
1030 *memslot = old;
825#ifdef CONFIG_DMAR 1031#ifdef CONFIG_DMAR
826 /* map the pages in iommu page table */ 1032 /* map the pages in iommu page table */
827 r = kvm_iommu_map_pages(kvm, base_gfn, npages); 1033 r = kvm_iommu_map_pages(kvm, base_gfn, npages);
@@ -918,7 +1124,7 @@ int kvm_is_error_hva(unsigned long addr)
918} 1124}
919EXPORT_SYMBOL_GPL(kvm_is_error_hva); 1125EXPORT_SYMBOL_GPL(kvm_is_error_hva);
920 1126
921static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1127struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
922{ 1128{
923 int i; 1129 int i;
924 1130
@@ -931,11 +1137,12 @@ static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
931 } 1137 }
932 return NULL; 1138 return NULL;
933} 1139}
1140EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
934 1141
935struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1142struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
936{ 1143{
937 gfn = unalias_gfn(kvm, gfn); 1144 gfn = unalias_gfn(kvm, gfn);
938 return __gfn_to_memslot(kvm, gfn); 1145 return gfn_to_memslot_unaliased(kvm, gfn);
939} 1146}
940 1147
941int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1148int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@@ -959,7 +1166,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
959 struct kvm_memory_slot *slot; 1166 struct kvm_memory_slot *slot;
960 1167
961 gfn = unalias_gfn(kvm, gfn); 1168 gfn = unalias_gfn(kvm, gfn);
962 slot = __gfn_to_memslot(kvm, gfn); 1169 slot = gfn_to_memslot_unaliased(kvm, gfn);
963 if (!slot) 1170 if (!slot)
964 return bad_hva(); 1171 return bad_hva();
965 return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); 1172 return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
@@ -1210,7 +1417,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1210 struct kvm_memory_slot *memslot; 1417 struct kvm_memory_slot *memslot;
1211 1418
1212 gfn = unalias_gfn(kvm, gfn); 1419 gfn = unalias_gfn(kvm, gfn);
1213 memslot = __gfn_to_memslot(kvm, gfn); 1420 memslot = gfn_to_memslot_unaliased(kvm, gfn);
1214 if (memslot && memslot->dirty_bitmap) { 1421 if (memslot && memslot->dirty_bitmap) {
1215 unsigned long rel_gfn = gfn - memslot->base_gfn; 1422 unsigned long rel_gfn = gfn - memslot->base_gfn;
1216 1423
@@ -1295,7 +1502,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1295 return 0; 1502 return 0;
1296} 1503}
1297 1504
1298static const struct file_operations kvm_vcpu_fops = { 1505static struct file_operations kvm_vcpu_fops = {
1299 .release = kvm_vcpu_release, 1506 .release = kvm_vcpu_release,
1300 .unlocked_ioctl = kvm_vcpu_ioctl, 1507 .unlocked_ioctl = kvm_vcpu_ioctl,
1301 .compat_ioctl = kvm_vcpu_ioctl, 1508 .compat_ioctl = kvm_vcpu_ioctl,
@@ -1689,7 +1896,7 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1689 return 0; 1896 return 0;
1690} 1897}
1691 1898
1692static const struct file_operations kvm_vm_fops = { 1899static struct file_operations kvm_vm_fops = {
1693 .release = kvm_vm_release, 1900 .release = kvm_vm_release,
1694 .unlocked_ioctl = kvm_vm_ioctl, 1901 .unlocked_ioctl = kvm_vm_ioctl,
1695 .compat_ioctl = kvm_vm_ioctl, 1902 .compat_ioctl = kvm_vm_ioctl,
@@ -1711,6 +1918,18 @@ static int kvm_dev_ioctl_create_vm(void)
1711 return fd; 1918 return fd;
1712} 1919}
1713 1920
1921static long kvm_dev_ioctl_check_extension_generic(long arg)
1922{
1923 switch (arg) {
1924 case KVM_CAP_USER_MEMORY:
1925 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
1926 return 1;
1927 default:
1928 break;
1929 }
1930 return kvm_dev_ioctl_check_extension(arg);
1931}
1932
1714static long kvm_dev_ioctl(struct file *filp, 1933static long kvm_dev_ioctl(struct file *filp,
1715 unsigned int ioctl, unsigned long arg) 1934 unsigned int ioctl, unsigned long arg)
1716{ 1935{
@@ -1730,7 +1949,7 @@ static long kvm_dev_ioctl(struct file *filp,
1730 r = kvm_dev_ioctl_create_vm(); 1949 r = kvm_dev_ioctl_create_vm();
1731 break; 1950 break;
1732 case KVM_CHECK_EXTENSION: 1951 case KVM_CHECK_EXTENSION:
1733 r = kvm_dev_ioctl_check_extension(arg); 1952 r = kvm_dev_ioctl_check_extension_generic(arg);
1734 break; 1953 break;
1735 case KVM_GET_VCPU_MMAP_SIZE: 1954 case KVM_GET_VCPU_MMAP_SIZE:
1736 r = -EINVAL; 1955 r = -EINVAL;
@@ -1771,9 +1990,9 @@ static void hardware_enable(void *junk)
1771{ 1990{
1772 int cpu = raw_smp_processor_id(); 1991 int cpu = raw_smp_processor_id();
1773 1992
1774 if (cpu_isset(cpu, cpus_hardware_enabled)) 1993 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
1775 return; 1994 return;
1776 cpu_set(cpu, cpus_hardware_enabled); 1995 cpumask_set_cpu(cpu, cpus_hardware_enabled);
1777 kvm_arch_hardware_enable(NULL); 1996 kvm_arch_hardware_enable(NULL);
1778} 1997}
1779 1998
@@ -1781,9 +2000,9 @@ static void hardware_disable(void *junk)
1781{ 2000{
1782 int cpu = raw_smp_processor_id(); 2001 int cpu = raw_smp_processor_id();
1783 2002
1784 if (!cpu_isset(cpu, cpus_hardware_enabled)) 2003 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
1785 return; 2004 return;
1786 cpu_clear(cpu, cpus_hardware_enabled); 2005 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
1787 kvm_arch_hardware_disable(NULL); 2006 kvm_arch_hardware_disable(NULL);
1788} 2007}
1789 2008
@@ -2017,9 +2236,14 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
2017 2236
2018 bad_pfn = page_to_pfn(bad_page); 2237 bad_pfn = page_to_pfn(bad_page);
2019 2238
2239 if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2240 r = -ENOMEM;
2241 goto out_free_0;
2242 }
2243
2020 r = kvm_arch_hardware_setup(); 2244 r = kvm_arch_hardware_setup();
2021 if (r < 0) 2245 if (r < 0)
2022 goto out_free_0; 2246 goto out_free_0a;
2023 2247
2024 for_each_online_cpu(cpu) { 2248 for_each_online_cpu(cpu) {
2025 smp_call_function_single(cpu, 2249 smp_call_function_single(cpu,
@@ -2053,6 +2277,8 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
2053 } 2277 }
2054 2278
2055 kvm_chardev_ops.owner = module; 2279 kvm_chardev_ops.owner = module;
2280 kvm_vm_fops.owner = module;
2281 kvm_vcpu_fops.owner = module;
2056 2282
2057 r = misc_register(&kvm_dev); 2283 r = misc_register(&kvm_dev);
2058 if (r) { 2284 if (r) {
@@ -2062,6 +2288,9 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
2062 2288
2063 kvm_preempt_ops.sched_in = kvm_sched_in; 2289 kvm_preempt_ops.sched_in = kvm_sched_in;
2064 kvm_preempt_ops.sched_out = kvm_sched_out; 2290 kvm_preempt_ops.sched_out = kvm_sched_out;
2291#ifndef CONFIG_X86
2292 msi2intx = 0;
2293#endif
2065 2294
2066 return 0; 2295 return 0;
2067 2296
@@ -2078,6 +2307,8 @@ out_free_2:
2078 on_each_cpu(hardware_disable, NULL, 1); 2307 on_each_cpu(hardware_disable, NULL, 1);
2079out_free_1: 2308out_free_1:
2080 kvm_arch_hardware_unsetup(); 2309 kvm_arch_hardware_unsetup();
2310out_free_0a:
2311 free_cpumask_var(cpus_hardware_enabled);
2081out_free_0: 2312out_free_0:
2082 __free_page(bad_page); 2313 __free_page(bad_page);
2083out: 2314out:
@@ -2101,6 +2332,7 @@ void kvm_exit(void)
2101 kvm_arch_hardware_unsetup(); 2332 kvm_arch_hardware_unsetup();
2102 kvm_arch_exit(); 2333 kvm_arch_exit();
2103 kvm_exit_debug(); 2334 kvm_exit_debug();
2335 free_cpumask_var(cpus_hardware_enabled);
2104 __free_page(bad_page); 2336 __free_page(bad_page);
2105} 2337}
2106EXPORT_SYMBOL_GPL(kvm_exit); 2338EXPORT_SYMBOL_GPL(kvm_exit);
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
index 41dcc845f78c..f59874446440 100644
--- a/virt/kvm/kvm_trace.c
+++ b/virt/kvm/kvm_trace.c
@@ -252,6 +252,7 @@ void kvm_trace_cleanup(void)
252 struct kvm_trace_probe *p = &kvm_trace_probes[i]; 252 struct kvm_trace_probe *p = &kvm_trace_probes[i];
253 marker_probe_unregister(p->name, p->probe_func, p); 253 marker_probe_unregister(p->name, p->probe_func, p);
254 } 254 }
255 marker_synchronize_unregister();
255 256
256 relay_close(kt->rchan); 257 relay_close(kt->rchan);
257 debugfs_remove(kt->lost_file); 258 debugfs_remove(kt->lost_file);
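
The kvm_trace.c change calls marker_synchronize_unregister() after unregistering the probes, so no probe callback can still be running when the relay channel and debugfs files are torn down. The user-space analog below shows the same "unregister, then wait for in-flight callers" idea with an atomic in-flight counter; the names are invented, it needs -pthread to build, and the real kernel primitive is of course more than a spin wait.

    /* probe_teardown.c - analog of unregister-then-synchronize: clear the probe
     * pointer, wait for any caller still inside it, then free its data. */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    typedef void (*probe_fn)(int *);

    static _Atomic(probe_fn) probe;
    static atomic_int in_flight;
    static atomic_bool stop;

    static void count_events(int *counter) { (*counter)++; }

    static void *event_loop(void *arg)
    {
        while (!atomic_load(&stop)) {
            atomic_fetch_add(&in_flight, 1);       /* announce before reading probe */
            probe_fn p = atomic_load(&probe);
            if (p)
                p(arg);
            atomic_fetch_sub(&in_flight, 1);
        }
        return NULL;
    }

    int main(void)
    {
        int *counter = calloc(1, sizeof(*counter));
        pthread_t tid;

        atomic_store(&probe, count_events);
        pthread_create(&tid, NULL, event_loop, counter);
        usleep(10000);

        atomic_store(&probe, NULL);                /* "unregister" the probe */
        while (atomic_load(&in_flight))            /* "synchronize": wait for callers */
            ;
        printf("events counted: %d\n", *counter);
        free(counter);                             /* now safe to free probe data */

        atomic_store(&stop, true);
        pthread_join(tid, NULL);
        return 0;
    }
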