73 files changed, 2413 insertions, 1403 deletions
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 83afe65d4966..22ff659bc0fb 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -43,6 +43,10 @@ KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
43 | KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by | 43 | KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by |
44 | || || writing to msr 0x4b564d02 | 44 | || || writing to msr 0x4b564d02 |
45 | ------------------------------------------------------------------------------ | 45 | ------------------------------------------------------------------------------ |
46 | KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit | ||
47 | || || before enabling paravirtualized | ||
48 | || || spinlock support. | ||
49 | ------------------------------------------------------------------------------ | ||
46 | KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side | 50 | KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side |
47 | || || per-cpu warps are expected in | 51 | || || per-cpu warps are expected in |
48 | || || kvmclock. | 52 | || || kvmclock. |
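
As a rough illustration of how a guest could consume the new bit (this sketch is not part of the patch set): the feature is advertised in KVM's CPUID feature leaf 0x40000001, with bit 7 matching the table row above. A real guest would first verify the KVM signature in leaf 0x40000000 and would normally go through its kvm_para helpers rather than raw CPUID; the helper names below are made up for the example.

#include <stdint.h>

#define KVM_CPUID_FEATURES	0x40000001u	/* KVM feature leaf */
#define KVM_FEATURE_PV_UNHALT	7		/* bit number from the table above */

/* Hypothetical helper: read EAX of the KVM feature leaf. */
static inline uint32_t kvm_cpuid_features_eax(void)
{
	uint32_t eax, ebx, ecx, edx;

	__asm__ volatile("cpuid"
			 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
			 : "a"(KVM_CPUID_FEATURES));
	return eax;			/* KVM reports its feature bits in EAX */
}

/* Hypothetical helper: true if the host advertises PV unhalt support. */
static inline int pv_unhalt_available(void)
{
	return (kvm_cpuid_features_eax() >> KVM_FEATURE_PV_UNHALT) & 1;
}
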
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index ea113b5d87a4..022198e389d7 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -64,3 +64,17 @@ Purpose: To enable communication between the hypervisor and guest there is a
64 | shared page that contains parts of supervisor visible register state. | 64 | shared page that contains parts of supervisor visible register state. |
65 | The guest can map this shared page to access its supervisor register through | 65 | The guest can map this shared page to access its supervisor register through |
66 | memory using this hypercall. | 66 | memory using this hypercall. |
67 | |||
68 | 5. KVM_HC_KICK_CPU | ||
69 | ------------------------ | ||
70 | Architecture: x86 | ||
71 | Status: active | ||
72 | Purpose: Hypercall used to wake up a vcpu from HLT state | ||
73 | Usage example: A vcpu of a paravirtualized guest that is busy-waiting in guest | ||
74 | kernel mode for an event to occur (e.g. a spinlock to become available) can | ||
75 | execute the HLT instruction once it has busy-waited for more than a threshold | ||
76 | time-interval. Execution of the HLT instruction causes the hypervisor to put | ||
77 | the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the | ||
78 | same guest can wake up the sleeping vcpu by issuing the KVM_HC_KICK_CPU hypercall, | ||
79 | specifying the APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) | ||
80 | is reserved in the hypercall for future use. | ||
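
A minimal guest-side sketch of issuing the hypercall (again, not taken from this series) is shown below; it mirrors how a pv-spinlock slow path would kick a halted vcpu. It assumes the existing kvm_para_has_feature() and kvm_hypercall2() helpers from <linux/kvm_para.h>/<asm/kvm_para.h>, plus the KVM_FEATURE_PV_UNHALT and KVM_HC_KICK_CPU defines introduced by this series and its guest-side counterpart; kick_halted_vcpu() itself is a made-up name.

#include <linux/kvm_para.h>	/* kvm_para_has_feature(), kvm_hypercall2() */

/* Hypothetical helper: kick the halted vcpu whose APIC ID is @apicid. */
static void kick_halted_vcpu(int apicid)
{
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
		return;		/* host has no PV unhalt support; keep spinning */

	/* a0 is reserved for future use, a1 carries the target APIC ID */
	kvm_hypercall2(KVM_HC_KICK_CPU, 0, apicid);
}
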
diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig
index 62e968cac9dc..1f36b823905f 100644
--- a/arch/arm/configs/keystone_defconfig
+++ b/arch/arm/configs/keystone_defconfig
@@ -104,6 +104,7 @@ CONFIG_IP_SCTP=y
104 | CONFIG_VLAN_8021Q=y | 104 | CONFIG_VLAN_8021Q=y |
105 | CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" | 105 | CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" |
106 | CONFIG_CMA=y | 106 | CONFIG_CMA=y |
107 | CONFIG_DMA_CMA=y | ||
107 | CONFIG_MTD=y | 108 | CONFIG_MTD=y |
108 | CONFIG_MTD_CMDLINE_PARTS=y | 109 | CONFIG_MTD_CMDLINE_PARTS=y |
109 | CONFIG_MTD_BLOCK=y | 110 | CONFIG_MTD_BLOCK=y |
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 5339e6a4d639..5465f564fdf3 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -78,6 +78,7 @@ CONFIG_MAC80211_RC_PID=y
78 | CONFIG_MAC80211_RC_DEFAULT_PID=y | 78 | CONFIG_MAC80211_RC_DEFAULT_PID=y |
79 | CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" | 79 | CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" |
80 | CONFIG_CMA=y | 80 | CONFIG_CMA=y |
81 | CONFIG_DMA_CMA=y | ||
81 | CONFIG_CONNECTOR=y | 82 | CONFIG_CONNECTOR=y |
82 | CONFIG_DEVTMPFS=y | 83 | CONFIG_DEVTMPFS=y |
83 | CONFIG_DEVTMPFS_MOUNT=y | 84 | CONFIG_DEVTMPFS_MOUNT=y |
diff --git a/arch/arm/configs/tegra_defconfig b/arch/arm/configs/tegra_defconfig
index 1effb43dab80..92d0a149aeb5 100644
--- a/arch/arm/configs/tegra_defconfig
+++ b/arch/arm/configs/tegra_defconfig
@@ -79,6 +79,7 @@ CONFIG_DEVTMPFS=y
79 | CONFIG_DEVTMPFS_MOUNT=y | 79 | CONFIG_DEVTMPFS_MOUNT=y |
80 | # CONFIG_FIRMWARE_IN_KERNEL is not set | 80 | # CONFIG_FIRMWARE_IN_KERNEL is not set |
81 | CONFIG_CMA=y | 81 | CONFIG_CMA=y |
82 | CONFIG_DMA_CMA=y | ||
82 | CONFIG_MTD=y | 83 | CONFIG_MTD=y |
83 | CONFIG_MTD_M25P80=y | 84 | CONFIG_MTD_M25P80=y |
84 | CONFIG_PROC_DEVICETREE=y | 85 | CONFIG_PROC_DEVICETREE=y |
diff --git a/arch/arm/include/asm/dma-contiguous.h b/arch/arm/include/asm/dma-contiguous.h
index 3ed37b4d93da..e072bb2ba1b1 100644
--- a/arch/arm/include/asm/dma-contiguous.h
+++ b/arch/arm/include/asm/dma-contiguous.h
@@ -2,7 +2,7 @@
2 | #define ASMARM_DMA_CONTIGUOUS_H | 2 | #define ASMARM_DMA_CONTIGUOUS_H |
3 | 3 | ||
4 | #ifdef __KERNEL__ | 4 | #ifdef __KERNEL__ |
5 | #ifdef CONFIG_CMA | 5 | #ifdef CONFIG_DMA_CMA |
6 | 6 | ||
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | #include <asm-generic/dma-contiguous.h> | 8 | #include <asm-generic/dma-contiguous.h> |
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 472ac7091003..9b28c41f4ba9 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -64,7 +64,7 @@ void kvm_clear_hyp_idmap(void);
64 | 64 | ||
65 | static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) | 65 | static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) |
66 | { | 66 | { |
67 | pte_val(*pte) = new_pte; | 67 | *pte = new_pte; |
68 | /* | 68 | /* |
69 | * flush_pmd_entry just takes a void pointer and cleans the necessary | 69 | * flush_pmd_entry just takes a void pointer and cleans the necessary |
70 | * cache entries, so we can reuse the function for ptes. | 70 | * cache entries, so we can reuse the function for ptes. |
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 741f66a2edbd..9c697db2787e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -219,6 +219,10 @@ long kvm_arch_dev_ioctl(struct file *filp,
219 | return -EINVAL; | 219 | return -EINVAL; |
220 | } | 220 | } |
221 | 221 | ||
222 | void kvm_arch_memslots_updated(struct kvm *kvm) | ||
223 | { | ||
224 | } | ||
225 | |||
222 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 226 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
223 | struct kvm_memory_slot *memslot, | 227 | struct kvm_memory_slot *memslot, |
224 | struct kvm_userspace_memory_region *mem, | 228 | struct kvm_userspace_memory_region *mem, |
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 16cd4ba5d7fd..85dd84b10687 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -492,10 +492,10 @@ __kvm_hyp_code_end:
492 | .section ".rodata" | 492 | .section ".rodata" |
493 | 493 | ||
494 | und_die_str: | 494 | und_die_str: |
495 | .ascii "unexpected undefined exception in Hyp mode at: %#08x" | 495 | .ascii "unexpected undefined exception in Hyp mode at: %#08x\n" |
496 | pabt_die_str: | 496 | pabt_die_str: |
497 | .ascii "unexpected prefetch abort in Hyp mode at: %#08x" | 497 | .ascii "unexpected prefetch abort in Hyp mode at: %#08x\n" |
498 | dabt_die_str: | 498 | dabt_die_str: |
499 | .ascii "unexpected data abort in Hyp mode at: %#08x" | 499 | .ascii "unexpected data abort in Hyp mode at: %#08x\n" |
500 | svc_die_str: | 500 | svc_die_str: |
501 | .ascii "unexpected HVC/SVC trap in Hyp mode at: %#08x" | 501 | .ascii "unexpected HVC/SVC trap in Hyp mode at: %#08x\n" |
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index b7840e7aa452..71e08baee209 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -40,7 +40,7 @@ static struct kvm_regs a15_regs_reset = {
40 | }; | 40 | }; |
41 | 41 | ||
42 | static const struct kvm_irq_level a15_vtimer_irq = { | 42 | static const struct kvm_irq_level a15_vtimer_irq = { |
43 | .irq = 27, | 43 | { .irq = 27 }, |
44 | .level = 1, | 44 | .level = 1, |
45 | }; | 45 | }; |
46 | 46 | ||
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index a8e73ed5ad5b..b1d640f78623 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -59,10 +59,9 @@ TRACE_EVENT(kvm_guest_fault,
59 | __entry->ipa = ipa; | 59 | __entry->ipa = ipa; |
60 | ), | 60 | ), |
61 | 61 | ||
62 | TP_printk("guest fault at PC %#08lx (hxfar %#08lx, " | 62 | TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx", |
63 | "ipa %#16llx, hsr %#08lx", | 63 | __entry->ipa, __entry->hsr, |
64 | __entry->vcpu_pc, __entry->hxfar, | 64 | __entry->hxfar, __entry->vcpu_pc) |
65 | __entry->ipa, __entry->hsr) | ||
66 | ); | 65 | ); |
67 | 66 | ||
68 | TRACE_EVENT(kvm_irq_line, | 67 | TRACE_EVENT(kvm_irq_line, |
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7f9b1798c6cf..dbddc07a3bbd 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -358,7 +358,7 @@ static int __init atomic_pool_init(void)
358 | if (!pages) | 358 | if (!pages) |
359 | goto no_pages; | 359 | goto no_pages; |
360 | 360 | ||
361 | if (IS_ENABLED(CONFIG_CMA)) | 361 | if (IS_ENABLED(CONFIG_DMA_CMA)) |
362 | ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page, | 362 | ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page, |
363 | atomic_pool_init); | 363 | atomic_pool_init); |
364 | else | 364 | else |
@@ -670,7 +670,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
670 | addr = __alloc_simple_buffer(dev, size, gfp, &page); | 670 | addr = __alloc_simple_buffer(dev, size, gfp, &page); |
671 | else if (!(gfp & __GFP_WAIT)) | 671 | else if (!(gfp & __GFP_WAIT)) |
672 | addr = __alloc_from_pool(size, &page); | 672 | addr = __alloc_from_pool(size, &page); |
673 | else if (!IS_ENABLED(CONFIG_CMA)) | 673 | else if (!IS_ENABLED(CONFIG_DMA_CMA)) |
674 | addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller); | 674 | addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller); |
675 | else | 675 | else |
676 | addr = __alloc_from_contiguous(dev, size, prot, &page, caller); | 676 | addr = __alloc_from_contiguous(dev, size, prot, &page, caller); |
@@ -759,7 +759,7 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
759 | __dma_free_buffer(page, size); | 759 | __dma_free_buffer(page, size); |
760 | } else if (__free_from_pool(cpu_addr, size)) { | 760 | } else if (__free_from_pool(cpu_addr, size)) { |
761 | return; | 761 | return; |
762 | } else if (!IS_ENABLED(CONFIG_CMA)) { | 762 | } else if (!IS_ENABLED(CONFIG_DMA_CMA)) { |
763 | __dma_free_remap(cpu_addr, size); | 763 | __dma_free_remap(cpu_addr, size); |
764 | __dma_free_buffer(page, size); | 764 | __dma_free_buffer(page, size); |
765 | } else { | 765 | } else { |
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5b2dc0d10c8f..bdfd8789b376 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1560,6 +1560,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
1560 | return 0; | 1560 | return 0; |
1561 | } | 1561 | } |
1562 | 1562 | ||
1563 | void kvm_arch_memslots_updated(struct kvm *kvm) | ||
1564 | { | ||
1565 | } | ||
1566 | |||
1563 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 1567 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
1564 | struct kvm_memory_slot *memslot, | 1568 | struct kvm_memory_slot *memslot, |
1565 | struct kvm_userspace_memory_region *mem, | 1569 | struct kvm_userspace_memory_region *mem, |
diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index dca2aa665993..bbace092ad0a 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -1,13 +1,13 @@
1 | /* | 1 | /* |
2 | * This file is subject to the terms and conditions of the GNU General Public | 2 | * This file is subject to the terms and conditions of the GNU General Public |
3 | * License. See the file "COPYING" in the main directory of this archive | 3 | * License. See the file "COPYING" in the main directory of this archive |
4 | * for more details. | 4 | * for more details. |
5 | * | 5 | * |
6 | * Main entry point for the guest, exception handling. | 6 | * Main entry point for the guest, exception handling. |
7 | * | 7 | * |
8 | * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. | 8 | * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. |
9 | * Authors: Sanjay Lal <sanjayl@kymasys.com> | 9 | * Authors: Sanjay Lal <sanjayl@kymasys.com> |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <asm/asm.h> | 12 | #include <asm/asm.h> |
13 | #include <asm/asmmacro.h> | 13 | #include <asm/asmmacro.h> |
@@ -55,195 +55,193 @@
55 | * a0: run | 55 | * a0: run |
56 | * a1: vcpu | 56 | * a1: vcpu |
57 | */ | 57 | */ |
58 | .set noreorder | ||
59 | .set noat | ||
58 | 60 | ||
59 | FEXPORT(__kvm_mips_vcpu_run) | 61 | FEXPORT(__kvm_mips_vcpu_run) |
60 | .set push | 62 | /* k0/k1 not being used in host kernel context */ |
61 | .set noreorder | 63 | INT_ADDIU k1, sp, -PT_SIZE |
62 | .set noat | 64 | LONG_S $0, PT_R0(k1) |
63 | 65 | LONG_S $1, PT_R1(k1) | |
64 | /* k0/k1 not being used in host kernel context */ | 66 | LONG_S $2, PT_R2(k1) |
65 | addiu k1,sp, -PT_SIZE | 67 | LONG_S $3, PT_R3(k1) |
66 | LONG_S $0, PT_R0(k1) | 68 | |
67 | LONG_S $1, PT_R1(k1) | 69 | LONG_S $4, PT_R4(k1) |
68 | LONG_S $2, PT_R2(k1) | 70 | LONG_S $5, PT_R5(k1) |
69 | LONG_S $3, PT_R3(k1) | 71 | LONG_S $6, PT_R6(k1) |
70 | 72 | LONG_S $7, PT_R7(k1) | |
71 | LONG_S $4, PT_R4(k1) | 73 | |
72 | LONG_S $5, PT_R5(k1) | 74 | LONG_S $8, PT_R8(k1) |
73 | LONG_S $6, PT_R6(k1) | 75 | LONG_S $9, PT_R9(k1) |
74 | LONG_S $7, PT_R7(k1) | 76 | LONG_S $10, PT_R10(k1) |
75 | 77 | LONG_S $11, PT_R11(k1) | |
76 | LONG_S $8, PT_R8(k1) | 78 | LONG_S $12, PT_R12(k1) |
77 | LONG_S $9, PT_R9(k1) | 79 | LONG_S $13, PT_R13(k1) |
78 | LONG_S $10, PT_R10(k1) | 80 | LONG_S $14, PT_R14(k1) |
79 | LONG_S $11, PT_R11(k1) | 81 | LONG_S $15, PT_R15(k1) |
80 | LONG_S $12, PT_R12(k1) | 82 | LONG_S $16, PT_R16(k1) |
81 | LONG_S $13, PT_R13(k1) | 83 | LONG_S $17, PT_R17(k1) |
82 | LONG_S $14, PT_R14(k1) | 84 | |
83 | LONG_S $15, PT_R15(k1) | 85 | LONG_S $18, PT_R18(k1) |
84 | LONG_S $16, PT_R16(k1) | 86 | LONG_S $19, PT_R19(k1) |
85 | LONG_S $17, PT_R17(k1) | 87 | LONG_S $20, PT_R20(k1) |
86 | 88 | LONG_S $21, PT_R21(k1) | |
87 | LONG_S $18, PT_R18(k1) | 89 | LONG_S $22, PT_R22(k1) |
88 | LONG_S $19, PT_R19(k1) | 90 | LONG_S $23, PT_R23(k1) |
89 | LONG_S $20, PT_R20(k1) | 91 | LONG_S $24, PT_R24(k1) |
90 | LONG_S $21, PT_R21(k1) | 92 | LONG_S $25, PT_R25(k1) |
91 | LONG_S $22, PT_R22(k1) | ||
92 | LONG_S $23, PT_R23(k1) | ||
93 | LONG_S $24, PT_R24(k1) | ||
94 | LONG_S $25, PT_R25(k1) | ||
95 | 93 | ||
96 | /* XXXKYMA k0/k1 not saved, not being used if we got here through an ioctl() */ | 94 | /* XXXKYMA k0/k1 not saved, not being used if we got here through an ioctl() */ |
97 | 95 | ||
98 | LONG_S $28, PT_R28(k1) | 96 | LONG_S $28, PT_R28(k1) |
99 | LONG_S $29, PT_R29(k1) | 97 | LONG_S $29, PT_R29(k1) |
100 | LONG_S $30, PT_R30(k1) | 98 | LONG_S $30, PT_R30(k1) |
101 | LONG_S $31, PT_R31(k1) | 99 | LONG_S $31, PT_R31(k1) |
102 | 100 | ||
103 | /* Save hi/lo */ | 101 | /* Save hi/lo */ |
104 | mflo v0 | 102 | mflo v0 |
105 | LONG_S v0, PT_LO(k1) | 103 | LONG_S v0, PT_LO(k1) |
106 | mfhi v1 | 104 | mfhi v1 |
107 | LONG_S v1, PT_HI(k1) | 105 | LONG_S v1, PT_HI(k1) |
108 | 106 | ||
109 | /* Save host status */ | 107 | /* Save host status */ |
110 | mfc0 v0, CP0_STATUS | 108 | mfc0 v0, CP0_STATUS |
111 | LONG_S v0, PT_STATUS(k1) | 109 | LONG_S v0, PT_STATUS(k1) |
112 | 110 | ||
113 | /* Save host ASID, shove it into the BVADDR location */ | 111 | /* Save host ASID, shove it into the BVADDR location */ |
114 | mfc0 v1,CP0_ENTRYHI | 112 | mfc0 v1, CP0_ENTRYHI |
115 | andi v1, 0xff | 113 | andi v1, 0xff |
116 | LONG_S v1, PT_HOST_ASID(k1) | 114 | LONG_S v1, PT_HOST_ASID(k1) |
117 | 115 | ||
118 | /* Save DDATA_LO, will be used to store pointer to vcpu */ | 116 | /* Save DDATA_LO, will be used to store pointer to vcpu */ |
119 | mfc0 v1, CP0_DDATA_LO | 117 | mfc0 v1, CP0_DDATA_LO |
120 | LONG_S v1, PT_HOST_USERLOCAL(k1) | 118 | LONG_S v1, PT_HOST_USERLOCAL(k1) |
121 | 119 | ||
122 | /* DDATA_LO has pointer to vcpu */ | 120 | /* DDATA_LO has pointer to vcpu */ |
123 | mtc0 a1,CP0_DDATA_LO | 121 | mtc0 a1, CP0_DDATA_LO |
124 | 122 | ||
125 | /* Offset into vcpu->arch */ | 123 | /* Offset into vcpu->arch */ |
126 | addiu k1, a1, VCPU_HOST_ARCH | 124 | INT_ADDIU k1, a1, VCPU_HOST_ARCH |
127 | 125 | ||
128 | /* Save the host stack to VCPU, used for exception processing when we exit from the Guest */ | 126 | /* |
129 | LONG_S sp, VCPU_HOST_STACK(k1) | 127 | * Save the host stack to VCPU, used for exception processing |
128 | * when we exit from the Guest | ||
129 | */ | ||
130 | LONG_S sp, VCPU_HOST_STACK(k1) | ||
130 | 131 | ||
131 | /* Save the kernel gp as well */ | 132 | /* Save the kernel gp as well */ |
132 | LONG_S gp, VCPU_HOST_GP(k1) | 133 | LONG_S gp, VCPU_HOST_GP(k1) |
133 | 134 | ||
134 | /* Setup status register for running the guest in UM, interrupts are disabled */ | 135 | /* Setup status register for running the guest in UM, interrupts are disabled */ |
135 | li k0,(ST0_EXL | KSU_USER| ST0_BEV) | 136 | li k0, (ST0_EXL | KSU_USER | ST0_BEV) |
136 | mtc0 k0,CP0_STATUS | 137 | mtc0 k0, CP0_STATUS |
137 | ehb | 138 | ehb |
138 | 139 | ||
139 | /* load up the new EBASE */ | 140 | /* load up the new EBASE */ |
140 | LONG_L k0, VCPU_GUEST_EBASE(k1) | 141 | LONG_L k0, VCPU_GUEST_EBASE(k1) |
141 | mtc0 k0,CP0_EBASE | 142 | mtc0 k0, CP0_EBASE |
142 | 143 | ||
143 | /* Now that the new EBASE has been loaded, unset BEV, set interrupt mask as it was | 144 | /* |
144 | * but make sure that timer interrupts are enabled | 145 | * Now that the new EBASE has been loaded, unset BEV, set |
145 | */ | 146 | * interrupt mask as it was but make sure that timer interrupts |
146 | li k0,(ST0_EXL | KSU_USER | ST0_IE) | 147 | * are enabled |
147 | andi v0, v0, ST0_IM | 148 | */ |
148 | or k0, k0, v0 | 149 | li k0, (ST0_EXL | KSU_USER | ST0_IE) |
149 | mtc0 k0,CP0_STATUS | 150 | andi v0, v0, ST0_IM |
150 | ehb | 151 | or k0, k0, v0 |
152 | mtc0 k0, CP0_STATUS | ||
153 | ehb | ||
151 | 154 | ||
152 | 155 | ||
153 | /* Set Guest EPC */ | 156 | /* Set Guest EPC */ |
154 | LONG_L t0, VCPU_PC(k1) | 157 | LONG_L t0, VCPU_PC(k1) |
155 | mtc0 t0, CP0_EPC | 158 | mtc0 t0, CP0_EPC |
156 | 159 | ||
157 | FEXPORT(__kvm_mips_load_asid) | 160 | FEXPORT(__kvm_mips_load_asid) |
158 | /* Set the ASID for the Guest Kernel */ | 161 | /* Set the ASID for the Guest Kernel */ |
159 | sll t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */ | 162 | INT_SLL t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */ |
160 | /* addresses shift to 0x80000000 */ | 163 | /* addresses shift to 0x80000000 */ |
161 | bltz t0, 1f /* If kernel */ | 164 | bltz t0, 1f /* If kernel */ |
162 | addiu t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ | 165 | INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ |
163 | addiu t1, k1, VCPU_GUEST_USER_ASID /* else user */ | 166 | INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ |
164 | 1: | 167 | 1: |
165 | /* t1: contains the base of the ASID array, need to get the cpu id */ | 168 | /* t1: contains the base of the ASID array, need to get the cpu id */ |
166 | LONG_L t2, TI_CPU($28) /* smp_processor_id */ | 169 | LONG_L t2, TI_CPU($28) /* smp_processor_id */ |
167 | sll t2, t2, 2 /* x4 */ | 170 | INT_SLL t2, t2, 2 /* x4 */ |
168 | addu t3, t1, t2 | 171 | REG_ADDU t3, t1, t2 |
169 | LONG_L k0, (t3) | 172 | LONG_L k0, (t3) |
170 | andi k0, k0, 0xff | 173 | andi k0, k0, 0xff |
171 | mtc0 k0,CP0_ENTRYHI | 174 | mtc0 k0, CP0_ENTRYHI |
172 | ehb | 175 | ehb |
173 | 176 | ||
174 | /* Disable RDHWR access */ | 177 | /* Disable RDHWR access */ |
175 | mtc0 zero, CP0_HWRENA | 178 | mtc0 zero, CP0_HWRENA |
176 | 179 | ||
177 | /* Now load up the Guest Context from VCPU */ | 180 | /* Now load up the Guest Context from VCPU */ |
178 | LONG_L $1, VCPU_R1(k1) | 181 | LONG_L $1, VCPU_R1(k1) |
179 | LONG_L $2, VCPU_R2(k1) | 182 | LONG_L $2, VCPU_R2(k1) |
180 | LONG_L $3, VCPU_R3(k1) | 183 | LONG_L $3, VCPU_R3(k1) |
181 | 184 | ||
182 | LONG_L $4, VCPU_R4(k1) | 185 | LONG_L $4, VCPU_R4(k1) |
183 | LONG_L $5, VCPU_R5(k1) | 186 | LONG_L $5, VCPU_R5(k1) |
184 | LONG_L $6, VCPU_R6(k1) | 187 | LONG_L $6, VCPU_R6(k1) |
185 | LONG_L $7, VCPU_R7(k1) | 188 | LONG_L $7, VCPU_R7(k1) |
186 | 189 | ||
187 | LONG_L $8, VCPU_R8(k1) | 190 | LONG_L $8, VCPU_R8(k1) |
188 | LONG_L $9, VCPU_R9(k1) | 191 | LONG_L $9, VCPU_R9(k1) |
189 | LONG_L $10, VCPU_R10(k1) | 192 | LONG_L $10, VCPU_R10(k1) |
190 | LONG_L $11, VCPU_R11(k1) | 193 | LONG_L $11, VCPU_R11(k1) |
191 | LONG_L $12, VCPU_R12(k1) | 194 | LONG_L $12, VCPU_R12(k1) |
192 | LONG_L $13, VCPU_R13(k1) | 195 | LONG_L $13, VCPU_R13(k1) |
193 | LONG_L $14, VCPU_R14(k1) | 196 | LONG_L $14, VCPU_R14(k1) |
194 | LONG_L $15, VCPU_R15(k1) | 197 | LONG_L $15, VCPU_R15(k1) |
195 | LONG_L $16, VCPU_R16(k1) | 198 | LONG_L $16, VCPU_R16(k1) |
196 | LONG_L $17, VCPU_R17(k1) | 199 | LONG_L $17, VCPU_R17(k1) |
197 | LONG_L $18, VCPU_R18(k1) | 200 | LONG_L $18, VCPU_R18(k1) |
198 | LONG_L $19, VCPU_R19(k1) | 201 | LONG_L $19, VCPU_R19(k1) |
199 | LONG_L $20, VCPU_R20(k1) | 202 | LONG_L $20, VCPU_R20(k1) |
200 | LONG_L $21, VCPU_R21(k1) | 203 | LONG_L $21, VCPU_R21(k1) |
201 | LONG_L $22, VCPU_R22(k1) | 204 | LONG_L $22, VCPU_R22(k1) |
202 | LONG_L $23, VCPU_R23(k1) | 205 | LONG_L $23, VCPU_R23(k1) |
203 | LONG_L $24, VCPU_R24(k1) | 206 | LONG_L $24, VCPU_R24(k1) |
204 | LONG_L $25, VCPU_R25(k1) | 207 | LONG_L $25, VCPU_R25(k1) |
205 | 208 | ||
206 | /* k0/k1 loaded up later */ | 209 | /* k0/k1 loaded up later */ |
207 | 210 | ||
208 | LONG_L $28, VCPU_R28(k1) | 211 | LONG_L $28, VCPU_R28(k1) |
209 | LONG_L $29, VCPU_R29(k1) | 212 | LONG_L $29, VCPU_R29(k1) |
210 | LONG_L $30, VCPU_R30(k1) | 213 | LONG_L $30, VCPU_R30(k1) |
211 | LONG_L $31, VCPU_R31(k1) | 214 | LONG_L $31, VCPU_R31(k1) |
212 | 215 | ||
213 | /* Restore hi/lo */ | 216 | /* Restore hi/lo */ |
214 | LONG_L k0, VCPU_LO(k1) | 217 | LONG_L k0, VCPU_LO(k1) |
215 | mtlo k0 | 218 | mtlo k0 |
216 | 219 | ||
217 | LONG_L k0, VCPU_HI(k1) | 220 | LONG_L k0, VCPU_HI(k1) |
218 | mthi k0 | 221 | mthi k0 |
219 | 222 | ||
220 | FEXPORT(__kvm_mips_load_k0k1) | 223 | FEXPORT(__kvm_mips_load_k0k1) |
221 | /* Restore the guest's k0/k1 registers */ | 224 | /* Restore the guest's k0/k1 registers */ |
222 | LONG_L k0, VCPU_R26(k1) | 225 | LONG_L k0, VCPU_R26(k1) |
223 | LONG_L k1, VCPU_R27(k1) | 226 | LONG_L k1, VCPU_R27(k1) |
224 | 227 | ||
225 | /* Jump to guest */ | 228 | /* Jump to guest */ |
226 | eret | 229 | eret |
227 | .set pop | ||
228 | 230 | ||
229 | VECTOR(MIPSX(exception), unknown) | 231 | VECTOR(MIPSX(exception), unknown) |
230 | /* | 232 | /* |
231 | * Find out what mode we came from and jump to the proper handler. | 233 | * Find out what mode we came from and jump to the proper handler. |
232 | */ | 234 | */ |
233 | .set push | 235 | mtc0 k0, CP0_ERROREPC #01: Save guest k0 |
234 | .set noat | 236 | ehb #02: |
235 | .set noreorder | 237 | |
236 | mtc0 k0, CP0_ERROREPC #01: Save guest k0 | 238 | mfc0 k0, CP0_EBASE #02: Get EBASE |
237 | ehb #02: | 239 | INT_SRL k0, k0, 10 #03: Get rid of CPUNum |
238 | 240 | INT_SLL k0, k0, 10 #04 | |
239 | mfc0 k0, CP0_EBASE #02: Get EBASE | 241 | LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000 |
240 | srl k0, k0, 10 #03: Get rid of CPUNum | 242 | INT_ADDIU k0, k0, 0x2000 #06: Exception handler is installed @ offset 0x2000 |
241 | sll k0, k0, 10 #04 | 243 | j k0 #07: jump to the function |
242 | LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000 | 244 | nop #08: branch delay slot |
243 | addiu k0, k0, 0x2000 #06: Exception handler is installed @ offset 0x2000 | ||
244 | j k0 #07: jump to the function | ||
245 | nop #08: branch delay slot | ||
246 | .set push | ||
247 | VECTOR_END(MIPSX(exceptionEnd)) | 245 | VECTOR_END(MIPSX(exceptionEnd)) |
248 | .end MIPSX(exception) | 246 | .end MIPSX(exception) |
249 | 247 | ||
@@ -253,329 +251,327 @@ VECTOR_END(MIPSX(exceptionEnd))
253 | * | 251 | * |
254 | */ | 252 | */ |
255 | NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) | 253 | NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) |
256 | .set push | 254 | /* Get the VCPU pointer from DDTATA_LO */ |
257 | .set noat | 255 | mfc0 k1, CP0_DDATA_LO |
258 | .set noreorder | 256 | INT_ADDIU k1, k1, VCPU_HOST_ARCH |
259 | 257 | ||
260 | /* Get the VCPU pointer from DDTATA_LO */ | 258 | /* Start saving Guest context to VCPU */ |
261 | mfc0 k1, CP0_DDATA_LO | 259 | LONG_S $0, VCPU_R0(k1) |
262 | addiu k1, k1, VCPU_HOST_ARCH | 260 | LONG_S $1, VCPU_R1(k1) |
263 | 261 | LONG_S $2, VCPU_R2(k1) | |
264 | /* Start saving Guest context to VCPU */ | 262 | LONG_S $3, VCPU_R3(k1) |
265 | LONG_S $0, VCPU_R0(k1) | 263 | LONG_S $4, VCPU_R4(k1) |
266 | LONG_S $1, VCPU_R1(k1) | 264 | LONG_S $5, VCPU_R5(k1) |
267 | LONG_S $2, VCPU_R2(k1) | 265 | LONG_S $6, VCPU_R6(k1) |
268 | LONG_S $3, VCPU_R3(k1) | 266 | LONG_S $7, VCPU_R7(k1) |
269 | LONG_S $4, VCPU_R4(k1) | 267 | LONG_S $8, VCPU_R8(k1) |
270 | LONG_S $5, VCPU_R5(k1) | 268 | LONG_S $9, VCPU_R9(k1) |
271 | LONG_S $6, VCPU_R6(k1) | 269 | LONG_S $10, VCPU_R10(k1) |
272 | LONG_S $7, VCPU_R7(k1) | 270 | LONG_S $11, VCPU_R11(k1) |
273 | LONG_S $8, VCPU_R8(k1) | 271 | LONG_S $12, VCPU_R12(k1) |
274 | LONG_S $9, VCPU_R9(k1) | 272 | LONG_S $13, VCPU_R13(k1) |
275 | LONG_S $10, VCPU_R10(k1) | 273 | LONG_S $14, VCPU_R14(k1) |
276 | LONG_S $11, VCPU_R11(k1) | 274 | LONG_S $15, VCPU_R15(k1) |
277 | LONG_S $12, VCPU_R12(k1) | 275 | LONG_S $16, VCPU_R16(k1) |
278 | LONG_S $13, VCPU_R13(k1) | 276 | LONG_S $17, VCPU_R17(k1) |
279 | LONG_S $14, VCPU_R14(k1) | 277 | LONG_S $18, VCPU_R18(k1) |
280 | LONG_S $15, VCPU_R15(k1) | 278 | LONG_S $19, VCPU_R19(k1) |
281 | LONG_S $16, VCPU_R16(k1) | 279 | LONG_S $20, VCPU_R20(k1) |
282 | LONG_S $17,VCPU_R17(k1) | 280 | LONG_S $21, VCPU_R21(k1) |
283 | LONG_S $18, VCPU_R18(k1) | 281 | LONG_S $22, VCPU_R22(k1) |
284 | LONG_S $19, VCPU_R19(k1) | 282 | LONG_S $23, VCPU_R23(k1) |
285 | LONG_S $20, VCPU_R20(k1) | 283 | LONG_S $24, VCPU_R24(k1) |
286 | LONG_S $21, VCPU_R21(k1) | 284 | LONG_S $25, VCPU_R25(k1) |
287 | LONG_S $22, VCPU_R22(k1) | 285 | |
288 | LONG_S $23, VCPU_R23(k1) | 286 | /* Guest k0/k1 saved later */ |
289 | LONG_S $24, VCPU_R24(k1) | 287 | |
290 | LONG_S $25, VCPU_R25(k1) | 288 | LONG_S $28, VCPU_R28(k1) |
291 | 289 | LONG_S $29, VCPU_R29(k1) | |
292 | /* Guest k0/k1 saved later */ | 290 | LONG_S $30, VCPU_R30(k1) |
293 | 291 | LONG_S $31, VCPU_R31(k1) | |
294 | LONG_S $28, VCPU_R28(k1) | 292 | |
295 | LONG_S $29, VCPU_R29(k1) | 293 | /* We need to save hi/lo and restore them on |
296 | LONG_S $30, VCPU_R30(k1) | 294 | * the way out |
297 | LONG_S $31, VCPU_R31(k1) | 295 | */ |
298 | 296 | mfhi t0 | |
299 | /* We need to save hi/lo and restore them on | 297 | LONG_S t0, VCPU_HI(k1) |
300 | * the way out | 298 | |
301 | */ | 299 | mflo t0 |
302 | mfhi t0 | 300 | LONG_S t0, VCPU_LO(k1) |
303 | LONG_S t0, VCPU_HI(k1) | 301 | |
304 | 302 | /* Finally save guest k0/k1 to VCPU */ | |
305 | mflo t0 | 303 | mfc0 t0, CP0_ERROREPC |
306 | LONG_S t0, VCPU_LO(k1) | 304 | LONG_S t0, VCPU_R26(k1) |
307 | 305 | ||
308 | /* Finally save guest k0/k1 to VCPU */ | 306 | /* Get GUEST k1 and save it in VCPU */ |
309 | mfc0 t0, CP0_ERROREPC | 307 | PTR_LI t1, ~0x2ff |
310 | LONG_S t0, VCPU_R26(k1) | 308 | mfc0 t0, CP0_EBASE |
311 | 309 | and t0, t0, t1 | |
312 | /* Get GUEST k1 and save it in VCPU */ | 310 | LONG_L t0, 0x3000(t0) |
313 | la t1, ~0x2ff | 311 | LONG_S t0, VCPU_R27(k1) |
314 | mfc0 t0, CP0_EBASE | 312 | |
315 | and t0, t0, t1 | 313 | /* Now that context has been saved, we can use other registers */ |
316 | LONG_L t0, 0x3000(t0) | 314 | |
317 | LONG_S t0, VCPU_R27(k1) | 315 | /* Restore vcpu */ |
318 | 316 | mfc0 a1, CP0_DDATA_LO | |
319 | /* Now that context has been saved, we can use other registers */ | 317 | move s1, a1 |
320 | 318 | ||
321 | /* Restore vcpu */ | 319 | /* Restore run (vcpu->run) */ |
322 | mfc0 a1, CP0_DDATA_LO | 320 | LONG_L a0, VCPU_RUN(a1) |
323 | move s1, a1 | 321 | /* Save pointer to run in s0, will be saved by the compiler */ |
324 | 322 | move s0, a0 | |
325 | /* Restore run (vcpu->run) */ | 323 | |
326 | LONG_L a0, VCPU_RUN(a1) | 324 | /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to |
327 | /* Save pointer to run in s0, will be saved by the compiler */ | 325 | * process the exception */ |
328 | move s0, a0 | 326 | mfc0 k0,CP0_EPC |
329 | 327 | LONG_S k0, VCPU_PC(k1) | |
330 | 328 | ||
331 | /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to process the exception */ | 329 | mfc0 k0, CP0_BADVADDR |
332 | mfc0 k0,CP0_EPC | 330 | LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) |
333 | LONG_S k0, VCPU_PC(k1) | 331 | |
334 | 332 | mfc0 k0, CP0_CAUSE | |
335 | mfc0 k0, CP0_BADVADDR | 333 | LONG_S k0, VCPU_HOST_CP0_CAUSE(k1) |
336 | LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) | 334 | |
337 | 335 | mfc0 k0, CP0_ENTRYHI | |
338 | mfc0 k0, CP0_CAUSE | 336 | LONG_S k0, VCPU_HOST_ENTRYHI(k1) |
339 | LONG_S k0, VCPU_HOST_CP0_CAUSE(k1) | 337 | |
340 | 338 | /* Now restore the host state just enough to run the handlers */ | |
341 | mfc0 k0, CP0_ENTRYHI | 339 | |
342 | LONG_S k0, VCPU_HOST_ENTRYHI(k1) | 340 | /* Swtich EBASE to the one used by Linux */ |
343 | 341 | /* load up the host EBASE */ | |
344 | /* Now restore the host state just enough to run the handlers */ | 342 | mfc0 v0, CP0_STATUS |
345 | 343 | ||
346 | /* Swtich EBASE to the one used by Linux */ | 344 | .set at |
347 | /* load up the host EBASE */ | 345 | or k0, v0, ST0_BEV |
348 | mfc0 v0, CP0_STATUS | 346 | .set noat |
349 | 347 | ||
350 | .set at | 348 | mtc0 k0, CP0_STATUS |
351 | or k0, v0, ST0_BEV | 349 | ehb |
352 | .set noat | 350 | |
353 | 351 | LONG_L k0, VCPU_HOST_EBASE(k1) | |
354 | mtc0 k0, CP0_STATUS | 352 | mtc0 k0,CP0_EBASE |
355 | ehb | 353 | |
356 | |||
357 | LONG_L k0, VCPU_HOST_EBASE(k1) | ||
358 | mtc0 k0,CP0_EBASE | ||
359 | |||
360 | |||
361 | /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ | ||
362 | .set at | ||
363 | and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE) | ||
364 | or v0, v0, ST0_CU0 | ||
365 | .set noat | ||
366 | mtc0 v0, CP0_STATUS | ||
367 | ehb | ||
368 | |||
369 | /* Load up host GP */ | ||
370 | LONG_L gp, VCPU_HOST_GP(k1) | ||
371 | |||
372 | /* Need a stack before we can jump to "C" */ | ||
373 | LONG_L sp, VCPU_HOST_STACK(k1) | ||
374 | |||
375 | /* Saved host state */ | ||
376 | addiu sp,sp, -PT_SIZE | ||
377 | 354 | ||
378 | /* XXXKYMA do we need to load the host ASID, maybe not because the | 355 | /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ |
379 | * kernel entries are marked GLOBAL, need to verify | 356 | .set at |
380 | */ | 357 | and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE) |
358 | or v0, v0, ST0_CU0 | ||
359 | .set noat | ||
360 | mtc0 v0, CP0_STATUS | ||
361 | ehb | ||
362 | |||
363 | /* Load up host GP */ | ||
364 | LONG_L gp, VCPU_HOST_GP(k1) | ||
365 | |||
366 | /* Need a stack before we can jump to "C" */ | ||
367 | LONG_L sp, VCPU_HOST_STACK(k1) | ||
368 | |||
369 | /* Saved host state */ | ||
370 | INT_ADDIU sp, sp, -PT_SIZE | ||
381 | 371 | ||
382 | /* Restore host DDATA_LO */ | 372 | /* XXXKYMA do we need to load the host ASID, maybe not because the |
383 | LONG_L k0, PT_HOST_USERLOCAL(sp) | 373 | * kernel entries are marked GLOBAL, need to verify |
384 | mtc0 k0, CP0_DDATA_LO | 374 | */ |
385 | 375 | ||
386 | /* Restore RDHWR access */ | 376 | /* Restore host DDATA_LO */ |
387 | la k0, 0x2000000F | 377 | LONG_L k0, PT_HOST_USERLOCAL(sp) |
388 | mtc0 k0, CP0_HWRENA | 378 | mtc0 k0, CP0_DDATA_LO |
389 | 379 | ||
390 | /* Jump to handler */ | 380 | /* Restore RDHWR access */ |
381 | PTR_LI k0, 0x2000000F | ||
382 | mtc0 k0, CP0_HWRENA | ||
383 | |||
384 | /* Jump to handler */ | ||
391 | FEXPORT(__kvm_mips_jump_to_handler) | 385 | FEXPORT(__kvm_mips_jump_to_handler) |
392 | /* XXXKYMA: not sure if this is safe, how large is the stack?? */ | 386 | /* XXXKYMA: not sure if this is safe, how large is the stack?? |
393 | /* Now jump to the kvm_mips_handle_exit() to see if we can deal with this in the kernel */ | 387 | * Now jump to the kvm_mips_handle_exit() to see if we can deal |
394 | la t9,kvm_mips_handle_exit | 388 | * with this in the kernel */ |
395 | jalr.hb t9 | 389 | PTR_LA t9, kvm_mips_handle_exit |
396 | addiu sp,sp, -CALLFRAME_SIZ /* BD Slot */ | 390 | jalr.hb t9 |
397 | 391 | INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */ | |
398 | /* Return from handler Make sure interrupts are disabled */ | 392 | |
399 | di | 393 | /* Return from handler Make sure interrupts are disabled */ |
400 | ehb | 394 | di |
401 | 395 | ehb | |
402 | /* XXXKYMA: k0/k1 could have been blown away if we processed an exception | 396 | |
403 | * while we were handling the exception from the guest, reload k1 | 397 | /* XXXKYMA: k0/k1 could have been blown away if we processed |
404 | */ | 398 | * an exception while we were handling the exception from the |
405 | move k1, s1 | 399 | * guest, reload k1 |
406 | addiu k1, k1, VCPU_HOST_ARCH | 400 | */ |
407 | 401 | ||
408 | /* Check return value, should tell us if we are returning to the host (handle I/O etc) | 402 | move k1, s1 |
409 | * or resuming the guest | 403 | INT_ADDIU k1, k1, VCPU_HOST_ARCH |
410 | */ | 404 | |
411 | andi t0, v0, RESUME_HOST | 405 | /* Check return value, should tell us if we are returning to the |
412 | bnez t0, __kvm_mips_return_to_host | 406 | * host (handle I/O etc)or resuming the guest |
413 | nop | 407 | */ |
408 | andi t0, v0, RESUME_HOST | ||
409 | bnez t0, __kvm_mips_return_to_host | ||
410 | nop | ||
414 | 411 | ||
415 | __kvm_mips_return_to_guest: | 412 | __kvm_mips_return_to_guest: |
416 | /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ | 413 | /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ |
417 | mtc0 s1, CP0_DDATA_LO | 414 | mtc0 s1, CP0_DDATA_LO |
418 | |||
419 | /* Load up the Guest EBASE to minimize the window where BEV is set */ | ||
420 | LONG_L t0, VCPU_GUEST_EBASE(k1) | ||
421 | |||
422 | /* Switch EBASE back to the one used by KVM */ | ||
423 | mfc0 v1, CP0_STATUS | ||
424 | .set at | ||
425 | or k0, v1, ST0_BEV | ||
426 | .set noat | ||
427 | mtc0 k0, CP0_STATUS | ||
428 | ehb | ||
429 | mtc0 t0,CP0_EBASE | ||
430 | |||
431 | /* Setup status register for running guest in UM */ | ||
432 | .set at | ||
433 | or v1, v1, (ST0_EXL | KSU_USER | ST0_IE) | ||
434 | and v1, v1, ~ST0_CU0 | ||
435 | .set noat | ||
436 | mtc0 v1, CP0_STATUS | ||
437 | ehb | ||
438 | 415 | ||
416 | /* Load up the Guest EBASE to minimize the window where BEV is set */ | ||
417 | LONG_L t0, VCPU_GUEST_EBASE(k1) | ||
418 | |||
419 | /* Switch EBASE back to the one used by KVM */ | ||
420 | mfc0 v1, CP0_STATUS | ||
421 | .set at | ||
422 | or k0, v1, ST0_BEV | ||
423 | .set noat | ||
424 | mtc0 k0, CP0_STATUS | ||
425 | ehb | ||
426 | mtc0 t0, CP0_EBASE | ||
427 | |||
428 | /* Setup status register for running guest in UM */ | ||
429 | .set at | ||
430 | or v1, v1, (ST0_EXL | KSU_USER | ST0_IE) | ||
431 | and v1, v1, ~ST0_CU0 | ||
432 | .set noat | ||
433 | mtc0 v1, CP0_STATUS | ||
434 | ehb | ||
439 | 435 | ||
440 | /* Set Guest EPC */ | 436 | /* Set Guest EPC */ |
441 | LONG_L t0, VCPU_PC(k1) | 437 | LONG_L t0, VCPU_PC(k1) |
442 | mtc0 t0, CP0_EPC | 438 | mtc0 t0, CP0_EPC |
443 | 439 | ||
444 | /* Set the ASID for the Guest Kernel */ | 440 | /* Set the ASID for the Guest Kernel */ |
445 | sll t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */ | 441 | INT_SLL t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */ |
446 | /* addresses shift to 0x80000000 */ | 442 | /* addresses shift to 0x80000000 */ |
447 | bltz t0, 1f /* If kernel */ | 443 | bltz t0, 1f /* If kernel */ |
448 | addiu t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ | 444 | INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ |
449 | addiu t1, k1, VCPU_GUEST_USER_ASID /* else user */ | 445 | INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ |
450 | 1: | 446 | 1: |
451 | /* t1: contains the base of the ASID array, need to get the cpu id */ | 447 | /* t1: contains the base of the ASID array, need to get the cpu id */ |
452 | LONG_L t2, TI_CPU($28) /* smp_processor_id */ | 448 | LONG_L t2, TI_CPU($28) /* smp_processor_id */ |
453 | sll t2, t2, 2 /* x4 */ | 449 | INT_SLL t2, t2, 2 /* x4 */ |
454 | addu t3, t1, t2 | 450 | REG_ADDU t3, t1, t2 |
455 | LONG_L k0, (t3) | 451 | LONG_L k0, (t3) |
456 | andi k0, k0, 0xff | 452 | andi k0, k0, 0xff |
457 | mtc0 k0,CP0_ENTRYHI | 453 | mtc0 k0,CP0_ENTRYHI |
458 | ehb | 454 | ehb |
459 | 455 | ||
460 | /* Disable RDHWR access */ | 456 | /* Disable RDHWR access */ |
461 | mtc0 zero, CP0_HWRENA | 457 | mtc0 zero, CP0_HWRENA |
462 | 458 | ||
463 | /* load the guest context from VCPU and return */ | 459 | /* load the guest context from VCPU and return */ |
464 | LONG_L $0, VCPU_R0(k1) | 460 | LONG_L $0, VCPU_R0(k1) |
465 | LONG_L $1, VCPU_R1(k1) | 461 | LONG_L $1, VCPU_R1(k1) |
466 | LONG_L $2, VCPU_R2(k1) | 462 | LONG_L $2, VCPU_R2(k1) |
467 | LONG_L $3, VCPU_R3(k1) | 463 | LONG_L $3, VCPU_R3(k1) |
468 | LONG_L $4, VCPU_R4(k1) | 464 | LONG_L $4, VCPU_R4(k1) |
469 | LONG_L $5, VCPU_R5(k1) | 465 | LONG_L $5, VCPU_R5(k1) |
470 | LONG_L $6, VCPU_R6(k1) | 466 | LONG_L $6, VCPU_R6(k1) |
471 | LONG_L $7, VCPU_R7(k1) | 467 | LONG_L $7, VCPU_R7(k1) |
472 | LONG_L $8, VCPU_R8(k1) | 468 | LONG_L $8, VCPU_R8(k1) |
473 | LONG_L $9, VCPU_R9(k1) | 469 | LONG_L $9, VCPU_R9(k1) |
474 | LONG_L $10, VCPU_R10(k1) | 470 | LONG_L $10, VCPU_R10(k1) |
475 | LONG_L $11, VCPU_R11(k1) | 471 | LONG_L $11, VCPU_R11(k1) |
476 | LONG_L $12, VCPU_R12(k1) | 472 | LONG_L $12, VCPU_R12(k1) |
477 | LONG_L $13, VCPU_R13(k1) | 473 | LONG_L $13, VCPU_R13(k1) |
478 | LONG_L $14, VCPU_R14(k1) | 474 | LONG_L $14, VCPU_R14(k1) |
479 | LONG_L $15, VCPU_R15(k1) | 475 | LONG_L $15, VCPU_R15(k1) |
480 | LONG_L $16, VCPU_R16(k1) | 476 | LONG_L $16, VCPU_R16(k1) |
481 | LONG_L $17, VCPU_R17(k1) | 477 | LONG_L $17, VCPU_R17(k1) |
482 | LONG_L $18, VCPU_R18(k1) | 478 | LONG_L $18, VCPU_R18(k1) |
483 | LONG_L $19, VCPU_R19(k1) | 479 | LONG_L $19, VCPU_R19(k1) |
484 | LONG_L $20, VCPU_R20(k1) | 480 | LONG_L $20, VCPU_R20(k1) |
485 | LONG_L $21, VCPU_R21(k1) | 481 | LONG_L $21, VCPU_R21(k1) |
486 | LONG_L $22, VCPU_R22(k1) | 482 | LONG_L $22, VCPU_R22(k1) |
487 | LONG_L $23, VCPU_R23(k1) | 483 | LONG_L $23, VCPU_R23(k1) |
488 | LONG_L $24, VCPU_R24(k1) | 484 | LONG_L $24, VCPU_R24(k1) |
489 | LONG_L $25, VCPU_R25(k1) | 485 | LONG_L $25, VCPU_R25(k1) |
490 | 486 | ||
491 | /* $/k1 loaded later */ | 487 | /* $/k1 loaded later */ |
492 | LONG_L $28, VCPU_R28(k1) | 488 | LONG_L $28, VCPU_R28(k1) |
493 | LONG_L $29, VCPU_R29(k1) | 489 | LONG_L $29, VCPU_R29(k1) |
494 | LONG_L $30, VCPU_R30(k1) | 490 | LONG_L $30, VCPU_R30(k1) |
495 | LONG_L $31, VCPU_R31(k1) | 491 | LONG_L $31, VCPU_R31(k1) |
496 | 492 | ||
497 | FEXPORT(__kvm_mips_skip_guest_restore) | 493 | FEXPORT(__kvm_mips_skip_guest_restore) |
498 | LONG_L k0, VCPU_HI(k1) | 494 | LONG_L k0, VCPU_HI(k1) |
499 | mthi k0 | 495 | mthi k0 |
500 | 496 | ||
501 | LONG_L k0, VCPU_LO(k1) | 497 | LONG_L k0, VCPU_LO(k1) |
502 | mtlo k0 | 498 | mtlo k0 |
503 | 499 | ||
504 | LONG_L k0, VCPU_R26(k1) | 500 | LONG_L k0, VCPU_R26(k1) |
505 | LONG_L k1, VCPU_R27(k1) | 501 | LONG_L k1, VCPU_R27(k1) |
506 | 502 | ||
507 | eret | 503 | eret |
508 | 504 | ||
509 | __kvm_mips_return_to_host: | 505 | __kvm_mips_return_to_host: |
510 | /* EBASE is already pointing to Linux */ | 506 | /* EBASE is already pointing to Linux */ |
511 | LONG_L k1, VCPU_HOST_STACK(k1) | 507 | LONG_L k1, VCPU_HOST_STACK(k1) |
512 | addiu k1,k1, -PT_SIZE | 508 | INT_ADDIU k1,k1, -PT_SIZE |
513 | 509 | ||
514 | /* Restore host DDATA_LO */ | 510 | /* Restore host DDATA_LO */ |
515 | LONG_L k0, PT_HOST_USERLOCAL(k1) | 511 | LONG_L k0, PT_HOST_USERLOCAL(k1) |
516 | mtc0 k0, CP0_DDATA_LO | 512 | mtc0 k0, CP0_DDATA_LO |
517 | 513 | ||
518 | /* Restore host ASID */ | 514 | /* Restore host ASID */ |
519 | LONG_L k0, PT_HOST_ASID(sp) | 515 | LONG_L k0, PT_HOST_ASID(sp) |
520 | andi k0, 0xff | 516 | andi k0, 0xff |
521 | mtc0 k0,CP0_ENTRYHI | 517 | mtc0 k0,CP0_ENTRYHI |
522 | ehb | 518 | ehb |
523 | 519 | ||
524 | /* Load context saved on the host stack */ | 520 | /* Load context saved on the host stack */ |
525 | LONG_L $0, PT_R0(k1) | 521 | LONG_L $0, PT_R0(k1) |
526 | LONG_L $1, PT_R1(k1) | 522 | LONG_L $1, PT_R1(k1) |
527 | 523 | ||
528 | /* r2/v0 is the return code, shift it down by 2 (arithmetic) to recover the err code */ | 524 | /* r2/v0 is the return code, shift it down by 2 (arithmetic) |
529 | sra k0, v0, 2 | 525 | * to recover the err code */ |
530 | move $2, k0 | 526 | INT_SRA k0, v0, 2 |
531 | 527 | move $2, k0 | |
532 | LONG_L $3, PT_R3(k1) | 528 | |
533 | LONG_L $4, PT_R4(k1) | 529 | LONG_L $3, PT_R3(k1) |
534 | LONG_L $5, PT_R5(k1) | 530 | LONG_L $4, PT_R4(k1) |
535 | LONG_L $6, PT_R6(k1) | 531 | LONG_L $5, PT_R5(k1) |
536 | LONG_L $7, PT_R7(k1) | 532 | LONG_L $6, PT_R6(k1) |
537 | LONG_L $8, PT_R8(k1) | 533 | LONG_L $7, PT_R7(k1) |
538 | LONG_L $9, PT_R9(k1) | 534 | LONG_L $8, PT_R8(k1) |
539 | LONG_L $10, PT_R10(k1) | 535 | LONG_L $9, PT_R9(k1) |
540 | LONG_L $11, PT_R11(k1) | 536 | LONG_L $10, PT_R10(k1) |
541 | LONG_L $12, PT_R12(k1) | 537 | LONG_L $11, PT_R11(k1) |
542 | LONG_L $13, PT_R13(k1) | 538 | LONG_L $12, PT_R12(k1) |
543 | LONG_L $14, PT_R14(k1) | 539 | LONG_L $13, PT_R13(k1) |
544 | LONG_L $15, PT_R15(k1) | 540 | LONG_L $14, PT_R14(k1) |
545 | LONG_L $16, PT_R16(k1) | 541 | LONG_L $15, PT_R15(k1) |
546 | LONG_L $17, PT_R17(k1) | 542 | LONG_L $16, PT_R16(k1) |
547 | LONG_L $18, PT_R18(k1) | 543 | LONG_L $17, PT_R17(k1) |
548 | LONG_L $19, PT_R19(k1) | 544 | LONG_L $18, PT_R18(k1) |
549 | LONG_L $20, PT_R20(k1) | 545 | LONG_L $19, PT_R19(k1) |
550 | LONG_L $21, PT_R21(k1) | 546 | LONG_L $20, PT_R20(k1) |
551 | LONG_L $22, PT_R22(k1) | 547 | LONG_L $21, PT_R21(k1) |
552 | LONG_L $23, PT_R23(k1) | 548 | LONG_L $22, PT_R22(k1) |
553 | LONG_L $24, PT_R24(k1) | 549 | LONG_L $23, PT_R23(k1) |
554 | LONG_L $25, PT_R25(k1) | 550 | LONG_L $24, PT_R24(k1) |
555 | 551 | LONG_L $25, PT_R25(k1) | |
556 | /* Host k0/k1 were not saved */ | 552 | |
557 | 553 | /* Host k0/k1 were not saved */ | |
558 | LONG_L $28, PT_R28(k1) | 554 | |
559 | LONG_L $29, PT_R29(k1) | 555 | LONG_L $28, PT_R28(k1) |
560 | LONG_L $30, PT_R30(k1) | 556 | LONG_L $29, PT_R29(k1) |
561 | 557 | LONG_L $30, PT_R30(k1) | |
562 | LONG_L k0, PT_HI(k1) | 558 | |
563 | mthi k0 | 559 | LONG_L k0, PT_HI(k1) |
564 | 560 | mthi k0 | |
565 | LONG_L k0, PT_LO(k1) | 561 | |
566 | mtlo k0 | 562 | LONG_L k0, PT_LO(k1) |
567 | 563 | mtlo k0 | |
568 | /* Restore RDHWR access */ | 564 | |
569 | la k0, 0x2000000F | 565 | /* Restore RDHWR access */ |
570 | mtc0 k0, CP0_HWRENA | 566 | PTR_LI k0, 0x2000000F |
571 | 567 | mtc0 k0, CP0_HWRENA | |
572 | 568 | ||
573 | /* Restore RA, which is the address we will return to */ | 569 | |
574 | LONG_L ra, PT_R31(k1) | 570 | /* Restore RA, which is the address we will return to */ |
575 | j ra | 571 | LONG_L ra, PT_R31(k1) |
576 | nop | 572 | j ra |
577 | 573 | nop | |
578 | .set pop | 574 | |
579 | VECTOR_END(MIPSX(GuestExceptionEnd)) | 575 | VECTOR_END(MIPSX(GuestExceptionEnd)) |
580 | .end MIPSX(GuestException) | 576 | .end MIPSX(GuestException) |
581 | 577 | ||
@@ -627,24 +623,23 @@ MIPSX(exceptions):
627 | 623 | ||
628 | #define HW_SYNCI_Step $1 | 624 | #define HW_SYNCI_Step $1 |
629 | LEAF(MIPSX(SyncICache)) | 625 | LEAF(MIPSX(SyncICache)) |
630 | .set push | 626 | .set push |
631 | .set mips32r2 | 627 | .set mips32r2 |
632 | beq a1, zero, 20f | 628 | beq a1, zero, 20f |
633 | nop | 629 | nop |
634 | addu a1, a0, a1 | 630 | REG_ADDU a1, a0, a1 |
635 | rdhwr v0, HW_SYNCI_Step | 631 | rdhwr v0, HW_SYNCI_Step |
636 | beq v0, zero, 20f | 632 | beq v0, zero, 20f |
637 | nop | 633 | nop |
638 | |||
639 | 10: | 634 | 10: |
640 | synci 0(a0) | 635 | synci 0(a0) |
641 | addu a0, a0, v0 | 636 | REG_ADDU a0, a0, v0 |
642 | sltu v1, a0, a1 | 637 | sltu v1, a0, a1 |
643 | bne v1, zero, 10b | 638 | bne v1, zero, 10b |
644 | nop | 639 | nop |
645 | sync | 640 | sync |
646 | 20: | 641 | 20: |
647 | jr.hb ra | 642 | jr.hb ra |
648 | nop | 643 | nop |
649 | .set pop | 644 | .set pop |
650 | END(MIPSX(SyncICache)) | 645 | END(MIPSX(SyncICache)) |
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index dd203e59e6fd..a7b044536de4 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -208,6 +208,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
208 | return 0; | 208 | return 0; |
209 | } | 209 | } |
210 | 210 | ||
211 | void kvm_arch_memslots_updated(struct kvm *kvm) | ||
212 | { | ||
213 | } | ||
214 | |||
211 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 215 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
212 | struct kvm_memory_slot *memslot, | 216 | struct kvm_memory_slot *memslot, |
213 | struct kvm_userspace_memory_region *mem, | 217 | struct kvm_userspace_memory_region *mem, |
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 08891d07aeb6..fa19e2f1a874 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -334,6 +334,27 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
334 | return r; | 334 | return r; |
335 | } | 335 | } |
336 | 336 | ||
337 | /* | ||
338 | * Like kvmppc_get_last_inst(), but for fetching a sc instruction. | ||
339 | * Because the sc instruction sets SRR0 to point to the following | ||
340 | * instruction, we have to fetch from pc - 4. | ||
341 | */ | ||
342 | static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu) | ||
343 | { | ||
344 | ulong pc = kvmppc_get_pc(vcpu) - 4; | ||
345 | struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); | ||
346 | u32 r; | ||
347 | |||
348 | /* Load the instruction manually if it failed to do so in the | ||
349 | * exit path */ | ||
350 | if (svcpu->last_inst == KVM_INST_FETCH_FAILED) | ||
351 | kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false); | ||
352 | |||
353 | r = svcpu->last_inst; | ||
354 | svcpu_put(svcpu); | ||
355 | return r; | ||
356 | } | ||
357 | |||
337 | static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) | 358 | static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) |
338 | { | 359 | { |
339 | struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); | 360 | struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); |
@@ -446,6 +467,23 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
446 | return vcpu->arch.last_inst; | 467 | return vcpu->arch.last_inst; |
447 | } | 468 | } |
448 | 469 | ||
470 | /* | ||
471 | * Like kvmppc_get_last_inst(), but for fetching a sc instruction. | ||
472 | * Because the sc instruction sets SRR0 to point to the following | ||
473 | * instruction, we have to fetch from pc - 4. | ||
474 | */ | ||
475 | static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu) | ||
476 | { | ||
477 | ulong pc = kvmppc_get_pc(vcpu) - 4; | ||
478 | |||
479 | /* Load the instruction manually if it failed to do so in the | ||
480 | * exit path */ | ||
481 | if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) | ||
482 | kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false); | ||
483 | |||
484 | return vcpu->arch.last_inst; | ||
485 | } | ||
486 | |||
449 | static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) | 487 | static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) |
450 | { | 488 | { |
451 | return vcpu->arch.fault_dar; | 489 | return vcpu->arch.fault_dar; |
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index a1ecb14e4442..86d638a3b359 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -37,7 +37,7 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
37 | 37 | ||
38 | #ifdef CONFIG_KVM_BOOK3S_64_HV | 38 | #ifdef CONFIG_KVM_BOOK3S_64_HV |
39 | #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ | 39 | #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ |
40 | extern int kvm_hpt_order; /* order of preallocated HPTs */ | 40 | extern unsigned long kvm_rma_pages; |
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ | 43 | #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ |
@@ -100,7 +100,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
100 | /* (masks depend on page size) */ | 100 | /* (masks depend on page size) */ |
101 | rb |= 0x1000; /* page encoding in LP field */ | 101 | rb |= 0x1000; /* page encoding in LP field */ |
102 | rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ | 102 | rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ |
103 | rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ | 103 | rb |= ((va_low << 4) & 0xf0); /* AVAL field (P7 doesn't seem to care) */ |
104 | } | 104 | } |
105 | } else { | 105 | } else { |
106 | /* 4kB page */ | 106 | /* 4kB page */ |
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index af326cde7cb6..33283532e9d8 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -183,13 +183,9 @@ struct kvmppc_spapr_tce_table {
183 | struct page *pages[0]; | 183 | struct page *pages[0]; |
184 | }; | 184 | }; |
185 | 185 | ||
186 | struct kvmppc_linear_info { | 186 | struct kvm_rma_info { |
187 | void *base_virt; | 187 | atomic_t use_count; |
188 | unsigned long base_pfn; | 188 | unsigned long base_pfn; |
189 | unsigned long npages; | ||
190 | struct list_head list; | ||
191 | atomic_t use_count; | ||
192 | int type; | ||
193 | }; | 189 | }; |
194 | 190 | ||
195 | /* XICS components, defined in book3s_xics.c */ | 191 | /* XICS components, defined in book3s_xics.c */ |
@@ -246,7 +242,7 @@ struct kvm_arch {
246 | int tlbie_lock; | 242 | int tlbie_lock; |
247 | unsigned long lpcr; | 243 | unsigned long lpcr; |
248 | unsigned long rmor; | 244 | unsigned long rmor; |
249 | struct kvmppc_linear_info *rma; | 245 | struct kvm_rma_info *rma; |
250 | unsigned long vrma_slb_v; | 246 | unsigned long vrma_slb_v; |
251 | int rma_setup_done; | 247 | int rma_setup_done; |
252 | int using_mmu_notifiers; | 248 | int using_mmu_notifiers; |
@@ -259,7 +255,7 @@ struct kvm_arch {
259 | spinlock_t slot_phys_lock; | 255 | spinlock_t slot_phys_lock; |
260 | cpumask_t need_tlb_flush; | 256 | cpumask_t need_tlb_flush; |
261 | struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; | 257 | struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; |
262 | struct kvmppc_linear_info *hpt_li; | 258 | int hpt_cma_alloc; |
263 | #endif /* CONFIG_KVM_BOOK3S_64_HV */ | 259 | #endif /* CONFIG_KVM_BOOK3S_64_HV */ |
264 | #ifdef CONFIG_PPC_BOOK3S_64 | 260 | #ifdef CONFIG_PPC_BOOK3S_64 |
265 | struct list_head spapr_tce_tables; | 261 | struct list_head spapr_tce_tables; |
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a5287fe03d77..b15554a26c20 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -137,10 +137,10 @@ extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
137 | unsigned long ioba, unsigned long tce); | 137 | unsigned long ioba, unsigned long tce); |
138 | extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, | 138 | extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, |
139 | struct kvm_allocate_rma *rma); | 139 | struct kvm_allocate_rma *rma); |
140 | extern struct kvmppc_linear_info *kvm_alloc_rma(void); | 140 | extern struct kvm_rma_info *kvm_alloc_rma(void); |
141 | extern void kvm_release_rma(struct kvmppc_linear_info *ri); | 141 | extern void kvm_release_rma(struct kvm_rma_info *ri); |
142 | extern struct kvmppc_linear_info *kvm_alloc_hpt(void); | 142 | extern struct page *kvm_alloc_hpt(unsigned long nr_pages); |
143 | extern void kvm_release_hpt(struct kvmppc_linear_info *li); | 143 | extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); |
144 | extern int kvmppc_core_init_vm(struct kvm *kvm); | 144 | extern int kvmppc_core_init_vm(struct kvm *kvm); |
145 | extern void kvmppc_core_destroy_vm(struct kvm *kvm); | 145 | extern void kvmppc_core_destroy_vm(struct kvm *kvm); |
146 | extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free, | 146 | extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free, |
@@ -261,6 +261,7 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
261 | struct openpic; | 261 | struct openpic; |
262 | 262 | ||
263 | #ifdef CONFIG_KVM_BOOK3S_64_HV | 263 | #ifdef CONFIG_KVM_BOOK3S_64_HV |
264 | extern void kvm_cma_reserve(void) __init; | ||
264 | static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) | 265 | static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) |
265 | { | 266 | { |
266 | paca[cpu].kvm_hstate.xics_phys = addr; | 267 | paca[cpu].kvm_hstate.xics_phys = addr; |
@@ -281,13 +282,12 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
281 | } | 282 | } |
282 | 283 | ||
283 | extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu); | 284 | extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu); |
284 | extern void kvm_linear_init(void); | ||
285 | 285 | ||
286 | #else | 286 | #else |
287 | static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) | 287 | static inline void __init kvm_cma_reserve(void) |
288 | {} | 288 | {} |
289 | 289 | ||
290 | static inline void kvm_linear_init(void) | 290 | static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) |
291 | {} | 291 | {} |
292 | 292 | ||
293 | static inline u32 kvmppc_get_xics_latch(void) | 293 | static inline u32 kvmppc_get_xics_latch(void) |
@@ -394,10 +394,15 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
394 | } | 394 | } |
395 | } | 395 | } |
396 | 396 | ||
397 | /* Please call after prepare_to_enter. This function puts the lazy ee state | 397 | /* |
398 | back to normal mode, without actually enabling interrupts. */ | 398 | * Please call after prepare_to_enter. This function puts the lazy ee and irq |
399 | static inline void kvmppc_lazy_ee_enable(void) | 399 | * disabled tracking state back to normal mode, without actually enabling |
400 | * interrupts. | ||
401 | */ | ||
402 | static inline void kvmppc_fix_ee_before_entry(void) | ||
400 | { | 403 | { |
404 | trace_hardirqs_on(); | ||
405 | |||
401 | #ifdef CONFIG_PPC64 | 406 | #ifdef CONFIG_PPC64 |
402 | /* Only need to enable IRQs by hard enabling them after this */ | 407 | /* Only need to enable IRQs by hard enabling them after this */ |
403 | local_paca->irq_happened = 0; | 408 | local_paca->irq_happened = 0; |
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8207459efe56..d8958be5f31a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -454,6 +454,7 @@ int main(void)
454 | DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); | 454 | DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); |
455 | DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); | 455 | DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); |
456 | #endif | 456 | #endif |
457 | DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3)); | ||
457 | DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); | 458 | DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); |
458 | DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); | 459 | DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); |
459 | DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6)); | 460 | DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6)); |
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 389fb8077cc9..fe6a58c9f0b7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -229,6 +229,8 @@ void __init early_setup(unsigned long dt_ptr)
229 | /* Initialize the hash table or TLB handling */ | 229 | /* Initialize the hash table or TLB handling */ |
230 | early_init_mmu(); | 230 | early_init_mmu(); |
231 | 231 | ||
232 | kvm_cma_reserve(); | ||
233 | |||
232 | /* | 234 | /* |
233 | * Reserve any gigantic pages requested on the command line. | 235 | * Reserve any gigantic pages requested on the command line. |
234 | * memblock needs to have been initialized by the time this is | 236 | * memblock needs to have been initialized by the time this is |
@@ -609,8 +611,6 @@ void __init setup_arch(char **cmdline_p) | |||
609 | /* Initialize the MMU context management stuff */ | 611 | /* Initialize the MMU context management stuff */ |
610 | mmu_context_init(); | 612 | mmu_context_init(); |
611 | 613 | ||
612 | kvm_linear_init(); | ||
613 | |||
614 | /* Interrupt code needs to be 64K-aligned */ | 614 | /* Interrupt code needs to be 64K-aligned */ |
615 | if ((unsigned long)_stext & 0xffff) | 615 | if ((unsigned long)_stext & 0xffff) |
616 | panic("Kernelbase not 64K-aligned (0x%lx)!\n", | 616 | panic("Kernelbase not 64K-aligned (0x%lx)!\n", |
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index eb643f862579..ffaef2cb101a 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig | |||
@@ -72,6 +72,7 @@ config KVM_BOOK3S_64_HV | |||
72 | bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" | 72 | bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" |
73 | depends on KVM_BOOK3S_64 | 73 | depends on KVM_BOOK3S_64 |
74 | select MMU_NOTIFIER | 74 | select MMU_NOTIFIER |
75 | select CMA | ||
75 | ---help--- | 76 | ---help--- |
76 | Support running unmodified book3s_64 guest kernels in | 77 | Support running unmodified book3s_64 guest kernels in |
77 | virtual machines on POWER7 and PPC970 processors that have | 78 | virtual machines on POWER7 and PPC970 processors that have |
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 008cd856c5b5..6646c952c5e3 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile | |||
@@ -81,6 +81,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ | |||
81 | book3s_64_vio_hv.o \ | 81 | book3s_64_vio_hv.o \ |
82 | book3s_hv_ras.o \ | 82 | book3s_hv_ras.o \ |
83 | book3s_hv_builtin.o \ | 83 | book3s_hv_builtin.o \ |
84 | book3s_hv_cma.o \ | ||
84 | $(kvm-book3s_64-builtin-xics-objs-y) | 85 | $(kvm-book3s_64-builtin-xics-objs-y) |
85 | 86 | ||
86 | kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ | 87 | kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ |
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 739bfbadb85e..7e345e00661a 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c | |||
@@ -182,10 +182,13 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, | |||
182 | hva_t ptegp; | 182 | hva_t ptegp; |
183 | u64 pteg[16]; | 183 | u64 pteg[16]; |
184 | u64 avpn = 0; | 184 | u64 avpn = 0; |
185 | u64 v, r; | ||
186 | u64 v_val, v_mask; | ||
187 | u64 eaddr_mask; | ||
185 | int i; | 188 | int i; |
186 | u8 key = 0; | 189 | u8 pp, key = 0; |
187 | bool found = false; | 190 | bool found = false; |
188 | int second = 0; | 191 | bool second = false; |
189 | ulong mp_ea = vcpu->arch.magic_page_ea; | 192 | ulong mp_ea = vcpu->arch.magic_page_ea; |
190 | 193 | ||
191 | /* Magic page override */ | 194 | /* Magic page override */ |
@@ -208,8 +211,16 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, | |||
208 | goto no_seg_found; | 211 | goto no_seg_found; |
209 | 212 | ||
210 | avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); | 213 | avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); |
214 | v_val = avpn & HPTE_V_AVPN; | ||
215 | |||
211 | if (slbe->tb) | 216 | if (slbe->tb) |
212 | avpn |= SLB_VSID_B_1T; | 217 | v_val |= SLB_VSID_B_1T; |
218 | if (slbe->large) | ||
219 | v_val |= HPTE_V_LARGE; | ||
220 | v_val |= HPTE_V_VALID; | ||
221 | |||
222 | v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID | | ||
223 | HPTE_V_SECONDARY; | ||
213 | 224 | ||
214 | do_second: | 225 | do_second: |
215 | ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); | 226 | ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); |
@@ -227,91 +238,74 @@ do_second: | |||
227 | key = 4; | 238 | key = 4; |
228 | 239 | ||
229 | for (i=0; i<16; i+=2) { | 240 | for (i=0; i<16; i+=2) { |
230 | u64 v = pteg[i]; | 241 | /* Check all relevant fields of 1st dword */ |
231 | u64 r = pteg[i+1]; | 242 | if ((pteg[i] & v_mask) == v_val) { |
232 | |||
233 | /* Valid check */ | ||
234 | if (!(v & HPTE_V_VALID)) | ||
235 | continue; | ||
236 | /* Hash check */ | ||
237 | if ((v & HPTE_V_SECONDARY) != second) | ||
238 | continue; | ||
239 | |||
240 | /* AVPN compare */ | ||
241 | if (HPTE_V_COMPARE(avpn, v)) { | ||
242 | u8 pp = (r & HPTE_R_PP) | key; | ||
243 | int eaddr_mask = 0xFFF; | ||
244 | |||
245 | gpte->eaddr = eaddr; | ||
246 | gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, | ||
247 | eaddr, | ||
248 | data); | ||
249 | if (slbe->large) | ||
250 | eaddr_mask = 0xFFFFFF; | ||
251 | gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask); | ||
252 | gpte->may_execute = ((r & HPTE_R_N) ? false : true); | ||
253 | gpte->may_read = false; | ||
254 | gpte->may_write = false; | ||
255 | |||
256 | switch (pp) { | ||
257 | case 0: | ||
258 | case 1: | ||
259 | case 2: | ||
260 | case 6: | ||
261 | gpte->may_write = true; | ||
262 | /* fall through */ | ||
263 | case 3: | ||
264 | case 5: | ||
265 | case 7: | ||
266 | gpte->may_read = true; | ||
267 | break; | ||
268 | } | ||
269 | |||
270 | dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " | ||
271 | "-> 0x%lx\n", | ||
272 | eaddr, avpn, gpte->vpage, gpte->raddr); | ||
273 | found = true; | 243 | found = true; |
274 | break; | 244 | break; |
275 | } | 245 | } |
276 | } | 246 | } |
277 | 247 | ||
278 | /* Update PTE R and C bits, so the guest's swapper knows we used the | 248 | if (!found) { |
279 | * page */ | 249 | if (second) |
280 | if (found) { | 250 | goto no_page_found; |
281 | u32 oldr = pteg[i+1]; | 251 | v_val |= HPTE_V_SECONDARY; |
252 | second = true; | ||
253 | goto do_second; | ||
254 | } | ||
282 | 255 | ||
283 | if (gpte->may_read) { | 256 | v = pteg[i]; |
284 | /* Set the accessed flag */ | 257 | r = pteg[i+1]; |
285 | pteg[i+1] |= HPTE_R_R; | 258 | pp = (r & HPTE_R_PP) | key; |
286 | } | 259 | eaddr_mask = 0xFFF; |
287 | if (gpte->may_write) { | 260 | |
288 | /* Set the dirty flag */ | 261 | gpte->eaddr = eaddr; |
289 | pteg[i+1] |= HPTE_R_C; | 262 | gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); |
290 | } else { | 263 | if (slbe->large) |
291 | dprintk("KVM: Mapping read-only page!\n"); | 264 | eaddr_mask = 0xFFFFFF; |
292 | } | 265 | gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask); |
266 | gpte->may_execute = ((r & HPTE_R_N) ? false : true); | ||
267 | gpte->may_read = false; | ||
268 | gpte->may_write = false; | ||
269 | |||
270 | switch (pp) { | ||
271 | case 0: | ||
272 | case 1: | ||
273 | case 2: | ||
274 | case 6: | ||
275 | gpte->may_write = true; | ||
276 | /* fall through */ | ||
277 | case 3: | ||
278 | case 5: | ||
279 | case 7: | ||
280 | gpte->may_read = true; | ||
281 | break; | ||
282 | } | ||
293 | 283 | ||
294 | /* Write back into the PTEG */ | 284 | dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " |
295 | if (pteg[i+1] != oldr) | 285 | "-> 0x%lx\n", |
296 | copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); | 286 | eaddr, avpn, gpte->vpage, gpte->raddr); |
297 | 287 | ||
298 | if (!gpte->may_read) | 288 | /* Update PTE R and C bits, so the guest's swapper knows we used the |
299 | return -EPERM; | 289 | * page */ |
300 | return 0; | 290 | if (gpte->may_read) { |
301 | } else { | 291 | /* Set the accessed flag */ |
302 | dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " | 292 | r |= HPTE_R_R; |
303 | "ptegp=0x%lx)\n", | 293 | } |
304 | eaddr, to_book3s(vcpu)->sdr1, ptegp); | 294 | if (data && gpte->may_write) { |
305 | for (i = 0; i < 16; i += 2) | 295 | /* Set the dirty flag -- XXX even if not writing */ |
306 | dprintk(" %02d: 0x%llx - 0x%llx (0x%llx)\n", | 296 | r |= HPTE_R_C; |
307 | i, pteg[i], pteg[i+1], avpn); | 297 | } |
308 | 298 | ||
309 | if (!second) { | 299 | /* Write back into the PTEG */ |
310 | second = HPTE_V_SECONDARY; | 300 | if (pteg[i+1] != r) { |
311 | goto do_second; | 301 | pteg[i+1] = r; |
312 | } | 302 | copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); |
313 | } | 303 | } |
314 | 304 | ||
305 | if (!gpte->may_read) | ||
306 | return -EPERM; | ||
307 | return 0; | ||
308 | |||
315 | no_page_found: | 309 | no_page_found: |
316 | return -ENOENT; | 310 | return -ENOENT; |
317 | 311 | ||
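The rewritten translation loop builds the expected first doubleword (v_val) and a mask of the fields that matter (v_mask) once per lookup, so every PTEG entry is matched with a single compare instead of separate valid, secondary-hash and AVPN checks; access rights are then decoded from the PP bits combined with the key, exactly as before. A freestanding sketch of that permission decode is shown below; the HPTE_R_* constants and the gpte struct are simplified stand-ins, not the kernel definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HPTE_R_PP  0x3ULL   /* low two PP bits (simplified stand-in) */
#define HPTE_R_N   0x4ULL   /* no-execute bit (simplified stand-in) */

struct gpte { bool may_read, may_write, may_execute; };

/*
 * Decode access rights from the PP bits of the second doubleword plus the
 * key value (0 or 4), mirroring the switch in kvmppc_mmu_book3s_64_xlate().
 */
static void decode_pp(uint64_t r, uint8_t key, struct gpte *g)
{
        uint8_t pp = (r & HPTE_R_PP) | key;

        g->may_execute = !(r & HPTE_R_N);
        g->may_read = false;
        g->may_write = false;

        switch (pp) {
        case 0: case 1: case 2: case 6:
                g->may_write = true;
                /* fall through */
        case 3: case 5: case 7:
                g->may_read = true;
                break;
        }
}

int main(void)
{
        struct gpte g;

        decode_pp(0x2, 4, &g);   /* pp bits 2, key set -> pp|key == 6 */
        printf("r=%d w=%d x=%d\n", g.may_read, g.may_write, g.may_execute);
        return 0;
}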
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 710d31317d81..043eec8461e7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c | |||
@@ -37,6 +37,8 @@ | |||
37 | #include <asm/ppc-opcode.h> | 37 | #include <asm/ppc-opcode.h> |
38 | #include <asm/cputable.h> | 38 | #include <asm/cputable.h> |
39 | 39 | ||
40 | #include "book3s_hv_cma.h" | ||
41 | |||
40 | /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ | 42 | /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ |
41 | #define MAX_LPID_970 63 | 43 | #define MAX_LPID_970 63 |
42 | 44 | ||
@@ -52,8 +54,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) | |||
52 | { | 54 | { |
53 | unsigned long hpt; | 55 | unsigned long hpt; |
54 | struct revmap_entry *rev; | 56 | struct revmap_entry *rev; |
55 | struct kvmppc_linear_info *li; | 57 | struct page *page = NULL; |
56 | long order = kvm_hpt_order; | 58 | long order = KVM_DEFAULT_HPT_ORDER; |
57 | 59 | ||
58 | if (htab_orderp) { | 60 | if (htab_orderp) { |
59 | order = *htab_orderp; | 61 | order = *htab_orderp; |
@@ -61,26 +63,23 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) | |||
61 | order = PPC_MIN_HPT_ORDER; | 63 | order = PPC_MIN_HPT_ORDER; |
62 | } | 64 | } |
63 | 65 | ||
66 | kvm->arch.hpt_cma_alloc = 0; | ||
64 | /* | 67 | /* |
65 | * If the user wants a different size from default, | ||
66 | * try first to allocate it from the kernel page allocator. | 68 | * try first to allocate it from the kernel page allocator. |
69 | * We keep the CMA reserved for failed allocation. | ||
67 | */ | 70 | */ |
68 | hpt = 0; | 71 | hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT | |
69 | if (order != kvm_hpt_order) { | 72 | __GFP_NOWARN, order - PAGE_SHIFT); |
70 | hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| | ||
71 | __GFP_NOWARN, order - PAGE_SHIFT); | ||
72 | if (!hpt) | ||
73 | --order; | ||
74 | } | ||
75 | 73 | ||
76 | /* Next try to allocate from the preallocated pool */ | 74 | /* Next try to allocate from the preallocated pool */ |
77 | if (!hpt) { | 75 | if (!hpt) { |
78 | li = kvm_alloc_hpt(); | 76 | VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); |
79 | if (li) { | 77 | page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); |
80 | hpt = (ulong)li->base_virt; | 78 | if (page) { |
81 | kvm->arch.hpt_li = li; | 79 | hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); |
82 | order = kvm_hpt_order; | 80 | kvm->arch.hpt_cma_alloc = 1; |
83 | } | 81 | } else |
82 | --order; | ||
84 | } | 83 | } |
85 | 84 | ||
86 | /* Lastly try successively smaller sizes from the page allocator */ | 85 | /* Lastly try successively smaller sizes from the page allocator */ |
@@ -118,8 +117,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) | |||
118 | return 0; | 117 | return 0; |
119 | 118 | ||
120 | out_freehpt: | 119 | out_freehpt: |
121 | if (kvm->arch.hpt_li) | 120 | if (kvm->arch.hpt_cma_alloc) |
122 | kvm_release_hpt(kvm->arch.hpt_li); | 121 | kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); |
123 | else | 122 | else |
124 | free_pages(hpt, order - PAGE_SHIFT); | 123 | free_pages(hpt, order - PAGE_SHIFT); |
125 | return -ENOMEM; | 124 | return -ENOMEM; |
@@ -165,8 +164,9 @@ void kvmppc_free_hpt(struct kvm *kvm) | |||
165 | { | 164 | { |
166 | kvmppc_free_lpid(kvm->arch.lpid); | 165 | kvmppc_free_lpid(kvm->arch.lpid); |
167 | vfree(kvm->arch.revmap); | 166 | vfree(kvm->arch.revmap); |
168 | if (kvm->arch.hpt_li) | 167 | if (kvm->arch.hpt_cma_alloc) |
169 | kvm_release_hpt(kvm->arch.hpt_li); | 168 | kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), |
169 | 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); | ||
170 | else | 170 | else |
171 | free_pages(kvm->arch.hpt_virt, | 171 | free_pages(kvm->arch.hpt_virt, |
172 | kvm->arch.hpt_order - PAGE_SHIFT); | 172 | kvm->arch.hpt_order - PAGE_SHIFT); |
@@ -1579,7 +1579,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) | |||
1579 | ctx->first_pass = 1; | 1579 | ctx->first_pass = 1; |
1580 | 1580 | ||
1581 | rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; | 1581 | rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; |
1582 | ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag); | 1582 | ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); |
1583 | if (ret < 0) { | 1583 | if (ret < 0) { |
1584 | kvm_put_kvm(kvm); | 1584 | kvm_put_kvm(kvm); |
1585 | return ret; | 1585 | return ret; |
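With the CMA-backed pool, kvmppc_alloc_hpt() now tries the regular page allocator first, falls back to the reserved CMA area, and only then retries with smaller orders, recording hpt_cma_alloc so the release paths above can pick kvm_release_hpt() or free_pages() accordingly. The following is a rough userspace model of that fallback policy; buddy_alloc(), cma_alloc() and MIN_ORDER are invented stubs for illustration, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

#define MIN_ORDER 18   /* assumed smallest acceptable HPT order */

/* Invented stub allocators: return true on success. */
static bool buddy_alloc(int order) { return order <= 20; }
static bool cma_alloc(int order)   { return order <= 24; }

/* Buddy first, then CMA at the requested order, then shrink the order. */
static int alloc_hpt(int order, bool *from_cma)
{
        *from_cma = false;
        if (buddy_alloc(order))
                return order;
        if (cma_alloc(order)) {
                *from_cma = true;
                return order;
        }
        while (--order >= MIN_ORDER)
                if (buddy_alloc(order))
                        return order;
        return -1;
}

int main(void)
{
        bool cma;
        int got = alloc_hpt(22, &cma);   /* ask for a 4MB (order 22) HPT */

        printf("order=%d from_cma=%d\n", got, cma);
        return 0;
}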
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index b2d3f3b2de72..54cf9bc94dad 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c | |||
@@ -136,7 +136,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, | |||
136 | mutex_unlock(&kvm->lock); | 136 | mutex_unlock(&kvm->lock); |
137 | 137 | ||
138 | return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, | 138 | return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, |
139 | stt, O_RDWR); | 139 | stt, O_RDWR | O_CLOEXEC); |
140 | 140 | ||
141 | fail: | 141 | fail: |
142 | if (stt) { | 142 | if (stt) { |
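Several of the anon-inode descriptors in this series (kvm-htab above, kvm-spapr-tce here, kvm-rma later) now pass O_CLOEXEC so they are not silently inherited across exec(). The effect of the flag is generic and can be seen with an ordinary file descriptor; the snippet below is a plain userspace illustration, not KVM code.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Without O_CLOEXEC this descriptor would survive an execve(). */
        int fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        int flags = fcntl(fd, F_GETFD);
        printf("FD_CLOEXEC set: %s\n", (flags & FD_CLOEXEC) ? "yes" : "no");
        close(fd);
        return 0;
}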
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 1f6344c4408d..360ce68c9809 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c | |||
@@ -458,6 +458,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) | |||
458 | case SPRN_PMC4_GEKKO: | 458 | case SPRN_PMC4_GEKKO: |
459 | case SPRN_WPAR_GEKKO: | 459 | case SPRN_WPAR_GEKKO: |
460 | case SPRN_MSSSR0: | 460 | case SPRN_MSSSR0: |
461 | case SPRN_DABR: | ||
461 | break; | 462 | break; |
462 | unprivileged: | 463 | unprivileged: |
463 | default: | 464 | default: |
@@ -555,6 +556,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) | |||
555 | case SPRN_PMC4_GEKKO: | 556 | case SPRN_PMC4_GEKKO: |
556 | case SPRN_WPAR_GEKKO: | 557 | case SPRN_WPAR_GEKKO: |
557 | case SPRN_MSSSR0: | 558 | case SPRN_MSSSR0: |
559 | case SPRN_DABR: | ||
558 | *spr_val = 0; | 560 | *spr_val = 0; |
559 | break; | 561 | break; |
560 | default: | 562 | default: |
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7629cd3eb91a..b0ee3bc9ca76 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c | |||
@@ -680,13 +680,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
680 | } | 680 | } |
681 | 681 | ||
682 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | 682 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, |
683 | struct kvm_sregs *sregs) | 683 | struct kvm_sregs *sregs) |
684 | { | 684 | { |
685 | int i; | 685 | int i; |
686 | 686 | ||
687 | sregs->pvr = vcpu->arch.pvr; | ||
688 | |||
689 | memset(sregs, 0, sizeof(struct kvm_sregs)); | 687 | memset(sregs, 0, sizeof(struct kvm_sregs)); |
688 | sregs->pvr = vcpu->arch.pvr; | ||
690 | for (i = 0; i < vcpu->arch.slb_max; i++) { | 689 | for (i = 0; i < vcpu->arch.slb_max; i++) { |
691 | sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; | 690 | sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; |
692 | sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; | 691 | sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; |
@@ -696,7 +695,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
696 | } | 695 | } |
697 | 696 | ||
698 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | 697 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, |
699 | struct kvm_sregs *sregs) | 698 | struct kvm_sregs *sregs) |
700 | { | 699 | { |
701 | int i, j; | 700 | int i, j; |
702 | 701 | ||
@@ -1511,10 +1510,10 @@ static inline int lpcr_rmls(unsigned long rma_size) | |||
1511 | 1510 | ||
1512 | static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1511 | static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
1513 | { | 1512 | { |
1514 | struct kvmppc_linear_info *ri = vma->vm_file->private_data; | ||
1515 | struct page *page; | 1513 | struct page *page; |
1514 | struct kvm_rma_info *ri = vma->vm_file->private_data; | ||
1516 | 1515 | ||
1517 | if (vmf->pgoff >= ri->npages) | 1516 | if (vmf->pgoff >= kvm_rma_pages) |
1518 | return VM_FAULT_SIGBUS; | 1517 | return VM_FAULT_SIGBUS; |
1519 | 1518 | ||
1520 | page = pfn_to_page(ri->base_pfn + vmf->pgoff); | 1519 | page = pfn_to_page(ri->base_pfn + vmf->pgoff); |
@@ -1536,7 +1535,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) | |||
1536 | 1535 | ||
1537 | static int kvm_rma_release(struct inode *inode, struct file *filp) | 1536 | static int kvm_rma_release(struct inode *inode, struct file *filp) |
1538 | { | 1537 | { |
1539 | struct kvmppc_linear_info *ri = filp->private_data; | 1538 | struct kvm_rma_info *ri = filp->private_data; |
1540 | 1539 | ||
1541 | kvm_release_rma(ri); | 1540 | kvm_release_rma(ri); |
1542 | return 0; | 1541 | return 0; |
@@ -1549,18 +1548,27 @@ static const struct file_operations kvm_rma_fops = { | |||
1549 | 1548 | ||
1550 | long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) | 1549 | long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) |
1551 | { | 1550 | { |
1552 | struct kvmppc_linear_info *ri; | ||
1553 | long fd; | 1551 | long fd; |
1552 | struct kvm_rma_info *ri; | ||
1553 | /* | ||
1554 | * Only do this on PPC970 in HV mode | ||
1555 | */ | ||
1556 | if (!cpu_has_feature(CPU_FTR_HVMODE) || | ||
1557 | !cpu_has_feature(CPU_FTR_ARCH_201)) | ||
1558 | return -EINVAL; | ||
1559 | |||
1560 | if (!kvm_rma_pages) | ||
1561 | return -EINVAL; | ||
1554 | 1562 | ||
1555 | ri = kvm_alloc_rma(); | 1563 | ri = kvm_alloc_rma(); |
1556 | if (!ri) | 1564 | if (!ri) |
1557 | return -ENOMEM; | 1565 | return -ENOMEM; |
1558 | 1566 | ||
1559 | fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR); | 1567 | fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC); |
1560 | if (fd < 0) | 1568 | if (fd < 0) |
1561 | kvm_release_rma(ri); | 1569 | kvm_release_rma(ri); |
1562 | 1570 | ||
1563 | ret->rma_size = ri->npages << PAGE_SHIFT; | 1571 | ret->rma_size = kvm_rma_pages << PAGE_SHIFT; |
1564 | return fd; | 1572 | return fd; |
1565 | } | 1573 | } |
1566 | 1574 | ||
@@ -1725,7 +1733,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) | |||
1725 | { | 1733 | { |
1726 | int err = 0; | 1734 | int err = 0; |
1727 | struct kvm *kvm = vcpu->kvm; | 1735 | struct kvm *kvm = vcpu->kvm; |
1728 | struct kvmppc_linear_info *ri = NULL; | 1736 | struct kvm_rma_info *ri = NULL; |
1729 | unsigned long hva; | 1737 | unsigned long hva; |
1730 | struct kvm_memory_slot *memslot; | 1738 | struct kvm_memory_slot *memslot; |
1731 | struct vm_area_struct *vma; | 1739 | struct vm_area_struct *vma; |
@@ -1803,7 +1811,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) | |||
1803 | 1811 | ||
1804 | } else { | 1812 | } else { |
1805 | /* Set up to use an RMO region */ | 1813 | /* Set up to use an RMO region */ |
1806 | rma_size = ri->npages; | 1814 | rma_size = kvm_rma_pages; |
1807 | if (rma_size > memslot->npages) | 1815 | if (rma_size > memslot->npages) |
1808 | rma_size = memslot->npages; | 1816 | rma_size = memslot->npages; |
1809 | rma_size <<= PAGE_SHIFT; | 1817 | rma_size <<= PAGE_SHIFT; |
@@ -1831,14 +1839,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) | |||
1831 | /* POWER7 */ | 1839 | /* POWER7 */ |
1832 | lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); | 1840 | lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); |
1833 | lpcr |= rmls << LPCR_RMLS_SH; | 1841 | lpcr |= rmls << LPCR_RMLS_SH; |
1834 | kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; | 1842 | kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT; |
1835 | } | 1843 | } |
1836 | kvm->arch.lpcr = lpcr; | 1844 | kvm->arch.lpcr = lpcr; |
1837 | pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", | 1845 | pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", |
1838 | ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); | 1846 | ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); |
1839 | 1847 | ||
1840 | /* Initialize phys addrs of pages in RMO */ | 1848 | /* Initialize phys addrs of pages in RMO */ |
1841 | npages = ri->npages; | 1849 | npages = kvm_rma_pages; |
1842 | porder = __ilog2(npages); | 1850 | porder = __ilog2(npages); |
1843 | physp = memslot->arch.slot_phys; | 1851 | physp = memslot->arch.slot_phys; |
1844 | if (physp) { | 1852 | if (physp) { |
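Two separate fixes sit in the book3s_hv.c hunks: kvm_arch_vcpu_ioctl_get_sregs() used to store the PVR and then wipe it with the following memset(), so the assignment now comes after the clear, and the RMA paths switch from kvmppc_linear_info to the CMA-backed kvm_rma_info with the size taken from the global kvm_rma_pages. The memset-ordering bug is the classic pattern below; the struct and values are only for illustration.

#include <stdio.h>
#include <string.h>

struct sregs { unsigned int pvr; unsigned int slb[8]; };

int main(void)
{
        struct sregs s;

        /* Buggy order: the memset() erases the value just written. */
        s.pvr = 0x4b0201;
        memset(&s, 0, sizeof(s));
        printf("buggy: pvr=0x%x\n", s.pvr);   /* prints 0x0 */

        /* Fixed order: clear first, then fill in the fields. */
        memset(&s, 0, sizeof(s));
        s.pvr = 0x4b0201;
        printf("fixed: pvr=0x%x\n", s.pvr);   /* prints 0x4b0201 */
        return 0;
}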
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index ec0a9e5de100..8cd0daebb82d 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c | |||
@@ -13,33 +13,34 @@ | |||
13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
14 | #include <linux/bootmem.h> | 14 | #include <linux/bootmem.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/memblock.h> | ||
17 | #include <linux/sizes.h> | ||
16 | 18 | ||
17 | #include <asm/cputable.h> | 19 | #include <asm/cputable.h> |
18 | #include <asm/kvm_ppc.h> | 20 | #include <asm/kvm_ppc.h> |
19 | #include <asm/kvm_book3s.h> | 21 | #include <asm/kvm_book3s.h> |
20 | 22 | ||
21 | #define KVM_LINEAR_RMA 0 | 23 | #include "book3s_hv_cma.h" |
22 | #define KVM_LINEAR_HPT 1 | 24 | /* |
23 | 25 | * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) | |
24 | static void __init kvm_linear_init_one(ulong size, int count, int type); | 26 | * should be power of 2. |
25 | static struct kvmppc_linear_info *kvm_alloc_linear(int type); | 27 | */ |
26 | static void kvm_release_linear(struct kvmppc_linear_info *ri); | 28 | #define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */ |
27 | 29 | /* | |
28 | int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER; | 30 | * By default we reserve 5% of memory for hash pagetable allocation. |
29 | EXPORT_SYMBOL_GPL(kvm_hpt_order); | 31 | */ |
30 | 32 | static unsigned long kvm_cma_resv_ratio = 5; | |
31 | /*************** RMA *************/ | ||
32 | |||
33 | /* | 33 | /* |
34 | * This maintains a list of RMAs (real mode areas) for KVM guests to use. | 34 | * We allocate RMAs (real mode areas) for KVM guests from the KVM CMA area. |
35 | * Each RMA has to be physically contiguous and of a size that the | 35 | * Each RMA has to be physically contiguous and of a size that the |
36 | * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, | 36 | * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, |
37 | * and other larger sizes. Since we are unlikely to be able to allocate that | 37 | * and other larger sizes. Since we are unlikely to be able to allocate that |
38 | * much physically contiguous memory after the system is up and running, | 38 | * much physically contiguous memory after the system is up and running, |
39 | * we preallocate a set of RMAs in early boot for KVM to use. | 39 | * we preallocate a set of RMAs in early boot using CMA. |
40 | * The RMA size should be a power of 2. | ||
40 | */ | 41 | */ |
41 | static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ | 42 | unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ |
42 | static unsigned long kvm_rma_count; | 43 | EXPORT_SYMBOL_GPL(kvm_rma_pages); |
43 | 44 | ||
44 | /* Work out RMLS (real mode limit selector) field value for a given RMA size. | 45 | /* Work out RMLS (real mode limit selector) field value for a given RMA size. |
45 | Assumes POWER7 or PPC970. */ | 46 | Assumes POWER7 or PPC970. */ |
@@ -69,165 +70,114 @@ static inline int lpcr_rmls(unsigned long rma_size) | |||
69 | 70 | ||
70 | static int __init early_parse_rma_size(char *p) | 71 | static int __init early_parse_rma_size(char *p) |
71 | { | 72 | { |
72 | if (!p) | 73 | unsigned long kvm_rma_size; |
73 | return 1; | ||
74 | 74 | ||
75 | pr_debug("%s(%s)\n", __func__, p); | ||
76 | if (!p) | ||
77 | return -EINVAL; | ||
75 | kvm_rma_size = memparse(p, &p); | 78 | kvm_rma_size = memparse(p, &p); |
76 | 79 | /* | |
80 | * Check that the requested size is one supported in hardware | ||
81 | */ | ||
82 | if (lpcr_rmls(kvm_rma_size) < 0) { | ||
83 | pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); | ||
84 | return -EINVAL; | ||
85 | } | ||
86 | kvm_rma_pages = kvm_rma_size >> PAGE_SHIFT; | ||
77 | return 0; | 87 | return 0; |
78 | } | 88 | } |
79 | early_param("kvm_rma_size", early_parse_rma_size); | 89 | early_param("kvm_rma_size", early_parse_rma_size); |
80 | 90 | ||
81 | static int __init early_parse_rma_count(char *p) | 91 | struct kvm_rma_info *kvm_alloc_rma() |
82 | { | 92 | { |
83 | if (!p) | 93 | struct page *page; |
84 | return 1; | 94 | struct kvm_rma_info *ri; |
85 | 95 | ||
86 | kvm_rma_count = simple_strtoul(p, NULL, 0); | 96 | ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); |
87 | 97 | if (!ri) | |
88 | return 0; | 98 | return NULL; |
89 | } | 99 | page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); |
90 | early_param("kvm_rma_count", early_parse_rma_count); | 100 | if (!page) |
91 | 101 | goto err_out; | |
92 | struct kvmppc_linear_info *kvm_alloc_rma(void) | 102 | atomic_set(&ri->use_count, 1); |
93 | { | 103 | ri->base_pfn = page_to_pfn(page); |
94 | return kvm_alloc_linear(KVM_LINEAR_RMA); | 104 | return ri; |
105 | err_out: | ||
106 | kfree(ri); | ||
107 | return NULL; | ||
95 | } | 108 | } |
96 | EXPORT_SYMBOL_GPL(kvm_alloc_rma); | 109 | EXPORT_SYMBOL_GPL(kvm_alloc_rma); |
97 | 110 | ||
98 | void kvm_release_rma(struct kvmppc_linear_info *ri) | 111 | void kvm_release_rma(struct kvm_rma_info *ri) |
99 | { | 112 | { |
100 | kvm_release_linear(ri); | 113 | if (atomic_dec_and_test(&ri->use_count)) { |
114 | kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); | ||
115 | kfree(ri); | ||
116 | } | ||
101 | } | 117 | } |
102 | EXPORT_SYMBOL_GPL(kvm_release_rma); | 118 | EXPORT_SYMBOL_GPL(kvm_release_rma); |
103 | 119 | ||
104 | /*************** HPT *************/ | 120 | static int __init early_parse_kvm_cma_resv(char *p) |
105 | |||
106 | /* | ||
107 | * This maintains a list of big linear HPT tables that contain the GVA->HPA | ||
108 | * memory mappings. If we don't reserve those early on, we might not be able | ||
109 | * to get a big (usually 16MB) linear memory region from the kernel anymore. | ||
110 | */ | ||
111 | |||
112 | static unsigned long kvm_hpt_count; | ||
113 | |||
114 | static int __init early_parse_hpt_count(char *p) | ||
115 | { | 121 | { |
122 | pr_debug("%s(%s)\n", __func__, p); | ||
116 | if (!p) | 123 | if (!p) |
117 | return 1; | 124 | return -EINVAL; |
118 | 125 | return kstrtoul(p, 0, &kvm_cma_resv_ratio); | |
119 | kvm_hpt_count = simple_strtoul(p, NULL, 0); | ||
120 | |||
121 | return 0; | ||
122 | } | 126 | } |
123 | early_param("kvm_hpt_count", early_parse_hpt_count); | 127 | early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); |
124 | 128 | ||
125 | struct kvmppc_linear_info *kvm_alloc_hpt(void) | 129 | struct page *kvm_alloc_hpt(unsigned long nr_pages) |
126 | { | 130 | { |
127 | return kvm_alloc_linear(KVM_LINEAR_HPT); | 131 | unsigned long align_pages = HPT_ALIGN_PAGES; |
132 | |||
133 | /* Old CPUs require HPT aligned on a multiple of its size */ | ||
134 | if (!cpu_has_feature(CPU_FTR_ARCH_206)) | ||
135 | align_pages = nr_pages; | ||
136 | return kvm_alloc_cma(nr_pages, align_pages); | ||
128 | } | 137 | } |
129 | EXPORT_SYMBOL_GPL(kvm_alloc_hpt); | 138 | EXPORT_SYMBOL_GPL(kvm_alloc_hpt); |
130 | 139 | ||
131 | void kvm_release_hpt(struct kvmppc_linear_info *li) | 140 | void kvm_release_hpt(struct page *page, unsigned long nr_pages) |
132 | { | 141 | { |
133 | kvm_release_linear(li); | 142 | kvm_release_cma(page, nr_pages); |
134 | } | 143 | } |
135 | EXPORT_SYMBOL_GPL(kvm_release_hpt); | 144 | EXPORT_SYMBOL_GPL(kvm_release_hpt); |
136 | 145 | ||
137 | /*************** generic *************/ | 146 | /** |
138 | 147 | * kvm_cma_reserve() - reserve area for kvm hash pagetable | |
139 | static LIST_HEAD(free_linears); | 148 | * |
140 | static DEFINE_SPINLOCK(linear_lock); | 149 | * This function reserves memory from early allocator. It should be |
141 | 150 | * called by arch specific code once the early allocator (memblock or bootmem) | |
142 | static void __init kvm_linear_init_one(ulong size, int count, int type) | 151 | * has been activated and all other subsystems have already allocated/reserved |
143 | { | 152 | * memory. |
144 | unsigned long i; | ||
145 | unsigned long j, npages; | ||
146 | void *linear; | ||
147 | struct page *pg; | ||
148 | const char *typestr; | ||
149 | struct kvmppc_linear_info *linear_info; | ||
150 | |||
151 | if (!count) | ||
152 | return; | ||
153 | |||
154 | typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT"; | ||
155 | |||
156 | npages = size >> PAGE_SHIFT; | ||
157 | linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); | ||
158 | for (i = 0; i < count; ++i) { | ||
159 | linear = alloc_bootmem_align(size, size); | ||
160 | pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, | ||
161 | size >> 20); | ||
162 | linear_info[i].base_virt = linear; | ||
163 | linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; | ||
164 | linear_info[i].npages = npages; | ||
165 | linear_info[i].type = type; | ||
166 | list_add_tail(&linear_info[i].list, &free_linears); | ||
167 | atomic_set(&linear_info[i].use_count, 0); | ||
168 | |||
169 | pg = pfn_to_page(linear_info[i].base_pfn); | ||
170 | for (j = 0; j < npages; ++j) { | ||
171 | atomic_inc(&pg->_count); | ||
172 | ++pg; | ||
173 | } | ||
174 | } | ||
175 | } | ||
176 | |||
177 | static struct kvmppc_linear_info *kvm_alloc_linear(int type) | ||
178 | { | ||
179 | struct kvmppc_linear_info *ri, *ret; | ||
180 | |||
181 | ret = NULL; | ||
182 | spin_lock(&linear_lock); | ||
183 | list_for_each_entry(ri, &free_linears, list) { | ||
184 | if (ri->type != type) | ||
185 | continue; | ||
186 | |||
187 | list_del(&ri->list); | ||
188 | atomic_inc(&ri->use_count); | ||
189 | memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT); | ||
190 | ret = ri; | ||
191 | break; | ||
192 | } | ||
193 | spin_unlock(&linear_lock); | ||
194 | return ret; | ||
195 | } | ||
196 | |||
197 | static void kvm_release_linear(struct kvmppc_linear_info *ri) | ||
198 | { | ||
199 | if (atomic_dec_and_test(&ri->use_count)) { | ||
200 | spin_lock(&linear_lock); | ||
201 | list_add_tail(&ri->list, &free_linears); | ||
202 | spin_unlock(&linear_lock); | ||
203 | |||
204 | } | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * Called at boot time while the bootmem allocator is active, | ||
209 | * to allocate contiguous physical memory for the hash page | ||
210 | * tables for guests. | ||
211 | */ | 153 | */ |
212 | void __init kvm_linear_init(void) | 154 | void __init kvm_cma_reserve(void) |
213 | { | 155 | { |
214 | /* HPT */ | 156 | unsigned long align_size; |
215 | kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT); | 157 | struct memblock_region *reg; |
216 | 158 | phys_addr_t selected_size = 0; | |
217 | /* RMA */ | 159 | /* |
218 | /* Only do this on PPC970 in HV mode */ | 160 | * We cannot use memblock_phys_mem_size() here, because |
219 | if (!cpu_has_feature(CPU_FTR_HVMODE) || | 161 | * memblock_analyze() has not been called yet. |
220 | !cpu_has_feature(CPU_FTR_ARCH_201)) | 162 | */ |
221 | return; | 163 | for_each_memblock(memory, reg) |
222 | 164 | selected_size += memblock_region_memory_end_pfn(reg) - | |
223 | if (!kvm_rma_size || !kvm_rma_count) | 165 | memblock_region_memory_base_pfn(reg); |
224 | return; | 166 | |
225 | 167 | selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT; | |
226 | /* Check that the requested size is one supported in hardware */ | 168 | if (selected_size) { |
227 | if (lpcr_rmls(kvm_rma_size) < 0) { | 169 | pr_debug("%s: reserving %ld MiB for global area\n", __func__, |
228 | pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); | 170 | (unsigned long)selected_size / SZ_1M); |
229 | return; | 171 | /* |
172 | * Old CPUs require HPT aligned on a multiple of its size. So for them | ||
173 | * make the alignment as max size we could request. | ||
174 | */ | ||
175 | if (!cpu_has_feature(CPU_FTR_ARCH_206)) | ||
176 | align_size = __rounddown_pow_of_two(selected_size); | ||
177 | else | ||
178 | align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; | ||
179 | |||
180 | align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); | ||
181 | kvm_cma_declare_contiguous(selected_size, align_size); | ||
230 | } | 182 | } |
231 | |||
232 | kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA); | ||
233 | } | 183 | } |
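kvm_cma_reserve() sums the memblock regions by hand because memblock_phys_mem_size() is not usable this early, takes kvm_cma_resv_ratio percent of the total (5% by default, overridable with kvm_cma_resv_ratio= on the command line), and chooses an alignment that is either the HPT alignment or, on pre-ARCH_206 CPUs, the reservation rounded down to a power of two, but never smaller than one RMA. The arithmetic alone, with made-up input values, looks roughly like this:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT       16                  /* assuming 64K pages */
#define HPT_ALIGN_BYTES  (256 * 1024)        /* 256K HPT alignment */
#define RMA_BYTES        (128ULL << 20)      /* default 128MB RMA */

static uint64_t rounddown_pow_of_two(uint64_t x)
{
        uint64_t r = 1;

        while (r * 2 <= x)
                r *= 2;
        return r;
}

int main(void)
{
        uint64_t total_pages = (8ULL << 30) >> PAGE_SHIFT;   /* pretend 8GB RAM */
        unsigned long resv_ratio = 5;                        /* percent */
        int cpu_has_arch_206 = 0;                            /* old-CPU case */

        uint64_t selected = (total_pages * resv_ratio / 100) << PAGE_SHIFT;
        uint64_t align = cpu_has_arch_206 ? HPT_ALIGN_BYTES
                                          : rounddown_pow_of_two(selected);
        if (align < RMA_BYTES)
                align = RMA_BYTES;

        printf("reserve %llu MiB aligned to %llu MiB\n",
               (unsigned long long)(selected >> 20),
               (unsigned long long)(align >> 20));
        return 0;
}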
diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c new file mode 100644 index 000000000000..d9d3d8553d51 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_cma.c | |||
@@ -0,0 +1,240 @@ | |||
1 | /* | ||
2 | * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA | ||
3 | * for DMA mapping framework | ||
4 | * | ||
5 | * Copyright IBM Corporation, 2013 | ||
6 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License as | ||
10 | * published by the Free Software Foundation; either version 2 of the | ||
11 | * License or (at your option) any later version of the license. | ||
12 | * | ||
13 | */ | ||
14 | #define pr_fmt(fmt) "kvm_cma: " fmt | ||
15 | |||
16 | #ifdef CONFIG_CMA_DEBUG | ||
17 | #ifndef DEBUG | ||
18 | # define DEBUG | ||
19 | #endif | ||
20 | #endif | ||
21 | |||
22 | #include <linux/memblock.h> | ||
23 | #include <linux/mutex.h> | ||
24 | #include <linux/sizes.h> | ||
25 | #include <linux/slab.h> | ||
26 | |||
27 | #include "book3s_hv_cma.h" | ||
28 | |||
29 | struct kvm_cma { | ||
30 | unsigned long base_pfn; | ||
31 | unsigned long count; | ||
32 | unsigned long *bitmap; | ||
33 | }; | ||
34 | |||
35 | static DEFINE_MUTEX(kvm_cma_mutex); | ||
36 | static struct kvm_cma kvm_cma_area; | ||
37 | |||
38 | /** | ||
39 | * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling | ||
40 | * for kvm hash pagetable | ||
41 | * @size: Size of the reserved memory. | ||
42 | * @alignment: Alignment for the contiguous memory area | ||
43 | * | ||
44 | * This function reserves memory for the kvm CMA area. It should be | ||
45 | * called by arch code while the early allocator (memblock or bootmem) | ||
46 | * is still active. | ||
47 | */ | ||
48 | long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment) | ||
49 | { | ||
50 | long base_pfn; | ||
51 | phys_addr_t addr; | ||
52 | struct kvm_cma *cma = &kvm_cma_area; | ||
53 | |||
54 | pr_debug("%s(size %lx)\n", __func__, (unsigned long)size); | ||
55 | |||
56 | if (!size) | ||
57 | return -EINVAL; | ||
58 | /* | ||
59 | * Sanitise input arguments. | ||
60 | * We should be pageblock aligned for CMA. | ||
61 | */ | ||
62 | alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order)); | ||
63 | size = ALIGN(size, alignment); | ||
64 | /* | ||
65 | * Reserve memory | ||
66 | * Use __memblock_alloc_base() since | ||
67 | * memblock_alloc_base() panic()s. | ||
68 | */ | ||
69 | addr = __memblock_alloc_base(size, alignment, 0); | ||
70 | if (!addr) { | ||
71 | base_pfn = -ENOMEM; | ||
72 | goto err; | ||
73 | } else | ||
74 | base_pfn = PFN_DOWN(addr); | ||
75 | |||
76 | /* | ||
77 | * Each reserved area must be initialised later, when more kernel | ||
78 | * subsystems (like slab allocator) are available. | ||
79 | */ | ||
80 | cma->base_pfn = base_pfn; | ||
81 | cma->count = size >> PAGE_SHIFT; | ||
82 | pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M); | ||
83 | return 0; | ||
84 | err: | ||
85 | pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); | ||
86 | return base_pfn; | ||
87 | } | ||
88 | |||
89 | /** | ||
90 | * kvm_alloc_cma() - allocate pages from contiguous area | ||
91 | * @nr_pages: Requested number of pages. | ||
92 | * @align_pages: Requested alignment in number of pages | ||
93 | * | ||
94 | * This function allocates memory buffer for hash pagetable. | ||
95 | */ | ||
96 | struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages) | ||
97 | { | ||
98 | int ret; | ||
99 | struct page *page = NULL; | ||
100 | struct kvm_cma *cma = &kvm_cma_area; | ||
101 | unsigned long chunk_count, nr_chunk; | ||
102 | unsigned long mask, pfn, pageno, start = 0; | ||
103 | |||
104 | |||
105 | if (!cma || !cma->count) | ||
106 | return NULL; | ||
107 | |||
108 | pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__, | ||
109 | (void *)cma, nr_pages, align_pages); | ||
110 | |||
111 | if (!nr_pages) | ||
112 | return NULL; | ||
113 | /* | ||
114 | * align mask with chunk size. The bit tracks pages in chunk size | ||
115 | */ | ||
116 | VM_BUG_ON(!is_power_of_2(align_pages)); | ||
117 | mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1; | ||
118 | BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER); | ||
119 | |||
120 | chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); | ||
121 | nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); | ||
122 | |||
123 | mutex_lock(&kvm_cma_mutex); | ||
124 | for (;;) { | ||
125 | pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count, | ||
126 | start, nr_chunk, mask); | ||
127 | if (pageno >= chunk_count) | ||
128 | break; | ||
129 | |||
130 | pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)); | ||
131 | ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA); | ||
132 | if (ret == 0) { | ||
133 | bitmap_set(cma->bitmap, pageno, nr_chunk); | ||
134 | page = pfn_to_page(pfn); | ||
135 | memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT); | ||
136 | break; | ||
137 | } else if (ret != -EBUSY) { | ||
138 | break; | ||
139 | } | ||
140 | pr_debug("%s(): memory range at %p is busy, retrying\n", | ||
141 | __func__, pfn_to_page(pfn)); | ||
142 | /* try again with a bit different memory target */ | ||
143 | start = pageno + mask + 1; | ||
144 | } | ||
145 | mutex_unlock(&kvm_cma_mutex); | ||
146 | pr_debug("%s(): returned %p\n", __func__, page); | ||
147 | return page; | ||
148 | } | ||
149 | |||
150 | /** | ||
151 | * kvm_release_cma() - release allocated pages for hash pagetable | ||
152 | * @pages: Allocated pages. | ||
153 | * @nr_pages: Number of allocated pages. | ||
154 | * | ||
155 | * This function releases memory allocated by kvm_alloc_cma(). | ||
156 | * It returns false when the provided pages do not belong to the contiguous area and | ||
157 | * true otherwise. | ||
158 | */ | ||
159 | bool kvm_release_cma(struct page *pages, unsigned long nr_pages) | ||
160 | { | ||
161 | unsigned long pfn; | ||
162 | unsigned long nr_chunk; | ||
163 | struct kvm_cma *cma = &kvm_cma_area; | ||
164 | |||
165 | if (!cma || !pages) | ||
166 | return false; | ||
167 | |||
168 | pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages); | ||
169 | |||
170 | pfn = page_to_pfn(pages); | ||
171 | |||
172 | if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) | ||
173 | return false; | ||
174 | |||
175 | VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count); | ||
176 | nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); | ||
177 | |||
178 | mutex_lock(&kvm_cma_mutex); | ||
179 | bitmap_clear(cma->bitmap, | ||
180 | (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT), | ||
181 | nr_chunk); | ||
182 | free_contig_range(pfn, nr_pages); | ||
183 | mutex_unlock(&kvm_cma_mutex); | ||
184 | |||
185 | return true; | ||
186 | } | ||
187 | |||
188 | static int __init kvm_cma_activate_area(unsigned long base_pfn, | ||
189 | unsigned long count) | ||
190 | { | ||
191 | unsigned long pfn = base_pfn; | ||
192 | unsigned i = count >> pageblock_order; | ||
193 | struct zone *zone; | ||
194 | |||
195 | WARN_ON_ONCE(!pfn_valid(pfn)); | ||
196 | zone = page_zone(pfn_to_page(pfn)); | ||
197 | do { | ||
198 | unsigned j; | ||
199 | base_pfn = pfn; | ||
200 | for (j = pageblock_nr_pages; j; --j, pfn++) { | ||
201 | WARN_ON_ONCE(!pfn_valid(pfn)); | ||
202 | /* | ||
203 | * alloc_contig_range requires the pfn range | ||
204 | * specified to be in the same zone. Make this | ||
205 | * simple by forcing the entire CMA resv range | ||
206 | * to be in the same zone. | ||
207 | */ | ||
208 | if (page_zone(pfn_to_page(pfn)) != zone) | ||
209 | return -EINVAL; | ||
210 | } | ||
211 | init_cma_reserved_pageblock(pfn_to_page(base_pfn)); | ||
212 | } while (--i); | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | static int __init kvm_cma_init_reserved_areas(void) | ||
217 | { | ||
218 | int bitmap_size, ret; | ||
219 | unsigned long chunk_count; | ||
220 | struct kvm_cma *cma = &kvm_cma_area; | ||
221 | |||
222 | pr_debug("%s()\n", __func__); | ||
223 | if (!cma->count) | ||
224 | return 0; | ||
225 | chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); | ||
226 | bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long); | ||
227 | cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); | ||
228 | if (!cma->bitmap) | ||
229 | return -ENOMEM; | ||
230 | |||
231 | ret = kvm_cma_activate_area(cma->base_pfn, cma->count); | ||
232 | if (ret) | ||
233 | goto error; | ||
234 | return 0; | ||
235 | |||
236 | error: | ||
237 | kfree(cma->bitmap); | ||
238 | return ret; | ||
239 | } | ||
240 | core_initcall(kvm_cma_init_reserved_areas); | ||
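This private copy of CMA tracks the reserved range in 256K chunks (KVM_CMA_CHUNK_ORDER) rather than individual pages, which keeps the bitmap small: page counts and alignments are shifted down by (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT) before searching for a free run. A toy userspace version of that chunk-granular search, with a byte-per-chunk map standing in for bitmap_find_next_zero_area() and alloc_contig_range(), is sketched below.

#include <stdio.h>
#include <string.h>

#define NCHUNKS 32   /* toy area: 32 chunks of 256K each */

static unsigned char used[NCHUNKS];   /* one byte per chunk, 0 = free */

/*
 * Find nr_chunk free chunks starting at a multiple of 'align' chunks;
 * return the first chunk index, or -1 if nothing fits.
 */
static int alloc_chunks(int nr_chunk, int align)
{
        for (int start = 0; start + nr_chunk <= NCHUNKS; start += align) {
                int i;

                for (i = 0; i < nr_chunk && !used[start + i]; i++)
                        ;
                if (i == nr_chunk) {
                        memset(&used[start], 1, nr_chunk);
                        return start;
                }
        }
        return -1;
}

static void release_chunks(int start, int nr_chunk)
{
        memset(&used[start], 0, nr_chunk);
}

int main(void)
{
        int a = alloc_chunks(4, 4);   /* e.g. a 1MB HPT, 1MB aligned */
        int b = alloc_chunks(8, 8);   /* e.g. a 2MB allocation */

        printf("a=%d b=%d\n", a, b);  /* a=0 b=8 */
        release_chunks(a, 4);
        release_chunks(b, 8);
        return 0;
}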
diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h new file mode 100644 index 000000000000..655144f75fa5 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_cma.h | |||
@@ -0,0 +1,27 @@ | |||
1 | /* | ||
2 | * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA | ||
3 | * for DMA mapping framework | ||
4 | * | ||
5 | * Copyright IBM Corporation, 2013 | ||
6 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public License as | ||
10 | * published by the Free Software Foundation; either version 2 of the | ||
11 | * License or (at your option) any later version of the license. | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | #ifndef __POWERPC_KVM_CMA_ALLOC_H__ | ||
16 | #define __POWERPC_KVM_CMA_ALLOC_H__ | ||
17 | /* | ||
18 | * Both RMA and Hash page allocation will be multiple of 256K. | ||
19 | */ | ||
20 | #define KVM_CMA_CHUNK_ORDER 18 | ||
21 | |||
22 | extern struct page *kvm_alloc_cma(unsigned long nr_pages, | ||
23 | unsigned long align_pages); | ||
24 | extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages); | ||
25 | extern long kvm_cma_declare_contiguous(phys_addr_t size, | ||
26 | phys_addr_t alignment) __init; | ||
27 | #endif | ||
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index fc25689a9f35..45e30d6e462b 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c | |||
@@ -383,6 +383,80 @@ static inline int try_lock_tlbie(unsigned int *lock) | |||
383 | return old == 0; | 383 | return old == 0; |
384 | } | 384 | } |
385 | 385 | ||
386 | /* | ||
387 | * tlbie/tlbiel is a bit different on the PPC970 compared to later | ||
388 | * processors such as POWER7; the large page bit is in the instruction | ||
389 | * not RB, and the top 16 bits and the bottom 12 bits of the VA | ||
390 | * in RB must be 0. | ||
391 | */ | ||
392 | static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues, | ||
393 | long npages, int global, bool need_sync) | ||
394 | { | ||
395 | long i; | ||
396 | |||
397 | if (global) { | ||
398 | while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) | ||
399 | cpu_relax(); | ||
400 | if (need_sync) | ||
401 | asm volatile("ptesync" : : : "memory"); | ||
402 | for (i = 0; i < npages; ++i) { | ||
403 | unsigned long rb = rbvalues[i]; | ||
404 | |||
405 | if (rb & 1) /* large page */ | ||
406 | asm volatile("tlbie %0,1" : : | ||
407 | "r" (rb & 0x0000fffffffff000ul)); | ||
408 | else | ||
409 | asm volatile("tlbie %0,0" : : | ||
410 | "r" (rb & 0x0000fffffffff000ul)); | ||
411 | } | ||
412 | asm volatile("eieio; tlbsync; ptesync" : : : "memory"); | ||
413 | kvm->arch.tlbie_lock = 0; | ||
414 | } else { | ||
415 | if (need_sync) | ||
416 | asm volatile("ptesync" : : : "memory"); | ||
417 | for (i = 0; i < npages; ++i) { | ||
418 | unsigned long rb = rbvalues[i]; | ||
419 | |||
420 | if (rb & 1) /* large page */ | ||
421 | asm volatile("tlbiel %0,1" : : | ||
422 | "r" (rb & 0x0000fffffffff000ul)); | ||
423 | else | ||
424 | asm volatile("tlbiel %0,0" : : | ||
425 | "r" (rb & 0x0000fffffffff000ul)); | ||
426 | } | ||
427 | asm volatile("ptesync" : : : "memory"); | ||
428 | } | ||
429 | } | ||
430 | |||
431 | static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, | ||
432 | long npages, int global, bool need_sync) | ||
433 | { | ||
434 | long i; | ||
435 | |||
436 | if (cpu_has_feature(CPU_FTR_ARCH_201)) { | ||
437 | /* PPC970 tlbie instruction is a bit different */ | ||
438 | do_tlbies_970(kvm, rbvalues, npages, global, need_sync); | ||
439 | return; | ||
440 | } | ||
441 | if (global) { | ||
442 | while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) | ||
443 | cpu_relax(); | ||
444 | if (need_sync) | ||
445 | asm volatile("ptesync" : : : "memory"); | ||
446 | for (i = 0; i < npages; ++i) | ||
447 | asm volatile(PPC_TLBIE(%1,%0) : : | ||
448 | "r" (rbvalues[i]), "r" (kvm->arch.lpid)); | ||
449 | asm volatile("eieio; tlbsync; ptesync" : : : "memory"); | ||
450 | kvm->arch.tlbie_lock = 0; | ||
451 | } else { | ||
452 | if (need_sync) | ||
453 | asm volatile("ptesync" : : : "memory"); | ||
454 | for (i = 0; i < npages; ++i) | ||
455 | asm volatile("tlbiel %0" : : "r" (rbvalues[i])); | ||
456 | asm volatile("ptesync" : : : "memory"); | ||
457 | } | ||
458 | } | ||
459 | |||
386 | long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, | 460 | long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, |
387 | unsigned long pte_index, unsigned long avpn, | 461 | unsigned long pte_index, unsigned long avpn, |
388 | unsigned long *hpret) | 462 | unsigned long *hpret) |
@@ -408,19 +482,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, | |||
408 | if (v & HPTE_V_VALID) { | 482 | if (v & HPTE_V_VALID) { |
409 | hpte[0] &= ~HPTE_V_VALID; | 483 | hpte[0] &= ~HPTE_V_VALID; |
410 | rb = compute_tlbie_rb(v, hpte[1], pte_index); | 484 | rb = compute_tlbie_rb(v, hpte[1], pte_index); |
411 | if (global_invalidates(kvm, flags)) { | 485 | do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); |
412 | while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) | ||
413 | cpu_relax(); | ||
414 | asm volatile("ptesync" : : : "memory"); | ||
415 | asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" | ||
416 | : : "r" (rb), "r" (kvm->arch.lpid)); | ||
417 | asm volatile("ptesync" : : : "memory"); | ||
418 | kvm->arch.tlbie_lock = 0; | ||
419 | } else { | ||
420 | asm volatile("ptesync" : : : "memory"); | ||
421 | asm volatile("tlbiel %0" : : "r" (rb)); | ||
422 | asm volatile("ptesync" : : : "memory"); | ||
423 | } | ||
424 | /* Read PTE low word after tlbie to get final R/C values */ | 486 | /* Read PTE low word after tlbie to get final R/C values */ |
425 | remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); | 487 | remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); |
426 | } | 488 | } |
@@ -448,12 +510,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) | |||
448 | unsigned long *hp, *hptes[4], tlbrb[4]; | 510 | unsigned long *hp, *hptes[4], tlbrb[4]; |
449 | long int i, j, k, n, found, indexes[4]; | 511 | long int i, j, k, n, found, indexes[4]; |
450 | unsigned long flags, req, pte_index, rcbits; | 512 | unsigned long flags, req, pte_index, rcbits; |
451 | long int local = 0; | 513 | int global; |
452 | long int ret = H_SUCCESS; | 514 | long int ret = H_SUCCESS; |
453 | struct revmap_entry *rev, *revs[4]; | 515 | struct revmap_entry *rev, *revs[4]; |
454 | 516 | ||
455 | if (atomic_read(&kvm->online_vcpus) == 1) | 517 | global = global_invalidates(kvm, 0); |
456 | local = 1; | ||
457 | for (i = 0; i < 4 && ret == H_SUCCESS; ) { | 518 | for (i = 0; i < 4 && ret == H_SUCCESS; ) { |
458 | n = 0; | 519 | n = 0; |
459 | for (; i < 4; ++i) { | 520 | for (; i < 4; ++i) { |
@@ -529,22 +590,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) | |||
529 | break; | 590 | break; |
530 | 591 | ||
531 | /* Now that we've collected a batch, do the tlbies */ | 592 | /* Now that we've collected a batch, do the tlbies */ |
532 | if (!local) { | 593 | do_tlbies(kvm, tlbrb, n, global, true); |
533 | while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) | ||
534 | cpu_relax(); | ||
535 | asm volatile("ptesync" : : : "memory"); | ||
536 | for (k = 0; k < n; ++k) | ||
537 | asm volatile(PPC_TLBIE(%1,%0) : : | ||
538 | "r" (tlbrb[k]), | ||
539 | "r" (kvm->arch.lpid)); | ||
540 | asm volatile("eieio; tlbsync; ptesync" : : : "memory"); | ||
541 | kvm->arch.tlbie_lock = 0; | ||
542 | } else { | ||
543 | asm volatile("ptesync" : : : "memory"); | ||
544 | for (k = 0; k < n; ++k) | ||
545 | asm volatile("tlbiel %0" : : "r" (tlbrb[k])); | ||
546 | asm volatile("ptesync" : : : "memory"); | ||
547 | } | ||
548 | 594 | ||
549 | /* Read PTE low words after tlbie to get final R/C values */ | 595 | /* Read PTE low words after tlbie to get final R/C values */ |
550 | for (k = 0; k < n; ++k) { | 596 | for (k = 0; k < n; ++k) { |
@@ -603,19 +649,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, | |||
603 | if (v & HPTE_V_VALID) { | 649 | if (v & HPTE_V_VALID) { |
604 | rb = compute_tlbie_rb(v, r, pte_index); | 650 | rb = compute_tlbie_rb(v, r, pte_index); |
605 | hpte[0] = v & ~HPTE_V_VALID; | 651 | hpte[0] = v & ~HPTE_V_VALID; |
606 | if (global_invalidates(kvm, flags)) { | 652 | do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); |
607 | while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) | ||
608 | cpu_relax(); | ||
609 | asm volatile("ptesync" : : : "memory"); | ||
610 | asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" | ||
611 | : : "r" (rb), "r" (kvm->arch.lpid)); | ||
612 | asm volatile("ptesync" : : : "memory"); | ||
613 | kvm->arch.tlbie_lock = 0; | ||
614 | } else { | ||
615 | asm volatile("ptesync" : : : "memory"); | ||
616 | asm volatile("tlbiel %0" : : "r" (rb)); | ||
617 | asm volatile("ptesync" : : : "memory"); | ||
618 | } | ||
619 | /* | 653 | /* |
620 | * If the host has this page as readonly but the guest | 654 | * If the host has this page as readonly but the guest |
621 | * wants to make it read/write, reduce the permissions. | 655 | * wants to make it read/write, reduce the permissions. |
@@ -686,13 +720,7 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, | |||
686 | 720 | ||
687 | hptep[0] &= ~HPTE_V_VALID; | 721 | hptep[0] &= ~HPTE_V_VALID; |
688 | rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); | 722 | rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); |
689 | while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) | 723 | do_tlbies(kvm, &rb, 1, 1, true); |
690 | cpu_relax(); | ||
691 | asm volatile("ptesync" : : : "memory"); | ||
692 | asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" | ||
693 | : : "r" (rb), "r" (kvm->arch.lpid)); | ||
694 | asm volatile("ptesync" : : : "memory"); | ||
695 | kvm->arch.tlbie_lock = 0; | ||
696 | } | 724 | } |
697 | EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); | 725 | EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); |
698 | 726 | ||
@@ -706,12 +734,7 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, | |||
706 | rbyte = (hptep[1] & ~HPTE_R_R) >> 8; | 734 | rbyte = (hptep[1] & ~HPTE_R_R) >> 8; |
707 | /* modify only the second-last byte, which contains the ref bit */ | 735 | /* modify only the second-last byte, which contains the ref bit */ |
708 | *((char *)hptep + 14) = rbyte; | 736 | *((char *)hptep + 14) = rbyte; |
709 | while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) | 737 | do_tlbies(kvm, &rb, 1, 1, false); |
710 | cpu_relax(); | ||
711 | asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" | ||
712 | : : "r" (rb), "r" (kvm->arch.lpid)); | ||
713 | asm volatile("ptesync" : : : "memory"); | ||
714 | kvm->arch.tlbie_lock = 0; | ||
715 | } | 738 | } |
716 | EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); | 739 | EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); |
717 | 740 | ||
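Folding the four open-coded invalidation sequences into do_tlbies() keeps the locking and ordering rules in one place: a global invalidation takes kvm->arch.tlbie_lock and finishes with eieio; tlbsync; ptesync, a local one just brackets tlbiel with ptesync, and PPC970 gets its own variant because its tlbie encodes the large-page bit in the instruction rather than in RB. With the assembly replaced by printing stubs, the control flow reduces to roughly the following; this is purely an illustration of the structure, not something that touches real TLB state.

#include <stdbool.h>
#include <stdio.h>

/* Printing placeholders for what are really tlbie/tlbiel/sync instructions. */
static void ptesync(void)               { puts("ptesync"); }
static void tlbie(unsigned long rb)     { printf("tlbie  %#lx\n", rb); }
static void tlbiel(unsigned long rb)    { printf("tlbiel %#lx\n", rb); }
static void eieio_tlbsync_ptesync(void) { puts("eieio; tlbsync; ptesync"); }

static int tlbie_lock;   /* stands in for kvm->arch.tlbie_lock */

static void do_tlbies(unsigned long *rb, long n, bool global, bool need_sync)
{
        long i;

        if (global) {
                while (__sync_lock_test_and_set(&tlbie_lock, 1))
                        ;   /* spin, like try_lock_tlbie() */
                if (need_sync)
                        ptesync();
                for (i = 0; i < n; i++)
                        tlbie(rb[i]);
                eieio_tlbsync_ptesync();
                __sync_lock_release(&tlbie_lock);
        } else {
                if (need_sync)
                        ptesync();
                for (i = 0; i < n; i++)
                        tlbiel(rb[i]);
                ptesync();
        }
}

int main(void)
{
        unsigned long rb[2] = { 0x1000, 0x2000 };

        do_tlbies(rb, 2, true, true);    /* global flush of two entries */
        do_tlbies(rb, 1, false, true);   /* local flush of one entry */
        return 0;
}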
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index b02f91e4c70d..60dce5bfab3f 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S | |||
@@ -1381,7 +1381,7 @@ hcall_try_real_mode: | |||
1381 | cmpldi r3,hcall_real_table_end - hcall_real_table | 1381 | cmpldi r3,hcall_real_table_end - hcall_real_table |
1382 | bge guest_exit_cont | 1382 | bge guest_exit_cont |
1383 | LOAD_REG_ADDR(r4, hcall_real_table) | 1383 | LOAD_REG_ADDR(r4, hcall_real_table) |
1384 | lwzx r3,r3,r4 | 1384 | lwax r3,r3,r4 |
1385 | cmpwi r3,0 | 1385 | cmpwi r3,0 |
1386 | beq guest_exit_cont | 1386 | beq guest_exit_cont |
1387 | add r3,r3,r4 | 1387 | add r3,r3,r4 |
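The lwzx-to-lwax change matters because hcall_real_table holds 32-bit offsets relative to the start of the table, and a handler placed before the table has a negative offset: lwzx zero-extends the load, turning that negative offset into a huge positive value before the add, while lwax sign-extends it and keeps the arithmetic correct. The same distinction in C, with a made-up table address:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int32_t offset = -0x400;   /* handler located before the table */

        uint64_t zero_extended = (uint64_t)(uint32_t)offset;  /* lwzx-like */
        int64_t  sign_extended = (int64_t)offset;             /* lwax-like */

        uint64_t table = 0xc000000000800000ULL;               /* made-up base */

        printf("lwzx target: %#llx\n",   /* wrong: lands far beyond the table */
               (unsigned long long)(table + zero_extended));
        printf("lwax target: %#llx\n",   /* right: 0x400 before the table */
               (unsigned long long)(table + (uint64_t)sign_extended));
        return 0;
}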
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 48cbbf862958..17cfae5497a3 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S | |||
@@ -92,6 +92,11 @@ kvm_start_lightweight: | |||
92 | PPC_LL r3, VCPU_HFLAGS(r4) | 92 | PPC_LL r3, VCPU_HFLAGS(r4) |
93 | rldicl r3, r3, 0, 63 /* r3 &= 1 */ | 93 | rldicl r3, r3, 0, 63 /* r3 &= 1 */ |
94 | stb r3, HSTATE_RESTORE_HID5(r13) | 94 | stb r3, HSTATE_RESTORE_HID5(r13) |
95 | |||
96 | /* Load up guest SPRG3 value, since it's user readable */ | ||
97 | ld r3, VCPU_SHARED(r4) | ||
98 | ld r3, VCPU_SHARED_SPRG3(r3) | ||
99 | mtspr SPRN_SPRG3, r3 | ||
95 | #endif /* CONFIG_PPC_BOOK3S_64 */ | 100 | #endif /* CONFIG_PPC_BOOK3S_64 */ |
96 | 101 | ||
97 | PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ | 102 | PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ |
@@ -123,6 +128,15 @@ kvmppc_handler_highmem: | |||
123 | /* R7 = vcpu */ | 128 | /* R7 = vcpu */ |
124 | PPC_LL r7, GPR4(r1) | 129 | PPC_LL r7, GPR4(r1) |
125 | 130 | ||
131 | #ifdef CONFIG_PPC_BOOK3S_64 | ||
132 | /* | ||
133 | * Reload kernel SPRG3 value. | ||
134 | * No need to save guest value as usermode can't modify SPRG3. | ||
135 | */ | ||
136 | ld r3, PACA_SPRG3(r13) | ||
137 | mtspr SPRN_SPRG3, r3 | ||
138 | #endif /* CONFIG_PPC_BOOK3S_64 */ | ||
139 | |||
126 | PPC_STL r14, VCPU_GPR(R14)(r7) | 140 | PPC_STL r14, VCPU_GPR(R14)(r7) |
127 | PPC_STL r15, VCPU_GPR(R15)(r7) | 141 | PPC_STL r15, VCPU_GPR(R15)(r7) |
128 | PPC_STL r16, VCPU_GPR(R16)(r7) | 142 | PPC_STL r16, VCPU_GPR(R16)(r7) |
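Guest SPRG3 is loaded on the way into the guest because user mode can read that register (the ppc64 vDSO getcpu path relies on it), and the host value is restored from PACA_SPRG3 on the way out; the guest value never needs saving since problem-state code cannot write SPRG3. The save/restore is deliberately asymmetric, roughly as in this illustrative C rendition of the assembly above:

#include <stdio.h>

static unsigned long sprg3;   /* stands in for the hardware SPR */

static void enter_guest(unsigned long guest_sprg3)
{
        /* Load the guest value; nothing to save, since guest user space
         * can only read SPRG3, never modify it. */
        sprg3 = guest_sprg3;
}

static void exit_guest(unsigned long host_sprg3_from_paca)
{
        /* The host value comes back from the per-cpu area (PACA_SPRG3). */
        sprg3 = host_sprg3_from_paca;
}

int main(void)
{
        unsigned long host_value = 0x11;   /* made-up host SPRG3 contents */

        enter_guest(0x22);
        printf("in guest:  sprg3=%#lx\n", sprg3);
        exit_guest(host_value);
        printf("back host: sprg3=%#lx\n", sprg3);
        return 0;
}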
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index c6e13d9a9e15..27db1e665959 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c | |||
@@ -468,7 +468,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) | |||
468 | * both the traditional FP registers and the added VSX | 468 | * both the traditional FP registers and the added VSX |
469 | * registers into thread.fpr[]. | 469 | * registers into thread.fpr[]. |
470 | */ | 470 | */ |
471 | giveup_fpu(current); | 471 | if (current->thread.regs->msr & MSR_FP) |
472 | giveup_fpu(current); | ||
472 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) | 473 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) |
473 | vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; | 474 | vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; |
474 | 475 | ||
@@ -483,7 +484,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) | |||
483 | 484 | ||
484 | #ifdef CONFIG_ALTIVEC | 485 | #ifdef CONFIG_ALTIVEC |
485 | if (msr & MSR_VEC) { | 486 | if (msr & MSR_VEC) { |
486 | giveup_altivec(current); | 487 | if (current->thread.regs->msr & MSR_VEC) |
488 | giveup_altivec(current); | ||
487 | memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); | 489 | memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); |
488 | vcpu->arch.vscr = t->vscr; | 490 | vcpu->arch.vscr = t->vscr; |
489 | } | 491 | } |
@@ -575,8 +577,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, | |||
575 | printk(KERN_INFO "Loading up ext 0x%lx\n", msr); | 577 | printk(KERN_INFO "Loading up ext 0x%lx\n", msr); |
576 | #endif | 578 | #endif |
577 | 579 | ||
578 | current->thread.regs->msr |= msr; | ||
579 | |||
580 | if (msr & MSR_FP) { | 580 | if (msr & MSR_FP) { |
581 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) | 581 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) |
582 | thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; | 582 | thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; |
@@ -598,12 +598,32 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, | |||
598 | #endif | 598 | #endif |
599 | } | 599 | } |
600 | 600 | ||
601 | current->thread.regs->msr |= msr; | ||
601 | vcpu->arch.guest_owned_ext |= msr; | 602 | vcpu->arch.guest_owned_ext |= msr; |
602 | kvmppc_recalc_shadow_msr(vcpu); | 603 | kvmppc_recalc_shadow_msr(vcpu); |
603 | 604 | ||
604 | return RESUME_GUEST; | 605 | return RESUME_GUEST; |
605 | } | 606 | } |
606 | 607 | ||
608 | /* | ||
609 | * Kernel code using FP or VMX could have flushed guest state to | ||
610 | * the thread_struct; if so, get it back now. | ||
611 | */ | ||
612 | static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu) | ||
613 | { | ||
614 | unsigned long lost_ext; | ||
615 | |||
616 | lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr; | ||
617 | if (!lost_ext) | ||
618 | return; | ||
619 | |||
620 | if (lost_ext & MSR_FP) | ||
621 | kvmppc_load_up_fpu(); | ||
622 | if (lost_ext & MSR_VEC) | ||
623 | kvmppc_load_up_altivec(); | ||
624 | current->thread.regs->msr |= lost_ext; | ||
625 | } | ||
626 | |||
607 | int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | 627 | int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, |
608 | unsigned int exit_nr) | 628 | unsigned int exit_nr) |
609 | { | 629 | { |
@@ -772,7 +792,7 @@ program_interrupt: | |||
772 | } | 792 | } |
773 | case BOOK3S_INTERRUPT_SYSCALL: | 793 | case BOOK3S_INTERRUPT_SYSCALL: |
774 | if (vcpu->arch.papr_enabled && | 794 | if (vcpu->arch.papr_enabled && |
775 | (kvmppc_get_last_inst(vcpu) == 0x44000022) && | 795 | (kvmppc_get_last_sc(vcpu) == 0x44000022) && |
776 | !(vcpu->arch.shared->msr & MSR_PR)) { | 796 | !(vcpu->arch.shared->msr & MSR_PR)) { |
777 | /* SC 1 papr hypercalls */ | 797 | /* SC 1 papr hypercalls */ |
778 | ulong cmd = kvmppc_get_gpr(vcpu, 3); | 798 | ulong cmd = kvmppc_get_gpr(vcpu, 3); |
@@ -890,8 +910,9 @@ program_interrupt: | |||
890 | local_irq_enable(); | 910 | local_irq_enable(); |
891 | r = s; | 911 | r = s; |
892 | } else { | 912 | } else { |
893 | kvmppc_lazy_ee_enable(); | 913 | kvmppc_fix_ee_before_entry(); |
894 | } | 914 | } |
915 | kvmppc_handle_lost_ext(vcpu); | ||
895 | } | 916 | } |
896 | 917 | ||
897 | trace_kvm_book3s_reenter(r, vcpu); | 918 | trace_kvm_book3s_reenter(r, vcpu); |
@@ -1162,7 +1183,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
1162 | if (vcpu->arch.shared->msr & MSR_FP) | 1183 | if (vcpu->arch.shared->msr & MSR_FP) |
1163 | kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); | 1184 | kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); |
1164 | 1185 | ||
1165 | kvmppc_lazy_ee_enable(); | 1186 | kvmppc_fix_ee_before_entry(); |
1166 | 1187 | ||
1167 | ret = __kvmppc_vcpu_run(kvm_run, vcpu); | 1188 | ret = __kvmppc_vcpu_run(kvm_run, vcpu); |
1168 | 1189 | ||
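
kvmppc_handle_lost_ext() above reconciles vcpu->arch.guest_owned_ext with the thread's live MSR after host kernel code may have used FP or VMX behind the guest's back. A self-contained sketch of that bookkeeping, with stand-in types and illustrative bit values rather than the real MSR layout:

#include <stdio.h>

#define MSR_FP  (1u << 13)   /* illustrative bit positions */
#define MSR_VEC (1u << 25)

struct vcpu_model {
	unsigned int guest_owned_ext;    /* facilities the guest logically owns */
};

struct thread_model {
	unsigned int msr;                /* facilities currently live on the CPU */
};

/* If the host used FP/VMX since guest entry, the live bits were cleared;
 * reload whatever the guest still logically owns. */
static void handle_lost_ext(struct vcpu_model *vcpu, struct thread_model *t)
{
	unsigned int lost = vcpu->guest_owned_ext & ~t->msr;

	if (!lost)
		return;
	if (lost & MSR_FP)
		printf("reload FP state for the guest\n");
	if (lost & MSR_VEC)
		printf("reload Altivec state for the guest\n");
	t->msr |= lost;
}

int main(void)
{
	struct vcpu_model vcpu = { .guest_owned_ext = MSR_FP | MSR_VEC };
	struct thread_model t = { .msr = MSR_FP };   /* host code dropped VEC */

	handle_lost_ext(&vcpu, &t);
	return 0;
}
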
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 94c1dd46b83d..a3a5cb8ee7ea 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <asm/hvcall.h> | 19 | #include <asm/hvcall.h> |
20 | #include <asm/xics.h> | 20 | #include <asm/xics.h> |
21 | #include <asm/debug.h> | 21 | #include <asm/debug.h> |
22 | #include <asm/time.h> | ||
22 | 23 | ||
23 | #include <linux/debugfs.h> | 24 | #include <linux/debugfs.h> |
24 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index dcc94f016007..17722d82f1d1 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c | |||
@@ -674,8 +674,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
674 | goto out; | 674 | goto out; |
675 | } | 675 | } |
676 | 676 | ||
677 | kvm_guest_enter(); | ||
678 | |||
679 | #ifdef CONFIG_PPC_FPU | 677 | #ifdef CONFIG_PPC_FPU |
680 | /* Save userspace FPU state in stack */ | 678 | /* Save userspace FPU state in stack */ |
681 | enable_kernel_fp(); | 679 | enable_kernel_fp(); |
@@ -698,7 +696,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
698 | kvmppc_load_guest_fp(vcpu); | 696 | kvmppc_load_guest_fp(vcpu); |
699 | #endif | 697 | #endif |
700 | 698 | ||
701 | kvmppc_lazy_ee_enable(); | 699 | kvmppc_fix_ee_before_entry(); |
702 | 700 | ||
703 | ret = __kvmppc_vcpu_run(kvm_run, vcpu); | 701 | ret = __kvmppc_vcpu_run(kvm_run, vcpu); |
704 | 702 | ||
@@ -1168,7 +1166,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, | |||
1168 | local_irq_enable(); | 1166 | local_irq_enable(); |
1169 | r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); | 1167 | r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); |
1170 | } else { | 1168 | } else { |
1171 | kvmppc_lazy_ee_enable(); | 1169 | kvmppc_fix_ee_before_entry(); |
1172 | } | 1170 | } |
1173 | } | 1171 | } |
1174 | 1172 | ||
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6316ee336e88..f55e14cd1762 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c | |||
@@ -117,8 +117,6 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) | |||
117 | kvm_guest_exit(); | 117 | kvm_guest_exit(); |
118 | continue; | 118 | continue; |
119 | } | 119 | } |
120 | |||
121 | trace_hardirqs_on(); | ||
122 | #endif | 120 | #endif |
123 | 121 | ||
124 | kvm_guest_enter(); | 122 | kvm_guest_enter(); |
@@ -420,6 +418,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | |||
420 | return kvmppc_core_create_memslot(slot, npages); | 418 | return kvmppc_core_create_memslot(slot, npages); |
421 | } | 419 | } |
422 | 420 | ||
421 | void kvm_arch_memslots_updated(struct kvm *kvm) | ||
422 | { | ||
423 | } | ||
424 | |||
423 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 425 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
424 | struct kvm_memory_slot *memslot, | 426 | struct kvm_memory_slot *memslot, |
425 | struct kvm_userspace_memory_region *mem, | 427 | struct kvm_userspace_memory_region *mem, |
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3238d4004e84..e87ecaa2c569 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h | |||
@@ -274,6 +274,14 @@ struct kvm_arch{ | |||
274 | int css_support; | 274 | int css_support; |
275 | }; | 275 | }; |
276 | 276 | ||
277 | #define KVM_HVA_ERR_BAD (-1UL) | ||
278 | #define KVM_HVA_ERR_RO_BAD (-2UL) | ||
279 | |||
280 | static inline bool kvm_is_error_hva(unsigned long addr) | ||
281 | { | ||
282 | return IS_ERR_VALUE(addr); | ||
283 | } | ||
284 | |||
277 | extern int sie64a(struct kvm_s390_sie_block *, u64 *); | 285 | extern int sie64a(struct kvm_s390_sie_block *, u64 *); |
278 | extern char sie_exit; | 286 | extern char sie_exit; |
279 | #endif | 287 | #endif |
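
The s390 hunk above defines KVM_HVA_ERR_BAD and KVM_HVA_ERR_RO_BAD as the topmost addresses and lets kvm_is_error_hva() reuse the IS_ERR_VALUE() test. A standalone sketch of how such sentinel addresses are told apart from ordinary ones; the 4095-byte error window mirrors the usual IS_ERR_VALUE convention and is stated here as an assumption:

#include <stdio.h>

#define HVA_ERR_BAD     (-1UL)
#define HVA_ERR_RO_BAD  (-2UL)
#define MAX_ERRNO       4095

static int hva_is_error(unsigned long addr)
{
	/* anything in the last page of the address space counts as an error */
	return addr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	printf("%d %d %d\n",
	       hva_is_error(HVA_ERR_BAD),
	       hva_is_error(HVA_ERR_RO_BAD),
	       hva_is_error(0x7f0000000000UL));
	return 0;
}
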
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 6340178748bf..ff132ac64ddd 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h | |||
@@ -12,8 +12,6 @@ typedef struct { | |||
12 | unsigned long asce_bits; | 12 | unsigned long asce_bits; |
13 | unsigned long asce_limit; | 13 | unsigned long asce_limit; |
14 | unsigned long vdso_base; | 14 | unsigned long vdso_base; |
15 | /* Cloned contexts will be created with extended page tables. */ | ||
16 | unsigned int alloc_pgste:1; | ||
17 | /* The mmu context has extended page tables. */ | 15 | /* The mmu context has extended page tables. */ |
18 | unsigned int has_pgste:1; | 16 | unsigned int has_pgste:1; |
19 | } mm_context_t; | 17 | } mm_context_t; |
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 7b7fce4e8469..9f973d8de90e 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h | |||
@@ -21,24 +21,7 @@ static inline int init_new_context(struct task_struct *tsk, | |||
21 | #ifdef CONFIG_64BIT | 21 | #ifdef CONFIG_64BIT |
22 | mm->context.asce_bits |= _ASCE_TYPE_REGION3; | 22 | mm->context.asce_bits |= _ASCE_TYPE_REGION3; |
23 | #endif | 23 | #endif |
24 | if (current->mm && current->mm->context.alloc_pgste) { | 24 | mm->context.has_pgste = 0; |
25 | /* | ||
26 | * alloc_pgste indicates, that any NEW context will be created | ||
27 | * with extended page tables. The old context is unchanged. The | ||
28 | * page table allocation and the page table operations will | ||
29 | * look at has_pgste to distinguish normal and extended page | ||
30 | * tables. The only way to create extended page tables is to | ||
31 | * set alloc_pgste and then create a new context (e.g. dup_mm). | ||
32 | * The page table allocation is called after init_new_context | ||
33 | * and if has_pgste is set, it will create extended page | ||
34 | * tables. | ||
35 | */ | ||
36 | mm->context.has_pgste = 1; | ||
37 | mm->context.alloc_pgste = 1; | ||
38 | } else { | ||
39 | mm->context.has_pgste = 0; | ||
40 | mm->context.alloc_pgste = 0; | ||
41 | } | ||
42 | mm->context.asce_limit = STACK_TOP_MAX; | 25 | mm->context.asce_limit = STACK_TOP_MAX; |
43 | crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); | 26 | crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); |
44 | return 0; | 27 | return 0; |
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9f215b40109e..9b60a36c348d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h | |||
@@ -1442,6 +1442,17 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) | |||
1442 | } | 1442 | } |
1443 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ | 1443 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ |
1444 | 1444 | ||
1445 | static inline void pmdp_flush_lazy(struct mm_struct *mm, | ||
1446 | unsigned long address, pmd_t *pmdp) | ||
1447 | { | ||
1448 | int active = (mm == current->active_mm) ? 1 : 0; | ||
1449 | |||
1450 | if ((atomic_read(&mm->context.attach_count) & 0xffff) > active) | ||
1451 | __pmd_idte(address, pmdp); | ||
1452 | else | ||
1453 | mm->context.flush_mm = 1; | ||
1454 | } | ||
1455 | |||
1445 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 1456 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1446 | 1457 | ||
1447 | #define __HAVE_ARCH_PGTABLE_DEPOSIT | 1458 | #define __HAVE_ARCH_PGTABLE_DEPOSIT |
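
pmdp_flush_lazy() above flushes the pmd entry immediately only when the attach count says another context may be using the mm, and otherwise just records a pending flush. A simplified standalone model of that decision (the fields are stand-ins, not the kernel's):

#include <stdio.h>

struct mm_model {
	int attach_count;   /* contexts currently attached to this mm */
	int is_active;      /* 1 if it is the caller's active mm */
	int flush_mm;       /* deferred-flush marker */
};

static void flush_lazy(struct mm_model *mm)
{
	if (mm->attach_count > mm->is_active)
		printf("flush the entry right away\n");
	else
		mm->flush_mm = 1;   /* nobody else attached: defer the flush */
}

int main(void)
{
	struct mm_model only_here = { .attach_count = 1, .is_active = 1 };
	struct mm_model shared    = { .attach_count = 2, .is_active = 1 };

	flush_lazy(&only_here);    /* defers */
	flush_lazy(&shared);       /* flushes now */
	printf("deferred: %d %d\n", only_here.flush_mm, shared.flush_mm);
	return 0;
}
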
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index b0e6435b2f02..0eb37505cab1 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h | |||
@@ -43,6 +43,7 @@ extern void execve_tail(void); | |||
43 | #ifndef CONFIG_64BIT | 43 | #ifndef CONFIG_64BIT |
44 | 44 | ||
45 | #define TASK_SIZE (1UL << 31) | 45 | #define TASK_SIZE (1UL << 31) |
46 | #define TASK_MAX_SIZE (1UL << 31) | ||
46 | #define TASK_UNMAPPED_BASE (1UL << 30) | 47 | #define TASK_UNMAPPED_BASE (1UL << 30) |
47 | 48 | ||
48 | #else /* CONFIG_64BIT */ | 49 | #else /* CONFIG_64BIT */ |
@@ -51,6 +52,7 @@ extern void execve_tail(void); | |||
51 | #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ | 52 | #define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ |
52 | (1UL << 30) : (1UL << 41)) | 53 | (1UL << 30) : (1UL << 41)) |
53 | #define TASK_SIZE TASK_SIZE_OF(current) | 54 | #define TASK_SIZE TASK_SIZE_OF(current) |
55 | #define TASK_MAX_SIZE (1UL << 53) | ||
54 | 56 | ||
55 | #endif /* CONFIG_64BIT */ | 57 | #endif /* CONFIG_64BIT */ |
56 | 58 | ||
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3074475c8ae0..3a74d8af0d69 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c | |||
@@ -119,12 +119,21 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) | |||
119 | * The layout is as follows: | 119 | * The layout is as follows: |
120 | * - gpr 2 contains the subchannel id (passed as addr) | 120 | * - gpr 2 contains the subchannel id (passed as addr) |
121 | * - gpr 3 contains the virtqueue index (passed as datamatch) | 121 | * - gpr 3 contains the virtqueue index (passed as datamatch) |
122 | * - gpr 4 contains the index on the bus (optionally) | ||
122 | */ | 123 | */ |
123 | ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS, | 124 | ret = kvm_io_bus_write_cookie(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS, |
124 | vcpu->run->s.regs.gprs[2], | 125 | vcpu->run->s.regs.gprs[2], |
125 | 8, &vcpu->run->s.regs.gprs[3]); | 126 | 8, &vcpu->run->s.regs.gprs[3], |
127 | vcpu->run->s.regs.gprs[4]); | ||
126 | srcu_read_unlock(&vcpu->kvm->srcu, idx); | 128 | srcu_read_unlock(&vcpu->kvm->srcu, idx); |
127 | /* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */ | 129 | |
130 | /* | ||
131 | * Return cookie in gpr 2, but don't overwrite the register if the | ||
132 | * diagnose will be handled by userspace. | ||
133 | */ | ||
134 | if (ret != -EOPNOTSUPP) | ||
135 | vcpu->run->s.regs.gprs[2] = ret; | ||
136 | /* kvm_io_bus_write_cookie returns -EOPNOTSUPP if it found no match. */ | ||
128 | return ret < 0 ? ret : 0; | 137 | return ret < 0 ? ret : 0; |
129 | } | 138 | } |
130 | 139 | ||
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 34c1c9a90be2..776dafe918db 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
29 | #include <asm/nmi.h> | 29 | #include <asm/nmi.h> |
30 | #include <asm/switch_to.h> | 30 | #include <asm/switch_to.h> |
31 | #include <asm/facility.h> | ||
31 | #include <asm/sclp.h> | 32 | #include <asm/sclp.h> |
32 | #include "kvm-s390.h" | 33 | #include "kvm-s390.h" |
33 | #include "gaccess.h" | 34 | #include "gaccess.h" |
@@ -84,9 +85,15 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
84 | { NULL } | 85 | { NULL } |
85 | }; | 86 | }; |
86 | 87 | ||
87 | static unsigned long long *facilities; | 88 | unsigned long *vfacilities; |
88 | static struct gmap_notifier gmap_notifier; | 89 | static struct gmap_notifier gmap_notifier; |
89 | 90 | ||
91 | /* test availability of vfacility */ | ||
92 | static inline int test_vfacility(unsigned long nr) | ||
93 | { | ||
94 | return __test_facility(nr, (void *) vfacilities); | ||
95 | } | ||
96 | |||
90 | /* Section: not file related */ | 97 | /* Section: not file related */ |
91 | int kvm_arch_hardware_enable(void *garbage) | 98 | int kvm_arch_hardware_enable(void *garbage) |
92 | { | 99 | { |
@@ -387,7 +394,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
387 | vcpu->arch.sie_block->ecb = 6; | 394 | vcpu->arch.sie_block->ecb = 6; |
388 | vcpu->arch.sie_block->ecb2 = 8; | 395 | vcpu->arch.sie_block->ecb2 = 8; |
389 | vcpu->arch.sie_block->eca = 0xC1002001U; | 396 | vcpu->arch.sie_block->eca = 0xC1002001U; |
390 | vcpu->arch.sie_block->fac = (int) (long) facilities; | 397 | vcpu->arch.sie_block->fac = (int) (long) vfacilities; |
391 | hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); | 398 | hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); |
392 | tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet, | 399 | tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet, |
393 | (unsigned long) vcpu); | 400 | (unsigned long) vcpu); |
@@ -1063,6 +1070,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | |||
1063 | return 0; | 1070 | return 0; |
1064 | } | 1071 | } |
1065 | 1072 | ||
1073 | void kvm_arch_memslots_updated(struct kvm *kvm) | ||
1074 | { | ||
1075 | } | ||
1076 | |||
1066 | /* Section: memory related */ | 1077 | /* Section: memory related */ |
1067 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 1078 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
1068 | struct kvm_memory_slot *memslot, | 1079 | struct kvm_memory_slot *memslot, |
@@ -1129,20 +1140,20 @@ static int __init kvm_s390_init(void) | |||
1129 | * to hold the maximum amount of facilities. On the other hand, we | 1140 | * to hold the maximum amount of facilities. On the other hand, we |
1130 | * only set facilities that are known to work in KVM. | 1141 | * only set facilities that are known to work in KVM. |
1131 | */ | 1142 | */ |
1132 | facilities = (unsigned long long *) get_zeroed_page(GFP_KERNEL|GFP_DMA); | 1143 | vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA); |
1133 | if (!facilities) { | 1144 | if (!vfacilities) { |
1134 | kvm_exit(); | 1145 | kvm_exit(); |
1135 | return -ENOMEM; | 1146 | return -ENOMEM; |
1136 | } | 1147 | } |
1137 | memcpy(facilities, S390_lowcore.stfle_fac_list, 16); | 1148 | memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16); |
1138 | facilities[0] &= 0xff82fff3f47c0000ULL; | 1149 | vfacilities[0] &= 0xff82fff3f47c0000UL; |
1139 | facilities[1] &= 0x001c000000000000ULL; | 1150 | vfacilities[1] &= 0x001c000000000000UL; |
1140 | return 0; | 1151 | return 0; |
1141 | } | 1152 | } |
1142 | 1153 | ||
1143 | static void __exit kvm_s390_exit(void) | 1154 | static void __exit kvm_s390_exit(void) |
1144 | { | 1155 | { |
1145 | free_page((unsigned long) facilities); | 1156 | free_page((unsigned long) vfacilities); |
1146 | kvm_exit(); | 1157 | kvm_exit(); |
1147 | } | 1158 | } |
1148 | 1159 | ||
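
test_vfacility() above delegates to __test_facility() on the 16-byte vfacilities list that kvm_s390_init() copies and masks from the lowcore STFLE data. A standalone model of the bit numbering involved, assuming the usual STFLE convention that facility bits count from the most-significant bit of the first byte:

#include <stdio.h>
#include <string.h>

static int test_facility_bit(const unsigned char *list, unsigned int nr)
{
	return (list[nr / 8] >> (7 - (nr % 8))) & 1;
}

int main(void)
{
	unsigned char fac[16];

	memset(fac, 0, sizeof(fac));
	fac[0] = 0x80;                         /* facility 0 installed */
	printf("%d %d\n", test_facility_bit(fac, 0), test_facility_bit(fac, 1));
	return 0;
}
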
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 028ca9fd2158..dc99f1ca4267 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h | |||
@@ -24,6 +24,9 @@ | |||
24 | 24 | ||
25 | typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); | 25 | typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); |
26 | 26 | ||
27 | /* declare vfacilities extern */ | ||
28 | extern unsigned long *vfacilities; | ||
29 | |||
27 | /* negativ values are error codes, positive values for internal conditions */ | 30 | /* negativ values are error codes, positive values for internal conditions */ |
28 | #define SIE_INTERCEPT_RERUNVCPU (1<<0) | 31 | #define SIE_INTERCEPT_RERUNVCPU (1<<0) |
29 | #define SIE_INTERCEPT_UCONTROL (1<<1) | 32 | #define SIE_INTERCEPT_UCONTROL (1<<1) |
@@ -112,6 +115,13 @@ static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu) | |||
112 | return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; | 115 | return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; |
113 | } | 116 | } |
114 | 117 | ||
118 | /* Set the condition code in the guest program status word */ | ||
119 | static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc) | ||
120 | { | ||
121 | vcpu->arch.sie_block->gpsw.mask &= ~(3UL << 44); | ||
122 | vcpu->arch.sie_block->gpsw.mask |= cc << 44; | ||
123 | } | ||
124 | |||
115 | int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); | 125 | int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); |
116 | enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); | 126 | enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); |
117 | void kvm_s390_tasklet(unsigned long parm); | 127 | void kvm_s390_tasklet(unsigned long parm); |
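
kvm_s390_set_psw_cc() above centralizes the two-bit condition-code update at shift 44 of the guest PSW mask, replacing the open-coded mask-and-or sequences that the priv.c hunks below drop. A self-contained sketch of the same bit manipulation:

#include <stdio.h>

static void set_psw_cc(unsigned long *psw_mask, unsigned long cc)
{
	*psw_mask &= ~(3UL << 44);       /* clear the old condition code */
	*psw_mask |= (cc & 3UL) << 44;   /* install the new one (0..3) */
}

int main(void)
{
	unsigned long mask = 0;

	set_psw_cc(&mask, 3);
	printf("%#lx\n", mask);          /* 0x300000000000 */
	return 0;
}
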
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 4cdc54e63ebc..59200ee275e5 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c | |||
@@ -164,8 +164,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu) | |||
164 | kfree(inti); | 164 | kfree(inti); |
165 | no_interrupt: | 165 | no_interrupt: |
166 | /* Set condition code and we're done. */ | 166 | /* Set condition code and we're done. */ |
167 | vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); | 167 | kvm_s390_set_psw_cc(vcpu, cc); |
168 | vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44; | ||
169 | return 0; | 168 | return 0; |
170 | } | 169 | } |
171 | 170 | ||
@@ -220,15 +219,13 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) | |||
220 | * Set condition code 3 to stop the guest from issueing channel | 219 | * Set condition code 3 to stop the guest from issueing channel |
221 | * I/O instructions. | 220 | * I/O instructions. |
222 | */ | 221 | */ |
223 | vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); | 222 | kvm_s390_set_psw_cc(vcpu, 3); |
224 | vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44; | ||
225 | return 0; | 223 | return 0; |
226 | } | 224 | } |
227 | } | 225 | } |
228 | 226 | ||
229 | static int handle_stfl(struct kvm_vcpu *vcpu) | 227 | static int handle_stfl(struct kvm_vcpu *vcpu) |
230 | { | 228 | { |
231 | unsigned int facility_list; | ||
232 | int rc; | 229 | int rc; |
233 | 230 | ||
234 | vcpu->stat.instruction_stfl++; | 231 | vcpu->stat.instruction_stfl++; |
@@ -236,15 +233,13 @@ static int handle_stfl(struct kvm_vcpu *vcpu) | |||
236 | if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) | 233 | if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) |
237 | return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); | 234 | return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); |
238 | 235 | ||
239 | /* only pass the facility bits, which we can handle */ | ||
240 | facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3; | ||
241 | |||
242 | rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), | 236 | rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), |
243 | &facility_list, sizeof(facility_list)); | 237 | vfacilities, 4); |
244 | if (rc) | 238 | if (rc) |
245 | return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); | 239 | return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); |
246 | VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list); | 240 | VCPU_EVENT(vcpu, 5, "store facility list value %x", |
247 | trace_kvm_s390_handle_stfl(vcpu, facility_list); | 241 | *(unsigned int *) vfacilities); |
242 | trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities); | ||
248 | return 0; | 243 | return 0; |
249 | } | 244 | } |
250 | 245 | ||
@@ -387,7 +382,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) | |||
387 | return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); | 382 | return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); |
388 | 383 | ||
389 | if (fc > 3) { | 384 | if (fc > 3) { |
390 | vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; /* cc 3 */ | 385 | kvm_s390_set_psw_cc(vcpu, 3); |
391 | return 0; | 386 | return 0; |
392 | } | 387 | } |
393 | 388 | ||
@@ -397,7 +392,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) | |||
397 | 392 | ||
398 | if (fc == 0) { | 393 | if (fc == 0) { |
399 | vcpu->run->s.regs.gprs[0] = 3 << 28; | 394 | vcpu->run->s.regs.gprs[0] = 3 << 28; |
400 | vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); /* cc 0 */ | 395 | kvm_s390_set_psw_cc(vcpu, 0); |
401 | return 0; | 396 | return 0; |
402 | } | 397 | } |
403 | 398 | ||
@@ -431,12 +426,11 @@ static int handle_stsi(struct kvm_vcpu *vcpu) | |||
431 | } | 426 | } |
432 | trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); | 427 | trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); |
433 | free_page(mem); | 428 | free_page(mem); |
434 | vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); | 429 | kvm_s390_set_psw_cc(vcpu, 0); |
435 | vcpu->run->s.regs.gprs[0] = 0; | 430 | vcpu->run->s.regs.gprs[0] = 0; |
436 | return 0; | 431 | return 0; |
437 | out_no_data: | 432 | out_no_data: |
438 | /* condition code 3 */ | 433 | kvm_s390_set_psw_cc(vcpu, 3); |
439 | vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; | ||
440 | out_exception: | 434 | out_exception: |
441 | free_page(mem); | 435 | free_page(mem); |
442 | return rc; | 436 | return rc; |
@@ -494,12 +488,12 @@ static int handle_epsw(struct kvm_vcpu *vcpu) | |||
494 | kvm_s390_get_regs_rre(vcpu, ®1, ®2); | 488 | kvm_s390_get_regs_rre(vcpu, ®1, ®2); |
495 | 489 | ||
496 | /* This basically extracts the mask half of the psw. */ | 490 | /* This basically extracts the mask half of the psw. */ |
497 | vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000; | 491 | vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000UL; |
498 | vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32; | 492 | vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32; |
499 | if (reg2) { | 493 | if (reg2) { |
500 | vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000; | 494 | vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000UL; |
501 | vcpu->run->s.regs.gprs[reg2] |= | 495 | vcpu->run->s.regs.gprs[reg2] |= |
502 | vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff; | 496 | vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffffUL; |
503 | } | 497 | } |
504 | return 0; | 498 | return 0; |
505 | } | 499 | } |
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 6d16132d0850..bf7c0dc64a76 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c | |||
@@ -335,7 +335,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, | |||
335 | 335 | ||
336 | if ((from | to | len) & (PMD_SIZE - 1)) | 336 | if ((from | to | len) & (PMD_SIZE - 1)) |
337 | return -EINVAL; | 337 | return -EINVAL; |
338 | if (len == 0 || from + len > PGDIR_SIZE || | 338 | if (len == 0 || from + len > TASK_MAX_SIZE || |
339 | from + len < from || to + len < to) | 339 | from + len < from || to + len < to) |
340 | return -EINVAL; | 340 | return -EINVAL; |
341 | 341 | ||
@@ -732,6 +732,11 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte) | |||
732 | spin_unlock(&gmap_notifier_lock); | 732 | spin_unlock(&gmap_notifier_lock); |
733 | } | 733 | } |
734 | 734 | ||
735 | static inline int page_table_with_pgste(struct page *page) | ||
736 | { | ||
737 | return atomic_read(&page->_mapcount) == 0; | ||
738 | } | ||
739 | |||
735 | static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, | 740 | static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, |
736 | unsigned long vmaddr) | 741 | unsigned long vmaddr) |
737 | { | 742 | { |
@@ -751,7 +756,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, | |||
751 | mp->vmaddr = vmaddr & PMD_MASK; | 756 | mp->vmaddr = vmaddr & PMD_MASK; |
752 | INIT_LIST_HEAD(&mp->mapper); | 757 | INIT_LIST_HEAD(&mp->mapper); |
753 | page->index = (unsigned long) mp; | 758 | page->index = (unsigned long) mp; |
754 | atomic_set(&page->_mapcount, 3); | 759 | atomic_set(&page->_mapcount, 0); |
755 | table = (unsigned long *) page_to_phys(page); | 760 | table = (unsigned long *) page_to_phys(page); |
756 | clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); | 761 | clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); |
757 | clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT, | 762 | clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT, |
@@ -818,6 +823,11 @@ EXPORT_SYMBOL(set_guest_storage_key); | |||
818 | 823 | ||
819 | #else /* CONFIG_PGSTE */ | 824 | #else /* CONFIG_PGSTE */ |
820 | 825 | ||
826 | static inline int page_table_with_pgste(struct page *page) | ||
827 | { | ||
828 | return 0; | ||
829 | } | ||
830 | |||
821 | static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, | 831 | static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, |
822 | unsigned long vmaddr) | 832 | unsigned long vmaddr) |
823 | { | 833 | { |
@@ -894,12 +904,12 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) | |||
894 | struct page *page; | 904 | struct page *page; |
895 | unsigned int bit, mask; | 905 | unsigned int bit, mask; |
896 | 906 | ||
897 | if (mm_has_pgste(mm)) { | 907 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); |
908 | if (page_table_with_pgste(page)) { | ||
898 | gmap_disconnect_pgtable(mm, table); | 909 | gmap_disconnect_pgtable(mm, table); |
899 | return page_table_free_pgste(table); | 910 | return page_table_free_pgste(table); |
900 | } | 911 | } |
901 | /* Free 1K/2K page table fragment of a 4K page */ | 912 | /* Free 1K/2K page table fragment of a 4K page */ |
902 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); | ||
903 | bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); | 913 | bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); |
904 | spin_lock_bh(&mm->context.list_lock); | 914 | spin_lock_bh(&mm->context.list_lock); |
905 | if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) | 915 | if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) |
@@ -937,14 +947,14 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table) | |||
937 | unsigned int bit, mask; | 947 | unsigned int bit, mask; |
938 | 948 | ||
939 | mm = tlb->mm; | 949 | mm = tlb->mm; |
940 | if (mm_has_pgste(mm)) { | 950 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); |
951 | if (page_table_with_pgste(page)) { | ||
941 | gmap_disconnect_pgtable(mm, table); | 952 | gmap_disconnect_pgtable(mm, table); |
942 | table = (unsigned long *) (__pa(table) | FRAG_MASK); | 953 | table = (unsigned long *) (__pa(table) | FRAG_MASK); |
943 | tlb_remove_table(tlb, table); | 954 | tlb_remove_table(tlb, table); |
944 | return; | 955 | return; |
945 | } | 956 | } |
946 | bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t))); | 957 | bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t))); |
947 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); | ||
948 | spin_lock_bh(&mm->context.list_lock); | 958 | spin_lock_bh(&mm->context.list_lock); |
949 | if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) | 959 | if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) |
950 | list_del(&page->lru); | 960 | list_del(&page->lru); |
@@ -1030,36 +1040,120 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) | |||
1030 | } | 1040 | } |
1031 | 1041 | ||
1032 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 1042 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1033 | void thp_split_vma(struct vm_area_struct *vma) | 1043 | static inline void thp_split_vma(struct vm_area_struct *vma) |
1034 | { | 1044 | { |
1035 | unsigned long addr; | 1045 | unsigned long addr; |
1036 | struct page *page; | ||
1037 | 1046 | ||
1038 | for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { | 1047 | for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) |
1039 | page = follow_page(vma, addr, FOLL_SPLIT); | 1048 | follow_page(vma, addr, FOLL_SPLIT); |
1040 | } | ||
1041 | } | 1049 | } |
1042 | 1050 | ||
1043 | void thp_split_mm(struct mm_struct *mm) | 1051 | static inline void thp_split_mm(struct mm_struct *mm) |
1044 | { | 1052 | { |
1045 | struct vm_area_struct *vma = mm->mmap; | 1053 | struct vm_area_struct *vma; |
1046 | 1054 | ||
1047 | while (vma != NULL) { | 1055 | for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { |
1048 | thp_split_vma(vma); | 1056 | thp_split_vma(vma); |
1049 | vma->vm_flags &= ~VM_HUGEPAGE; | 1057 | vma->vm_flags &= ~VM_HUGEPAGE; |
1050 | vma->vm_flags |= VM_NOHUGEPAGE; | 1058 | vma->vm_flags |= VM_NOHUGEPAGE; |
1051 | vma = vma->vm_next; | ||
1052 | } | 1059 | } |
1060 | mm->def_flags |= VM_NOHUGEPAGE; | ||
1061 | } | ||
1062 | #else | ||
1063 | static inline void thp_split_mm(struct mm_struct *mm) | ||
1064 | { | ||
1053 | } | 1065 | } |
1054 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 1066 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
1055 | 1067 | ||
1068 | static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb, | ||
1069 | struct mm_struct *mm, pud_t *pud, | ||
1070 | unsigned long addr, unsigned long end) | ||
1071 | { | ||
1072 | unsigned long next, *table, *new; | ||
1073 | struct page *page; | ||
1074 | pmd_t *pmd; | ||
1075 | |||
1076 | pmd = pmd_offset(pud, addr); | ||
1077 | do { | ||
1078 | next = pmd_addr_end(addr, end); | ||
1079 | again: | ||
1080 | if (pmd_none_or_clear_bad(pmd)) | ||
1081 | continue; | ||
1082 | table = (unsigned long *) pmd_deref(*pmd); | ||
1083 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); | ||
1084 | if (page_table_with_pgste(page)) | ||
1085 | continue; | ||
1086 | /* Allocate new page table with pgstes */ | ||
1087 | new = page_table_alloc_pgste(mm, addr); | ||
1088 | if (!new) { | ||
1089 | mm->context.has_pgste = 0; | ||
1090 | continue; | ||
1091 | } | ||
1092 | spin_lock(&mm->page_table_lock); | ||
1093 | if (likely((unsigned long *) pmd_deref(*pmd) == table)) { | ||
1094 | /* Nuke pmd entry pointing to the "short" page table */ | ||
1095 | pmdp_flush_lazy(mm, addr, pmd); | ||
1096 | pmd_clear(pmd); | ||
1097 | /* Copy ptes from old table to new table */ | ||
1098 | memcpy(new, table, PAGE_SIZE/2); | ||
1099 | clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); | ||
1100 | /* Establish new table */ | ||
1101 | pmd_populate(mm, pmd, (pte_t *) new); | ||
1102 | /* Free old table with rcu, there might be a walker! */ | ||
1103 | page_table_free_rcu(tlb, table); | ||
1104 | new = NULL; | ||
1105 | } | ||
1106 | spin_unlock(&mm->page_table_lock); | ||
1107 | if (new) { | ||
1108 | page_table_free_pgste(new); | ||
1109 | goto again; | ||
1110 | } | ||
1111 | } while (pmd++, addr = next, addr != end); | ||
1112 | |||
1113 | return addr; | ||
1114 | } | ||
1115 | |||
1116 | static unsigned long page_table_realloc_pud(struct mmu_gather *tlb, | ||
1117 | struct mm_struct *mm, pgd_t *pgd, | ||
1118 | unsigned long addr, unsigned long end) | ||
1119 | { | ||
1120 | unsigned long next; | ||
1121 | pud_t *pud; | ||
1122 | |||
1123 | pud = pud_offset(pgd, addr); | ||
1124 | do { | ||
1125 | next = pud_addr_end(addr, end); | ||
1126 | if (pud_none_or_clear_bad(pud)) | ||
1127 | continue; | ||
1128 | next = page_table_realloc_pmd(tlb, mm, pud, addr, next); | ||
1129 | } while (pud++, addr = next, addr != end); | ||
1130 | |||
1131 | return addr; | ||
1132 | } | ||
1133 | |||
1134 | static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm, | ||
1135 | unsigned long addr, unsigned long end) | ||
1136 | { | ||
1137 | unsigned long next; | ||
1138 | pgd_t *pgd; | ||
1139 | |||
1140 | pgd = pgd_offset(mm, addr); | ||
1141 | do { | ||
1142 | next = pgd_addr_end(addr, end); | ||
1143 | if (pgd_none_or_clear_bad(pgd)) | ||
1144 | continue; | ||
1145 | next = page_table_realloc_pud(tlb, mm, pgd, addr, next); | ||
1146 | } while (pgd++, addr = next, addr != end); | ||
1147 | } | ||
1148 | |||
1056 | /* | 1149 | /* |
1057 | * switch on pgstes for its userspace process (for kvm) | 1150 | * switch on pgstes for its userspace process (for kvm) |
1058 | */ | 1151 | */ |
1059 | int s390_enable_sie(void) | 1152 | int s390_enable_sie(void) |
1060 | { | 1153 | { |
1061 | struct task_struct *tsk = current; | 1154 | struct task_struct *tsk = current; |
1062 | struct mm_struct *mm, *old_mm; | 1155 | struct mm_struct *mm = tsk->mm; |
1156 | struct mmu_gather tlb; | ||
1063 | 1157 | ||
1064 | /* Do we have switched amode? If no, we cannot do sie */ | 1158 | /* Do we have switched amode? If no, we cannot do sie */ |
1065 | if (s390_user_mode == HOME_SPACE_MODE) | 1159 | if (s390_user_mode == HOME_SPACE_MODE) |
@@ -1069,57 +1163,16 @@ int s390_enable_sie(void) | |||
1069 | if (mm_has_pgste(tsk->mm)) | 1163 | if (mm_has_pgste(tsk->mm)) |
1070 | return 0; | 1164 | return 0; |
1071 | 1165 | ||
1072 | /* lets check if we are allowed to replace the mm */ | 1166 | down_write(&mm->mmap_sem); |
1073 | task_lock(tsk); | ||
1074 | if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || | ||
1075 | #ifdef CONFIG_AIO | ||
1076 | !hlist_empty(&tsk->mm->ioctx_list) || | ||
1077 | #endif | ||
1078 | tsk->mm != tsk->active_mm) { | ||
1079 | task_unlock(tsk); | ||
1080 | return -EINVAL; | ||
1081 | } | ||
1082 | task_unlock(tsk); | ||
1083 | |||
1084 | /* we copy the mm and let dup_mm create the page tables with_pgstes */ | ||
1085 | tsk->mm->context.alloc_pgste = 1; | ||
1086 | /* make sure that both mms have a correct rss state */ | ||
1087 | sync_mm_rss(tsk->mm); | ||
1088 | mm = dup_mm(tsk); | ||
1089 | tsk->mm->context.alloc_pgste = 0; | ||
1090 | if (!mm) | ||
1091 | return -ENOMEM; | ||
1092 | |||
1093 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
1094 | /* split thp mappings and disable thp for future mappings */ | 1167 | /* split thp mappings and disable thp for future mappings */ |
1095 | thp_split_mm(mm); | 1168 | thp_split_mm(mm); |
1096 | mm->def_flags |= VM_NOHUGEPAGE; | 1169 | /* Reallocate the page tables with pgstes */ |
1097 | #endif | 1170 | mm->context.has_pgste = 1; |
1098 | 1171 | tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE); | |
1099 | /* Now lets check again if something happened */ | 1172 | page_table_realloc(&tlb, mm, 0, TASK_SIZE); |
1100 | task_lock(tsk); | 1173 | tlb_finish_mmu(&tlb, 0, TASK_SIZE); |
1101 | if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || | 1174 | up_write(&mm->mmap_sem); |
1102 | #ifdef CONFIG_AIO | 1175 | return mm->context.has_pgste ? 0 : -ENOMEM; |
1103 | !hlist_empty(&tsk->mm->ioctx_list) || | ||
1104 | #endif | ||
1105 | tsk->mm != tsk->active_mm) { | ||
1106 | mmput(mm); | ||
1107 | task_unlock(tsk); | ||
1108 | return -EINVAL; | ||
1109 | } | ||
1110 | |||
1111 | /* ok, we are alone. No ptrace, no threads, etc. */ | ||
1112 | old_mm = tsk->mm; | ||
1113 | tsk->mm = tsk->active_mm = mm; | ||
1114 | preempt_disable(); | ||
1115 | update_mm(mm, tsk); | ||
1116 | atomic_inc(&mm->context.attach_count); | ||
1117 | atomic_dec(&old_mm->context.attach_count); | ||
1118 | cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); | ||
1119 | preempt_enable(); | ||
1120 | task_unlock(tsk); | ||
1121 | mmput(old_mm); | ||
1122 | return 0; | ||
1123 | } | 1176 | } |
1124 | EXPORT_SYMBOL_GPL(s390_enable_sie); | 1177 | EXPORT_SYMBOL_GPL(s390_enable_sie); |
1125 | 1178 | ||
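
page_table_realloc_pmd() above allocates the pgste-capable table without the page_table_lock held, re-checks under the lock that the pmd still points at the table it intends to replace, retries if it lost the race, and frees the old table through RCU. A standalone sketch of that allocate-unlocked/recheck-locked pattern using plain pthreads and malloc in place of the kernel's locking and RCU:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TABLE_SIZE 4096

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Replace the table at *slot with a bigger, freshly allocated one. */
static int upgrade_table(void **slot)
{
	for (;;) {
		void *old = *slot;                    /* snapshot outside the lock */
		void *new = calloc(1, TABLE_SIZE);    /* allocate outside the lock */

		if (!new)
			return -1;

		pthread_mutex_lock(&table_lock);
		if (*slot == old) {                   /* still the table we saw? */
			memcpy(new, old, TABLE_SIZE / 2); /* keep the existing half */
			*slot = new;                      /* publish the new table */
			pthread_mutex_unlock(&table_lock);
			free(old);    /* the kernel version defers this via RCU */
			return 0;
		}
		pthread_mutex_unlock(&table_lock);
		free(new);                            /* lost the race, try again */
	}
}

int main(void)
{
	void *table = calloc(1, TABLE_SIZE / 2);

	printf("upgrade: %d\n", upgrade_table(&table));
	free(table);
	return 0;
}
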
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f87f7fcefa0a..c76ff74a98f2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -286,6 +286,7 @@ struct kvm_mmu { | |||
286 | u64 *pae_root; | 286 | u64 *pae_root; |
287 | u64 *lm_root; | 287 | u64 *lm_root; |
288 | u64 rsvd_bits_mask[2][4]; | 288 | u64 rsvd_bits_mask[2][4]; |
289 | u64 bad_mt_xwr; | ||
289 | 290 | ||
290 | /* | 291 | /* |
291 | * Bitmap: bit set = last pte in walk | 292 | * Bitmap: bit set = last pte in walk |
@@ -323,6 +324,7 @@ struct kvm_pmu { | |||
323 | u64 global_ovf_ctrl; | 324 | u64 global_ovf_ctrl; |
324 | u64 counter_bitmask[2]; | 325 | u64 counter_bitmask[2]; |
325 | u64 global_ctrl_mask; | 326 | u64 global_ctrl_mask; |
327 | u64 reserved_bits; | ||
326 | u8 version; | 328 | u8 version; |
327 | struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; | 329 | struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; |
328 | struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; | 330 | struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; |
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch { | |||
511 | * instruction. | 513 | * instruction. |
512 | */ | 514 | */ |
513 | bool write_fault_to_shadow_pgtable; | 515 | bool write_fault_to_shadow_pgtable; |
516 | |||
517 | /* set at EPT violation at this point */ | ||
518 | unsigned long exit_qualification; | ||
519 | |||
520 | /* pv related host specific info */ | ||
521 | struct { | ||
522 | bool pv_unhalted; | ||
523 | } pv; | ||
514 | }; | 524 | }; |
515 | 525 | ||
516 | struct kvm_lpage_info { | 526 | struct kvm_lpage_info { |
@@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz; | |||
802 | extern u32 kvm_max_guest_tsc_khz; | 812 | extern u32 kvm_max_guest_tsc_khz; |
803 | 813 | ||
804 | enum emulation_result { | 814 | enum emulation_result { |
805 | EMULATE_DONE, /* no further processing */ | 815 | EMULATE_DONE, /* no further processing */ |
806 | EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ | 816 | EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ |
807 | EMULATE_FAIL, /* can't emulate this instruction */ | 817 | EMULATE_FAIL, /* can't emulate this instruction */ |
808 | }; | 818 | }; |
809 | 819 | ||
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 109a9dd5d454..be8269b00e2a 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h | |||
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, | |||
93 | 93 | ||
94 | struct pvclock_vsyscall_time_info { | 94 | struct pvclock_vsyscall_time_info { |
95 | struct pvclock_vcpu_time_info pvti; | 95 | struct pvclock_vcpu_time_info pvti; |
96 | u32 migrate_count; | ||
97 | } __attribute__((__aligned__(SMP_CACHE_BYTES))); | 96 | } __attribute__((__aligned__(SMP_CACHE_BYTES))); |
98 | 97 | ||
99 | #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) | 98 | #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) |
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f3e01a2cbaa1..966502d4682e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
@@ -387,6 +387,7 @@ enum vmcs_field { | |||
387 | #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 | 387 | #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 |
388 | #define VMX_EPT_EXTENT_CONTEXT 1 | 388 | #define VMX_EPT_EXTENT_CONTEXT 1 |
389 | #define VMX_EPT_EXTENT_GLOBAL 2 | 389 | #define VMX_EPT_EXTENT_GLOBAL 2 |
390 | #define VMX_EPT_EXTENT_SHIFT 24 | ||
390 | 391 | ||
391 | #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) | 392 | #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) |
392 | #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) | 393 | #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) |
@@ -394,6 +395,7 @@ enum vmcs_field { | |||
394 | #define VMX_EPTP_WB_BIT (1ull << 14) | 395 | #define VMX_EPTP_WB_BIT (1ull << 14) |
395 | #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) | 396 | #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) |
396 | #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) | 397 | #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) |
398 | #define VMX_EPT_INVEPT_BIT (1ull << 20) | ||
397 | #define VMX_EPT_AD_BIT (1ull << 21) | 399 | #define VMX_EPT_AD_BIT (1ull << 21) |
398 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) | 400 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) |
399 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) | 401 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) |
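
The vmx.h hunk above adds VMX_EPT_INVEPT_BIT and an extent shift alongside the existing extent bits. A small sketch of testing such capability bits in a raw 64-bit capability word; the value below is invented for illustration, not read from the IA32_VMX_EPT_VPID_CAP MSR:

#include <stdio.h>

#define VMX_EPT_INVEPT_BIT         (1ull << 20)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT  (1ull << 26)

int main(void)
{
	unsigned long long ept_vpid_cap = 0x06114141ull;   /* illustrative only */

	if (ept_vpid_cap & VMX_EPT_INVEPT_BIT)
		printf("INVEPT supported (extents: %s%s)\n",
		       (ept_vpid_cap & VMX_EPT_EXTENT_CONTEXT_BIT) ? "context " : "",
		       (ept_vpid_cap & VMX_EPT_EXTENT_GLOBAL_BIT) ? "global" : "");
	return 0;
}
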
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index d651082c7cf7..0e79420376eb 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h | |||
@@ -65,6 +65,7 @@ | |||
65 | #define EXIT_REASON_EOI_INDUCED 45 | 65 | #define EXIT_REASON_EOI_INDUCED 45 |
66 | #define EXIT_REASON_EPT_VIOLATION 48 | 66 | #define EXIT_REASON_EPT_VIOLATION 48 |
67 | #define EXIT_REASON_EPT_MISCONFIG 49 | 67 | #define EXIT_REASON_EPT_MISCONFIG 49 |
68 | #define EXIT_REASON_INVEPT 50 | ||
68 | #define EXIT_REASON_PREEMPTION_TIMER 52 | 69 | #define EXIT_REASON_PREEMPTION_TIMER 52 |
69 | #define EXIT_REASON_WBINVD 54 | 70 | #define EXIT_REASON_WBINVD 54 |
70 | #define EXIT_REASON_XSETBV 55 | 71 | #define EXIT_REASON_XSETBV 55 |
@@ -106,12 +107,13 @@ | |||
106 | { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ | 107 | { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ |
107 | { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ | 108 | { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ |
108 | { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ | 109 | { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ |
110 | { EXIT_REASON_INVEPT, "INVEPT" }, \ | ||
111 | { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ | ||
109 | { EXIT_REASON_WBINVD, "WBINVD" }, \ | 112 | { EXIT_REASON_WBINVD, "WBINVD" }, \ |
110 | { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ | 113 | { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ |
111 | { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ | 114 | { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ |
112 | { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ | 115 | { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ |
113 | { EXIT_REASON_INVD, "INVD" }, \ | 116 | { EXIT_REASON_INVD, "INVD" }, \ |
114 | { EXIT_REASON_INVPCID, "INVPCID" }, \ | 117 | { EXIT_REASON_INVPCID, "INVPCID" } |
115 | { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" } | ||
116 | 118 | ||
117 | #endif /* _UAPIVMX_H */ | 119 | #endif /* _UAPIVMX_H */ |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2cb9470ea85b..a16bae3f83b3 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, | |||
128 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); | 128 | set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); |
129 | } | 129 | } |
130 | 130 | ||
131 | static struct pvclock_vsyscall_time_info *pvclock_vdso_info; | ||
132 | |||
133 | static struct pvclock_vsyscall_time_info * | ||
134 | pvclock_get_vsyscall_user_time_info(int cpu) | ||
135 | { | ||
136 | if (!pvclock_vdso_info) { | ||
137 | BUG(); | ||
138 | return NULL; | ||
139 | } | ||
140 | |||
141 | return &pvclock_vdso_info[cpu]; | ||
142 | } | ||
143 | |||
144 | struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) | ||
145 | { | ||
146 | return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; | ||
147 | } | ||
148 | |||
149 | #ifdef CONFIG_X86_64 | 131 | #ifdef CONFIG_X86_64 |
150 | static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, | ||
151 | void *v) | ||
152 | { | ||
153 | struct task_migration_notifier *mn = v; | ||
154 | struct pvclock_vsyscall_time_info *pvti; | ||
155 | |||
156 | pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); | ||
157 | |||
158 | /* this is NULL when pvclock vsyscall is not initialized */ | ||
159 | if (unlikely(pvti == NULL)) | ||
160 | return NOTIFY_DONE; | ||
161 | |||
162 | pvti->migrate_count++; | ||
163 | |||
164 | return NOTIFY_DONE; | ||
165 | } | ||
166 | |||
167 | static struct notifier_block pvclock_migrate = { | ||
168 | .notifier_call = pvclock_task_migrate, | ||
169 | }; | ||
170 | |||
171 | /* | 132 | /* |
172 | * Initialize the generic pvclock vsyscall state. This will allocate | 133 | * Initialize the generic pvclock vsyscall state. This will allocate |
173 | * a/some page(s) for the per-vcpu pvclock information, set up a | 134 | * a/some page(s) for the per-vcpu pvclock information, set up a |
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, | |||
181 | 142 | ||
182 | WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); | 143 | WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); |
183 | 144 | ||
184 | pvclock_vdso_info = i; | ||
185 | |||
186 | for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { | 145 | for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { |
187 | __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, | 146 | __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, |
188 | __pa(i) + (idx*PAGE_SIZE), | 147 | __pa(i) + (idx*PAGE_SIZE), |
189 | PAGE_KERNEL_VVAR); | 148 | PAGE_KERNEL_VVAR); |
190 | } | 149 | } |
191 | 150 | ||
192 | |||
193 | register_task_migration_notifier(&pvclock_migrate); | ||
194 | |||
195 | return 0; | 151 | return 0; |
196 | } | 152 | } |
197 | #endif | 153 | #endif |
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a20ecb5b6cbf..b110fe6c03d4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c | |||
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
413 | (1 << KVM_FEATURE_CLOCKSOURCE2) | | 413 | (1 << KVM_FEATURE_CLOCKSOURCE2) | |
414 | (1 << KVM_FEATURE_ASYNC_PF) | | 414 | (1 << KVM_FEATURE_ASYNC_PF) | |
415 | (1 << KVM_FEATURE_PV_EOI) | | 415 | (1 << KVM_FEATURE_PV_EOI) | |
416 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); | 416 | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | |
417 | (1 << KVM_FEATURE_PV_UNHALT); | ||
417 | 418 | ||
418 | if (sched_info_on()) | 419 | if (sched_info_on()) |
419 | entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); | 420 | entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afc11245827c..5439117d5c4c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) | |||
79 | *((u32 *) (apic->regs + reg_off)) = val; | 79 | *((u32 *) (apic->regs + reg_off)) = val; |
80 | } | 80 | } |
81 | 81 | ||
82 | static inline int apic_test_and_set_vector(int vec, void *bitmap) | ||
83 | { | ||
84 | return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
85 | } | ||
86 | |||
87 | static inline int apic_test_and_clear_vector(int vec, void *bitmap) | ||
88 | { | ||
89 | return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
90 | } | ||
91 | |||
92 | static inline int apic_test_vector(int vec, void *bitmap) | 82 | static inline int apic_test_vector(int vec, void *bitmap) |
93 | { | 83 | { |
94 | return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | 84 | return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); |
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) | |||
331 | } | 321 | } |
332 | EXPORT_SYMBOL_GPL(kvm_apic_update_irr); | 322 | EXPORT_SYMBOL_GPL(kvm_apic_update_irr); |
333 | 323 | ||
334 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) | 324 | static inline void apic_set_irr(int vec, struct kvm_lapic *apic) |
335 | { | 325 | { |
336 | apic->irr_pending = true; | 326 | apic->irr_pending = true; |
337 | return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); | 327 | apic_set_vector(vec, apic->regs + APIC_IRR); |
338 | } | 328 | } |
339 | 329 | ||
340 | static inline int apic_search_irr(struct kvm_lapic *apic) | 330 | static inline int apic_search_irr(struct kvm_lapic *apic) |
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
681 | if (unlikely(!apic_enabled(apic))) | 671 | if (unlikely(!apic_enabled(apic))) |
682 | break; | 672 | break; |
683 | 673 | ||
674 | result = 1; | ||
675 | |||
684 | if (dest_map) | 676 | if (dest_map) |
685 | __set_bit(vcpu->vcpu_id, dest_map); | 677 | __set_bit(vcpu->vcpu_id, dest_map); |
686 | 678 | ||
687 | if (kvm_x86_ops->deliver_posted_interrupt) { | 679 | if (kvm_x86_ops->deliver_posted_interrupt) |
688 | result = 1; | ||
689 | kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); | 680 | kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); |
690 | } else { | 681 | else { |
691 | result = !apic_test_and_set_irr(vector, apic); | 682 | apic_set_irr(vector, apic); |
692 | |||
693 | if (!result) { | ||
694 | if (trig_mode) | ||
695 | apic_debug("level trig mode repeatedly " | ||
696 | "for vector %d", vector); | ||
697 | goto out; | ||
698 | } | ||
699 | 683 | ||
700 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 684 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
701 | kvm_vcpu_kick(vcpu); | 685 | kvm_vcpu_kick(vcpu); |
702 | } | 686 | } |
703 | out: | ||
704 | trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, | 687 | trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, |
705 | trig_mode, vector, !result); | 688 | trig_mode, vector, false); |
706 | break; | 689 | break; |
707 | 690 | ||
708 | case APIC_DM_REMRD: | 691 | case APIC_DM_REMRD: |
709 | apic_debug("Ignoring delivery mode 3\n"); | 692 | result = 1; |
693 | vcpu->arch.pv.pv_unhalted = 1; | ||
694 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
695 | kvm_vcpu_kick(vcpu); | ||
710 | break; | 696 | break; |
711 | 697 | ||
712 | case APIC_DM_SMI: | 698 | case APIC_DM_SMI: |
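
In the lapic.c hunk above, fixed-mode delivery now unconditionally sets the IRR bit via apic_set_irr() and kicks the vcpu, and APIC_DM_REMRD is repurposed to set pv.pv_unhalted before the kick. A standalone sketch of where a vector lands in the IRR, assuming the usual 8 x 32-bit register layout spaced 0x10 apart that the VEC_POS/REG_POS macros in this file encode:

#include <stdio.h>

static void irr_position(unsigned int vec, unsigned int *reg_off,
			 unsigned int *bit)
{
	*reg_off = (vec >> 5) << 4;   /* which IRR register, as a byte offset */
	*bit = vec & 31;              /* which bit inside that register */
}

int main(void)
{
	unsigned int off, bit;

	irr_position(0xec, &off, &bit);
	printf("vector 0xec -> IRR register +%#x, bit %u\n", off, bit);
	return 0;
}
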
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9e9285ae9b94..6e2d2c8f230b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644); | |||
132 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ | 132 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ |
133 | * PT32_LEVEL_BITS))) - 1)) | 133 | * PT32_LEVEL_BITS))) - 1)) |
134 | 134 | ||
135 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | 135 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ |
136 | | PT64_NX_MASK) | 136 | | shadow_x_mask | shadow_nx_mask) |
137 | 137 | ||
138 | #define ACC_EXEC_MASK 1 | 138 | #define ACC_EXEC_MASK 1 |
139 | #define ACC_WRITE_MASK PT_WRITABLE_MASK | 139 | #define ACC_WRITE_MASK PT_WRITABLE_MASK |
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte) | |||
331 | return pte & PT_PAGE_SIZE_MASK; | 331 | return pte & PT_PAGE_SIZE_MASK; |
332 | } | 332 | } |
333 | 333 | ||
334 | static int is_dirty_gpte(unsigned long pte) | ||
335 | { | ||
336 | return pte & PT_DIRTY_MASK; | ||
337 | } | ||
338 | |||
339 | static int is_rmap_spte(u64 pte) | 334 | static int is_rmap_spte(u64 pte) |
340 | { | 335 | { |
341 | return is_shadow_present_pte(pte); | 336 | return is_shadow_present_pte(pte); |
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) | |||
2052 | return __shadow_walk_next(iterator, *iterator->sptep); | 2047 | return __shadow_walk_next(iterator, *iterator->sptep); |
2053 | } | 2048 | } |
2054 | 2049 | ||
2055 | static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) | 2050 | static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) |
2056 | { | 2051 | { |
2057 | u64 spte; | 2052 | u64 spte; |
2058 | 2053 | ||
2054 | BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || | ||
2055 | VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); | ||
2056 | |||
2059 | spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | | 2057 | spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | |
2060 | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; | 2058 | shadow_user_mask | shadow_x_mask; |
2059 | |||
2060 | if (accessed) | ||
2061 | spte |= shadow_accessed_mask; | ||
2061 | 2062 | ||
2062 | mmu_spte_set(sptep, spte); | 2063 | mmu_spte_set(sptep, spte); |
2063 | } | 2064 | } |
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
2574 | mmu_free_roots(vcpu); | 2575 | mmu_free_roots(vcpu); |
2575 | } | 2576 | } |
2576 | 2577 | ||
2577 | static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) | ||
2578 | { | ||
2579 | int bit7; | ||
2580 | |||
2581 | bit7 = (gpte >> 7) & 1; | ||
2582 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; | ||
2583 | } | ||
2584 | |||
2585 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | 2578 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, |
2586 | bool no_dirty_log) | 2579 | bool no_dirty_log) |
2587 | { | 2580 | { |
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2594 | return gfn_to_pfn_memslot_atomic(slot, gfn); | 2587 | return gfn_to_pfn_memslot_atomic(slot, gfn); |
2595 | } | 2588 | } |
2596 | 2589 | ||
2597 | static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, | ||
2598 | struct kvm_mmu_page *sp, u64 *spte, | ||
2599 | u64 gpte) | ||
2600 | { | ||
2601 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) | ||
2602 | goto no_present; | ||
2603 | |||
2604 | if (!is_present_gpte(gpte)) | ||
2605 | goto no_present; | ||
2606 | |||
2607 | if (!(gpte & PT_ACCESSED_MASK)) | ||
2608 | goto no_present; | ||
2609 | |||
2610 | return false; | ||
2611 | |||
2612 | no_present: | ||
2613 | drop_spte(vcpu->kvm, spte); | ||
2614 | return true; | ||
2615 | } | ||
2616 | |||
2617 | static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | 2590 | static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, |
2618 | struct kvm_mmu_page *sp, | 2591 | struct kvm_mmu_page *sp, |
2619 | u64 *start, u64 *end) | 2592 | u64 *start, u64 *end) |
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2710 | iterator.level - 1, | 2683 | iterator.level - 1, |
2711 | 1, ACC_ALL, iterator.sptep); | 2684 | 1, ACC_ALL, iterator.sptep); |
2712 | 2685 | ||
2713 | link_shadow_page(iterator.sptep, sp); | 2686 | link_shadow_page(iterator.sptep, sp, true); |
2714 | } | 2687 | } |
2715 | } | 2688 | } |
2716 | return emulate; | 2689 | return emulate; |
@@ -2808,7 +2781,7 @@ exit: | |||
2808 | return ret; | 2781 | return ret; |
2809 | } | 2782 | } |
2810 | 2783 | ||
2811 | static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) | 2784 | static bool page_fault_can_be_fast(u32 error_code) |
2812 | { | 2785 | { |
2813 | /* | 2786 | /* |
2814 | * Do not fix the mmio spte with invalid generation number which | 2787 | * Do not fix the mmio spte with invalid generation number which |
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, | |||
2861 | bool ret = false; | 2834 | bool ret = false; |
2862 | u64 spte = 0ull; | 2835 | u64 spte = 0ull; |
2863 | 2836 | ||
2864 | if (!page_fault_can_be_fast(vcpu, error_code)) | 2837 | if (!page_fault_can_be_fast(error_code)) |
2865 | return false; | 2838 | return false; |
2866 | 2839 | ||
2867 | walk_shadow_page_lockless_begin(vcpu); | 2840 | walk_shadow_page_lockless_begin(vcpu); |
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
3209 | mmu_sync_roots(vcpu); | 3182 | mmu_sync_roots(vcpu); |
3210 | spin_unlock(&vcpu->kvm->mmu_lock); | 3183 | spin_unlock(&vcpu->kvm->mmu_lock); |
3211 | } | 3184 | } |
3185 | EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); | ||
3212 | 3186 | ||
3213 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, | 3187 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, |
3214 | u32 access, struct x86_exception *exception) | 3188 | u32 access, struct x86_exception *exception) |
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | |||
3478 | ++vcpu->stat.tlb_flush; | 3452 | ++vcpu->stat.tlb_flush; |
3479 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 3453 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
3480 | } | 3454 | } |
3455 | EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); | ||
3481 | 3456 | ||
3482 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 3457 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
3483 | { | 3458 | { |
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu) | |||
3501 | nonpaging_free(vcpu); | 3476 | nonpaging_free(vcpu); |
3502 | } | 3477 | } |
3503 | 3478 | ||
3504 | static inline void protect_clean_gpte(unsigned *access, unsigned gpte) | ||
3505 | { | ||
3506 | unsigned mask; | ||
3507 | |||
3508 | BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); | ||
3509 | |||
3510 | mask = (unsigned)~ACC_WRITE_MASK; | ||
3511 | /* Allow write access to dirty gptes */ | ||
3512 | mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; | ||
3513 | *access &= mask; | ||
3514 | } | ||
3515 | |||
3516 | static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, | 3479 | static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, |
3517 | unsigned access, int *nr_present) | 3480 | unsigned access, int *nr_present) |
3518 | { | 3481 | { |
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, | |||
3530 | return false; | 3493 | return false; |
3531 | } | 3494 | } |
3532 | 3495 | ||
3533 | static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) | ||
3534 | { | ||
3535 | unsigned access; | ||
3536 | |||
3537 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | ||
3538 | access &= ~(gpte >> PT64_NX_SHIFT); | ||
3539 | |||
3540 | return access; | ||
3541 | } | ||
3542 | |||
3543 | static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) | 3496 | static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) |
3544 | { | 3497 | { |
3545 | unsigned index; | 3498 | unsigned index; |
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp | |||
3549 | return mmu->last_pte_bitmap & (1 << index); | 3502 | return mmu->last_pte_bitmap & (1 << index); |
3550 | } | 3503 | } |
3551 | 3504 | ||
3505 | #define PTTYPE_EPT 18 /* arbitrary */ | ||
3506 | #define PTTYPE PTTYPE_EPT | ||
3507 | #include "paging_tmpl.h" | ||
3508 | #undef PTTYPE | ||
3509 | |||
3552 | #define PTTYPE 64 | 3510 | #define PTTYPE 64 |
3553 | #include "paging_tmpl.h" | 3511 | #include "paging_tmpl.h" |
3554 | #undef PTTYPE | 3512 | #undef PTTYPE |
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, | |||
3563 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | 3521 | int maxphyaddr = cpuid_maxphyaddr(vcpu); |
3564 | u64 exb_bit_rsvd = 0; | 3522 | u64 exb_bit_rsvd = 0; |
3565 | 3523 | ||
3524 | context->bad_mt_xwr = 0; | ||
3525 | |||
3566 | if (!context->nx) | 3526 | if (!context->nx) |
3567 | exb_bit_rsvd = rsvd_bits(63, 63); | 3527 | exb_bit_rsvd = rsvd_bits(63, 63); |
3568 | switch (context->root_level) { | 3528 | switch (context->root_level) { |
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, | |||
3618 | } | 3578 | } |
3619 | } | 3579 | } |
3620 | 3580 | ||
3621 | static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) | 3581 | static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, |
3582 | struct kvm_mmu *context, bool execonly) | ||
3583 | { | ||
3584 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
3585 | int pte; | ||
3586 | |||
3587 | context->rsvd_bits_mask[0][3] = | ||
3588 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); | ||
3589 | context->rsvd_bits_mask[0][2] = | ||
3590 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); | ||
3591 | context->rsvd_bits_mask[0][1] = | ||
3592 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); | ||
3593 | context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); | ||
3594 | |||
3595 | /* large page */ | ||
3596 | context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; | ||
3597 | context->rsvd_bits_mask[1][2] = | ||
3598 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); | ||
3599 | context->rsvd_bits_mask[1][1] = | ||
3600 | rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); | ||
3601 | context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; | ||
3602 | |||
3603 | for (pte = 0; pte < 64; pte++) { | ||
3604 | int rwx_bits = pte & 7; | ||
3605 | int mt = pte >> 3; | ||
3606 | if (mt == 0x2 || mt == 0x3 || mt == 0x7 || | ||
3607 | rwx_bits == 0x2 || rwx_bits == 0x6 || | ||
3608 | (rwx_bits == 0x4 && !execonly)) | ||
3609 | context->bad_mt_xwr |= (1ull << pte); | ||
3610 | } | ||
3611 | } | ||
3612 | |||
3613 | static void update_permission_bitmask(struct kvm_vcpu *vcpu, | ||
3614 | struct kvm_mmu *mmu, bool ept) | ||
3622 | { | 3615 | { |
3623 | unsigned bit, byte, pfec; | 3616 | unsigned bit, byte, pfec; |
3624 | u8 map; | 3617 | u8 map; |
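
The reset_rsvds_bits_mask_ept() hunk above precomputes a single 64-bit map, bad_mt_xwr, indexed by the low six bits of an EPT PTE (X/W/R in bits 2:0, memory type in bits 5:3), so the walker can spot a misconfigured entry with one AND. A standalone userspace sketch of the same encoding; the flagged memory types and permission combinations mirror the loop above, everything else is illustrative.

/* Rebuild the bad_mt_xwr map and test a PTE against it.
 * EPT PTE bits: R = bit 0, W = bit 1, X = bit 2, memory type = bits 5:3. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t build_bad_mt_xwr(bool execonly)
{
        uint64_t map = 0;
        int pte;

        for (pte = 0; pte < 64; pte++) {
                int rwx = pte & 7;      /* permission combination */
                int mt  = pte >> 3;     /* EPT memory type */

                if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||  /* reserved MTs */
                    rwx == 0x2 || rwx == 0x6 ||             /* W or WX without R */
                    (rwx == 0x4 && !execonly))              /* X-only needs the cap */
                        map |= 1ull << pte;
        }
        return map;
}

static bool ept_pte_misconfigured(uint64_t gpte, uint64_t bad_mt_xwr)
{
        return (bad_mt_xwr >> (gpte & 0x3f)) & 1;
}

int main(void)
{
        uint64_t map = build_bad_mt_xwr(false);

        /* write-only (W set, R clear) is always a misconfiguration */
        printf("W-only : %d\n", ept_pte_misconfigured(0x2, map));
        /* read+write with write-back memory type (6) is legal */
        printf("RW, WB : %d\n", ept_pte_misconfigured((6 << 3) | 0x3, map));
        return 0;
}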
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu | |||
3636 | w = bit & ACC_WRITE_MASK; | 3629 | w = bit & ACC_WRITE_MASK; |
3637 | u = bit & ACC_USER_MASK; | 3630 | u = bit & ACC_USER_MASK; |
3638 | 3631 | ||
3639 | /* Not really needed: !nx will cause pte.nx to fault */ | 3632 | if (!ept) { |
3640 | x |= !mmu->nx; | 3633 | /* Not really needed: !nx will cause pte.nx to fault */ |
3641 | /* Allow supervisor writes if !cr0.wp */ | 3634 | x |= !mmu->nx; |
3642 | w |= !is_write_protection(vcpu) && !uf; | 3635 | /* Allow supervisor writes if !cr0.wp */ |
3643 | /* Disallow supervisor fetches of user code if cr4.smep */ | 3636 | w |= !is_write_protection(vcpu) && !uf; |
3644 | x &= !(smep && u && !uf); | 3637 | /* Disallow supervisor fetches of user code if cr4.smep */ |
3638 | x &= !(smep && u && !uf); | ||
3639 | } else | ||
3640 | /* Not really needed: no U/S accesses on ept */ | ||
3641 | u = 1; | ||
3645 | 3642 | ||
3646 | fault = (ff && !x) || (uf && !u) || (wf && !w); | 3643 | fault = (ff && !x) || (uf && !u) || (wf && !w); |
3647 | map |= fault << bit; | 3644 | map |= fault << bit; |
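
update_permission_bitmask() now takes an ept flag: the non-EPT branch keeps the CR0.WP, SMEP and NX fixups, while EPT entries treat every access as a user access because EPT has no user/supervisor distinction. A hedged sketch of how such a per-error-code fault map can be built and queried; the PFEC and ACC_* constants below are illustrative placeholders, not the KVM definitions.

/* One byte of fault bits per error-code pattern, one bit per possible
 * R/W/X | U/S combination of the pte (illustrative encoding). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ACC_EXEC   1
#define ACC_WRITE  2
#define ACC_USER   4

#define PFEC_WRITE 1    /* write fault */
#define PFEC_USER  2    /* user-mode fault */
#define PFEC_FETCH 4    /* instruction fetch fault */

static uint8_t build_map(unsigned pfec, bool ept)
{
        uint8_t map = 0;
        unsigned bit;

        for (bit = 0; bit < 8; bit++) {
                bool wf = pfec & PFEC_WRITE, uf = pfec & PFEC_USER,
                     ff = pfec & PFEC_FETCH;
                bool x = bit & ACC_EXEC, w = bit & ACC_WRITE, u = bit & ACC_USER;

                if (ept)
                        u = true;       /* no U/S distinction on EPT */

                map |= ((ff && !x) || (uf && !u) || (wf && !w)) << bit;
        }
        return map;
}

int main(void)
{
        uint8_t m = build_map(PFEC_WRITE, true);
        unsigned pte_access = ACC_EXEC; /* execute-only EPT mapping */

        /* a write fault against an execute-only mapping must fault */
        printf("faults: %d\n", (m >> pte_access) & 1);
        return 0;
}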
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, | |||
3676 | context->root_level = level; | 3673 | context->root_level = level; |
3677 | 3674 | ||
3678 | reset_rsvds_bits_mask(vcpu, context); | 3675 | reset_rsvds_bits_mask(vcpu, context); |
3679 | update_permission_bitmask(vcpu, context); | 3676 | update_permission_bitmask(vcpu, context, false); |
3680 | update_last_pte_bitmap(vcpu, context); | 3677 | update_last_pte_bitmap(vcpu, context); |
3681 | 3678 | ||
3682 | ASSERT(is_pae(vcpu)); | 3679 | ASSERT(is_pae(vcpu)); |
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, | |||
3706 | context->root_level = PT32_ROOT_LEVEL; | 3703 | context->root_level = PT32_ROOT_LEVEL; |
3707 | 3704 | ||
3708 | reset_rsvds_bits_mask(vcpu, context); | 3705 | reset_rsvds_bits_mask(vcpu, context); |
3709 | update_permission_bitmask(vcpu, context); | 3706 | update_permission_bitmask(vcpu, context, false); |
3710 | update_last_pte_bitmap(vcpu, context); | 3707 | update_last_pte_bitmap(vcpu, context); |
3711 | 3708 | ||
3712 | context->new_cr3 = paging_new_cr3; | 3709 | context->new_cr3 = paging_new_cr3; |
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
3768 | context->gva_to_gpa = paging32_gva_to_gpa; | 3765 | context->gva_to_gpa = paging32_gva_to_gpa; |
3769 | } | 3766 | } |
3770 | 3767 | ||
3771 | update_permission_bitmask(vcpu, context); | 3768 | update_permission_bitmask(vcpu, context, false); |
3772 | update_last_pte_bitmap(vcpu, context); | 3769 | update_last_pte_bitmap(vcpu, context); |
3773 | 3770 | ||
3774 | return 0; | 3771 | return 0; |
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) | |||
3800 | } | 3797 | } |
3801 | EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); | 3798 | EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); |
3802 | 3799 | ||
3800 | int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, | ||
3801 | bool execonly) | ||
3802 | { | ||
3803 | ASSERT(vcpu); | ||
3804 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
3805 | |||
3806 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | ||
3807 | |||
3808 | context->nx = true; | ||
3809 | context->new_cr3 = paging_new_cr3; | ||
3810 | context->page_fault = ept_page_fault; | ||
3811 | context->gva_to_gpa = ept_gva_to_gpa; | ||
3812 | context->sync_page = ept_sync_page; | ||
3813 | context->invlpg = ept_invlpg; | ||
3814 | context->update_pte = ept_update_pte; | ||
3815 | context->free = paging_free; | ||
3816 | context->root_level = context->shadow_root_level; | ||
3817 | context->root_hpa = INVALID_PAGE; | ||
3818 | context->direct_map = false; | ||
3819 | |||
3820 | update_permission_bitmask(vcpu, context, true); | ||
3821 | reset_rsvds_bits_mask_ept(vcpu, context, execonly); | ||
3822 | |||
3823 | return 0; | ||
3824 | } | ||
3825 | EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); | ||
3826 | |||
3803 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) | 3827 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) |
3804 | { | 3828 | { |
3805 | int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); | 3829 | int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); |
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) | |||
3847 | g_context->gva_to_gpa = paging32_gva_to_gpa_nested; | 3871 | g_context->gva_to_gpa = paging32_gva_to_gpa_nested; |
3848 | } | 3872 | } |
3849 | 3873 | ||
3850 | update_permission_bitmask(vcpu, g_context); | 3874 | update_permission_bitmask(vcpu, g_context, false); |
3851 | update_last_pte_bitmap(vcpu, g_context); | 3875 | update_last_pte_bitmap(vcpu, g_context); |
3852 | 3876 | ||
3853 | return 0; | 3877 | return 0; |
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new) | |||
3923 | return true; | 3947 | return true; |
3924 | if ((old ^ new) & PT64_BASE_ADDR_MASK) | 3948 | if ((old ^ new) & PT64_BASE_ADDR_MASK) |
3925 | return true; | 3949 | return true; |
3926 | old ^= PT64_NX_MASK; | 3950 | old ^= shadow_nx_mask; |
3927 | new ^= PT64_NX_MASK; | 3951 | new ^= shadow_nx_mask; |
3928 | return (old & ~new & PT64_PERM_MASK) != 0; | 3952 | return (old & ~new & PT64_PERM_MASK) != 0; |
3929 | } | 3953 | } |
3930 | 3954 | ||
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, | |||
4182 | switch (er) { | 4206 | switch (er) { |
4183 | case EMULATE_DONE: | 4207 | case EMULATE_DONE: |
4184 | return 1; | 4208 | return 1; |
4185 | case EMULATE_DO_MMIO: | 4209 | case EMULATE_USER_EXIT: |
4186 | ++vcpu->stat.mmio_exits; | 4210 | ++vcpu->stat.mmio_exits; |
4187 | /* fall through */ | 4211 | /* fall through */ |
4188 | case EMULATE_FAIL: | 4212 | case EMULATE_FAIL: |
@@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) | |||
4390 | /* | 4414 | /* |
4391 | * The very rare case: if the generation-number is round, | 4415 | * The very rare case: if the generation-number is round, |
4392 | * zap all shadow pages. | 4416 | * zap all shadow pages. |
4393 | * | ||
4394 | * The max value is MMIO_MAX_GEN - 1 since it is not called | ||
4395 | * when mark memslot invalid. | ||
4396 | */ | 4417 | */ |
4397 | if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { | 4418 | if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) { |
4398 | printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); | 4419 | printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); |
4399 | kvm_mmu_invalidate_zap_all_pages(kvm); | 4420 | kvm_mmu_invalidate_zap_all_pages(kvm); |
4400 | } | 4421 | } |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b59c573aba7..77e044a0f5f7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -71,6 +71,8 @@ enum { | |||
71 | 71 | ||
72 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); | 72 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); |
73 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | 73 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); |
74 | int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, | ||
75 | bool execonly); | ||
74 | 76 | ||
75 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) | 77 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) |
76 | { | 78 | { |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7769699d48a8..043330159179 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -23,6 +23,13 @@ | |||
23 | * so the code in this file is compiled twice, once per pte size. | 23 | * so the code in this file is compiled twice, once per pte size. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | /* | ||
27 | * This is used to catch uses of the PT_GUEST_(DIRTY|ACCESS)_SHIFT macros | ||
28 | * that are not optimized away for the EPT paging type, which has no A/D bits. | ||
29 | */ | ||
30 | extern u64 __pure __using_nonexistent_pte_bit(void) | ||
31 | __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); | ||
32 | |||
26 | #if PTTYPE == 64 | 33 | #if PTTYPE == 64 |
27 | #define pt_element_t u64 | 34 | #define pt_element_t u64 |
28 | #define guest_walker guest_walker64 | 35 | #define guest_walker guest_walker64 |
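
The __compiletime_error() declaration above is a build-time trap: the function is never defined, so any PT_GUEST_(DIRTY|ACCESSED)_SHIFT use in the EPT instantiation that the optimizer cannot prove dead fails the build rather than miscomputing at run time. A small sketch of the same gcc trick under assumed names (not the kernel macros); it relies on dead-code elimination, so compile with optimization (e.g. gcc -O2), as the kernel always does.

/* Never-defined function with gcc's error attribute: if a call survives
 * dead-code elimination, the build fails with the given message. */
#include <stdio.h>

extern int using_nonexistent_pte_bit(void)
        __attribute__((error("wrong use of DIRTY/ACCESSED shift")));

#define GUEST_DIRTY_MASK  0     /* EPT flavor: no A/D bits */
#define GUEST_DIRTY_SHIFT (using_nonexistent_pte_bit())

static unsigned dirty_of(unsigned pte)
{
        /* guarded like the FNAME() helpers: when the mask is zero the
         * shift below is unreachable and the call is optimized away */
        if (!GUEST_DIRTY_MASK)
                return 0;
        return (pte >> GUEST_DIRTY_SHIFT) & 1;
}

int main(void)
{
        printf("%u\n", dirty_of(0x60));
        return 0;
}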
@@ -32,6 +39,10 @@ | |||
32 | #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) | 39 | #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) |
33 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | 40 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) |
34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | 41 | #define PT_LEVEL_BITS PT64_LEVEL_BITS |
42 | #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK | ||
43 | #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK | ||
44 | #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT | ||
45 | #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT | ||
35 | #ifdef CONFIG_X86_64 | 46 | #ifdef CONFIG_X86_64 |
36 | #define PT_MAX_FULL_LEVELS 4 | 47 | #define PT_MAX_FULL_LEVELS 4 |
37 | #define CMPXCHG cmpxchg | 48 | #define CMPXCHG cmpxchg |
@@ -49,7 +60,26 @@ | |||
49 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | 60 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) |
50 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | 61 | #define PT_LEVEL_BITS PT32_LEVEL_BITS |
51 | #define PT_MAX_FULL_LEVELS 2 | 62 | #define PT_MAX_FULL_LEVELS 2 |
63 | #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK | ||
64 | #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK | ||
65 | #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT | ||
66 | #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT | ||
52 | #define CMPXCHG cmpxchg | 67 | #define CMPXCHG cmpxchg |
68 | #elif PTTYPE == PTTYPE_EPT | ||
69 | #define pt_element_t u64 | ||
70 | #define guest_walker guest_walkerEPT | ||
71 | #define FNAME(name) ept_##name | ||
72 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
73 | #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) | ||
74 | #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) | ||
75 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
76 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | ||
77 | #define PT_GUEST_ACCESSED_MASK 0 | ||
78 | #define PT_GUEST_DIRTY_MASK 0 | ||
79 | #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() | ||
80 | #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() | ||
81 | #define CMPXCHG cmpxchg64 | ||
82 | #define PT_MAX_FULL_LEVELS 4 | ||
53 | #else | 83 | #else |
54 | #error Invalid PTTYPE value | 84 | #error Invalid PTTYPE value |
55 | #endif | 85 | #endif |
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) | |||
80 | return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; | 110 | return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; |
81 | } | 111 | } |
82 | 112 | ||
113 | static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) | ||
114 | { | ||
115 | unsigned mask; | ||
116 | |||
117 | /* the dirty bit is not supported, so there is no need to track it */ | ||
118 | if (!PT_GUEST_DIRTY_MASK) | ||
119 | return; | ||
120 | |||
121 | BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); | ||
122 | |||
123 | mask = (unsigned)~ACC_WRITE_MASK; | ||
124 | /* Allow write access to dirty gptes */ | ||
125 | mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & | ||
126 | PT_WRITABLE_MASK; | ||
127 | *access &= mask; | ||
128 | } | ||
129 | |||
130 | static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) | ||
131 | { | ||
132 | int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; | ||
133 | |||
134 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | | ||
135 | ((mmu->bad_mt_xwr & (1ull << low6)) != 0); | ||
136 | } | ||
137 | |||
138 | static inline int FNAME(is_present_gpte)(unsigned long pte) | ||
139 | { | ||
140 | #if PTTYPE != PTTYPE_EPT | ||
141 | return is_present_gpte(pte); | ||
142 | #else | ||
143 | return pte & 7; | ||
144 | #endif | ||
145 | } | ||
146 | |||
83 | static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | 147 | static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
84 | pt_element_t __user *ptep_user, unsigned index, | 148 | pt_element_t __user *ptep_user, unsigned index, |
85 | pt_element_t orig_pte, pt_element_t new_pte) | 149 | pt_element_t orig_pte, pt_element_t new_pte) |
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | |||
103 | return (ret != orig_pte); | 167 | return (ret != orig_pte); |
104 | } | 168 | } |
105 | 169 | ||
170 | static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | ||
171 | struct kvm_mmu_page *sp, u64 *spte, | ||
172 | u64 gpte) | ||
173 | { | ||
174 | if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) | ||
175 | goto no_present; | ||
176 | |||
177 | if (!FNAME(is_present_gpte)(gpte)) | ||
178 | goto no_present; | ||
179 | |||
180 | /* if the accessed bit is not supported, prefetch non-accessed gptes */ | ||
181 | if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) | ||
182 | goto no_present; | ||
183 | |||
184 | return false; | ||
185 | |||
186 | no_present: | ||
187 | drop_spte(vcpu->kvm, spte); | ||
188 | return true; | ||
189 | } | ||
190 | |||
191 | static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) | ||
192 | { | ||
193 | unsigned access; | ||
194 | #if PTTYPE == PTTYPE_EPT | ||
195 | access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | | ||
196 | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | | ||
197 | ACC_USER_MASK; | ||
198 | #else | ||
199 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | ||
200 | access &= ~(gpte >> PT64_NX_SHIFT); | ||
201 | #endif | ||
202 | |||
203 | return access; | ||
204 | } | ||
205 | |||
106 | static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, | 206 | static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, |
107 | struct kvm_mmu *mmu, | 207 | struct kvm_mmu *mmu, |
108 | struct guest_walker *walker, | 208 | struct guest_walker *walker, |
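
For the EPT instantiation, "present" simply means "any of the R/W/X bits set" (pte & 7), and FNAME(gpte_access) reads the EPT permission bits directly instead of the IA-32 W/U/NX layout. A standalone sketch of the EPT flavor; the EPT bit positions are architectural, the ACC_* values are illustrative.

/* EPT-flavor present/access helpers in standalone form.
 * EPT PTE: bit 0 = read, bit 1 = write, bit 2 = execute. */
#include <stdint.h>
#include <stdio.h>

#define EPT_READ        (1ull << 0)
#define EPT_WRITE       (1ull << 1)
#define EPT_EXEC        (1ull << 2)

#define ACC_EXEC_MASK   1
#define ACC_WRITE_MASK  2
#define ACC_USER_MASK   4

static int ept_present(uint64_t gpte)
{
        return (gpte & 7) != 0;         /* any of R/W/X makes it "present" */
}

static unsigned ept_access(uint64_t gpte)
{
        return ((gpte & EPT_WRITE) ? ACC_WRITE_MASK : 0) |
               ((gpte & EPT_EXEC)  ? ACC_EXEC_MASK  : 0) |
               ACC_USER_MASK;           /* EPT has no user/supervisor bit */
}

int main(void)
{
        uint64_t gpte = EPT_READ | EPT_EXEC;    /* read+execute mapping */

        printf("present: %d access: %u\n", ept_present(gpte), ept_access(gpte));
        return 0;
}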
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, | |||
114 | gfn_t table_gfn; | 214 | gfn_t table_gfn; |
115 | int ret; | 215 | int ret; |
116 | 216 | ||
217 | /* dirty/accessed bits are not supported, so no need to update them */ | ||
218 | if (!PT_GUEST_DIRTY_MASK) | ||
219 | return 0; | ||
220 | |||
117 | for (level = walker->max_level; level >= walker->level; --level) { | 221 | for (level = walker->max_level; level >= walker->level; --level) { |
118 | pte = orig_pte = walker->ptes[level - 1]; | 222 | pte = orig_pte = walker->ptes[level - 1]; |
119 | table_gfn = walker->table_gfn[level - 1]; | 223 | table_gfn = walker->table_gfn[level - 1]; |
120 | ptep_user = walker->ptep_user[level - 1]; | 224 | ptep_user = walker->ptep_user[level - 1]; |
121 | index = offset_in_page(ptep_user) / sizeof(pt_element_t); | 225 | index = offset_in_page(ptep_user) / sizeof(pt_element_t); |
122 | if (!(pte & PT_ACCESSED_MASK)) { | 226 | if (!(pte & PT_GUEST_ACCESSED_MASK)) { |
123 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); | 227 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); |
124 | pte |= PT_ACCESSED_MASK; | 228 | pte |= PT_GUEST_ACCESSED_MASK; |
125 | } | 229 | } |
126 | if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { | 230 | if (level == walker->level && write_fault && |
231 | !(pte & PT_GUEST_DIRTY_MASK)) { | ||
127 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); | 232 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); |
128 | pte |= PT_DIRTY_MASK; | 233 | pte |= PT_GUEST_DIRTY_MASK; |
129 | } | 234 | } |
130 | if (pte == orig_pte) | 235 | if (pte == orig_pte) |
131 | continue; | 236 | continue; |
@@ -170,7 +275,7 @@ retry_walk: | |||
170 | if (walker->level == PT32E_ROOT_LEVEL) { | 275 | if (walker->level == PT32E_ROOT_LEVEL) { |
171 | pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); | 276 | pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); |
172 | trace_kvm_mmu_paging_element(pte, walker->level); | 277 | trace_kvm_mmu_paging_element(pte, walker->level); |
173 | if (!is_present_gpte(pte)) | 278 | if (!FNAME(is_present_gpte)(pte)) |
174 | goto error; | 279 | goto error; |
175 | --walker->level; | 280 | --walker->level; |
176 | } | 281 | } |
@@ -179,7 +284,7 @@ retry_walk: | |||
179 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | 284 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || |
180 | (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); | 285 | (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); |
181 | 286 | ||
182 | accessed_dirty = PT_ACCESSED_MASK; | 287 | accessed_dirty = PT_GUEST_ACCESSED_MASK; |
183 | pt_access = pte_access = ACC_ALL; | 288 | pt_access = pte_access = ACC_ALL; |
184 | ++walker->level; | 289 | ++walker->level; |
185 | 290 | ||
@@ -215,17 +320,17 @@ retry_walk: | |||
215 | 320 | ||
216 | trace_kvm_mmu_paging_element(pte, walker->level); | 321 | trace_kvm_mmu_paging_element(pte, walker->level); |
217 | 322 | ||
218 | if (unlikely(!is_present_gpte(pte))) | 323 | if (unlikely(!FNAME(is_present_gpte)(pte))) |
219 | goto error; | 324 | goto error; |
220 | 325 | ||
221 | if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, | 326 | if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, |
222 | walker->level))) { | 327 | walker->level))) { |
223 | errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; | 328 | errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; |
224 | goto error; | 329 | goto error; |
225 | } | 330 | } |
226 | 331 | ||
227 | accessed_dirty &= pte; | 332 | accessed_dirty &= pte; |
228 | pte_access = pt_access & gpte_access(vcpu, pte); | 333 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); |
229 | 334 | ||
230 | walker->ptes[walker->level - 1] = pte; | 335 | walker->ptes[walker->level - 1] = pte; |
231 | } while (!is_last_gpte(mmu, walker->level, pte)); | 336 | } while (!is_last_gpte(mmu, walker->level, pte)); |
@@ -248,13 +353,15 @@ retry_walk: | |||
248 | walker->gfn = real_gpa >> PAGE_SHIFT; | 353 | walker->gfn = real_gpa >> PAGE_SHIFT; |
249 | 354 | ||
250 | if (!write_fault) | 355 | if (!write_fault) |
251 | protect_clean_gpte(&pte_access, pte); | 356 | FNAME(protect_clean_gpte)(&pte_access, pte); |
252 | else | 357 | else |
253 | /* | 358 | /* |
254 | * On a write fault, fold the dirty bit into accessed_dirty by | 359 | * On a write fault, fold the dirty bit into accessed_dirty. |
255 | * shifting it one place right. | 360 | * For modes without A/D bits support accessed_dirty will be |
361 | * always clear. | ||
256 | */ | 362 | */ |
257 | accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); | 363 | accessed_dirty &= pte >> |
364 | (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); | ||
258 | 365 | ||
259 | if (unlikely(!accessed_dirty)) { | 366 | if (unlikely(!accessed_dirty)) { |
260 | ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); | 367 | ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); |
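
The rewritten fold above relies on the dirty bit sitting one position above the accessed bit: shifting the leaf PTE right by (DIRTY_SHIFT - ACCESSED_SHIFT) lets one accumulated mask require "accessed at every level, and dirty at the leaf on a write fault". For A/D-less modes the mask starts at zero and the update path bails out early. A standalone worked example (x86 legacy bits: Accessed = bit 5, Dirty = bit 6).

/* Worked example of the dirty->accessed fold on a write fault. */
#include <stdint.h>
#include <stdio.h>

#define ACCESSED_SHIFT 5
#define DIRTY_SHIFT    6
#define ACCESSED_MASK  (1u << ACCESSED_SHIFT)
#define DIRTY_MASK     (1u << DIRTY_SHIFT)

static unsigned walk(const unsigned *ptes, int n, int write_fault)
{
        unsigned accessed_dirty = ACCESSED_MASK;
        int i;

        for (i = 0; i < n; i++)
                accessed_dirty &= ptes[i];      /* all levels must be Accessed */

        if (write_fault)
                /* shifting Dirty onto the Accessed position folds the
                 * "leaf must also be Dirty" requirement into the same mask */
                accessed_dirty &= ptes[n - 1] >> (DIRTY_SHIFT - ACCESSED_SHIFT);

        return accessed_dirty;  /* zero => the A/D bits need updating */
}

int main(void)
{
        unsigned ptes[2] = { ACCESSED_MASK, ACCESSED_MASK };   /* clean leaf */

        printf("read : %u\n", walk(ptes, 2, 0));        /* non-zero, no update */
        printf("write: %u\n", walk(ptes, 2, 1));        /* zero, must set Dirty */
        return 0;
}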
@@ -279,6 +386,25 @@ error: | |||
279 | walker->fault.vector = PF_VECTOR; | 386 | walker->fault.vector = PF_VECTOR; |
280 | walker->fault.error_code_valid = true; | 387 | walker->fault.error_code_valid = true; |
281 | walker->fault.error_code = errcode; | 388 | walker->fault.error_code = errcode; |
389 | |||
390 | #if PTTYPE == PTTYPE_EPT | ||
391 | /* | ||
392 | * Use PFERR_RSVD_MASK in error_code to tell if an EPT | ||
393 | * misconfiguration needs to be injected. The detection is | ||
394 | * done by is_rsvd_bits_set() above. | ||
395 | * | ||
396 | * We set up the value of exit_qualification to inject: | ||
397 | * [2:0] - Derived from [2:0] of the real exit_qualification at EPT violation | ||
398 | * [5:3] - Calculated by the page walk of the guest EPT page tables | ||
399 | * [8:7] - Derived from [8:7] of the real exit_qualification | ||
400 | * | ||
401 | * The other bits are set to 0. | ||
402 | */ | ||
403 | if (!(errcode & PFERR_RSVD_MASK)) { | ||
404 | vcpu->arch.exit_qualification &= 0x187; | ||
405 | vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; | ||
406 | } | ||
407 | #endif | ||
282 | walker->fault.address = addr; | 408 | walker->fault.address = addr; |
283 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; | 409 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; |
284 | 410 | ||
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, | |||
293 | access); | 419 | access); |
294 | } | 420 | } |
295 | 421 | ||
422 | #if PTTYPE != PTTYPE_EPT | ||
296 | static int FNAME(walk_addr_nested)(struct guest_walker *walker, | 423 | static int FNAME(walk_addr_nested)(struct guest_walker *walker, |
297 | struct kvm_vcpu *vcpu, gva_t addr, | 424 | struct kvm_vcpu *vcpu, gva_t addr, |
298 | u32 access) | 425 | u32 access) |
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, | |||
300 | return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, | 427 | return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, |
301 | addr, access); | 428 | addr, access); |
302 | } | 429 | } |
430 | #endif | ||
303 | 431 | ||
304 | static bool | 432 | static bool |
305 | FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 433 | FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
309 | gfn_t gfn; | 437 | gfn_t gfn; |
310 | pfn_t pfn; | 438 | pfn_t pfn; |
311 | 439 | ||
312 | if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) | 440 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
313 | return false; | 441 | return false; |
314 | 442 | ||
315 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 443 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
316 | 444 | ||
317 | gfn = gpte_to_gfn(gpte); | 445 | gfn = gpte_to_gfn(gpte); |
318 | pte_access = sp->role.access & gpte_access(vcpu, gpte); | 446 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
319 | protect_clean_gpte(&pte_access, gpte); | 447 | FNAME(protect_clean_gpte)(&pte_access, gpte); |
320 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, | 448 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, |
321 | no_dirty_log && (pte_access & ACC_WRITE_MASK)); | 449 | no_dirty_log && (pte_access & ACC_WRITE_MASK)); |
322 | if (is_error_pfn(pfn)) | 450 | if (is_error_pfn(pfn)) |
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
446 | goto out_gpte_changed; | 574 | goto out_gpte_changed; |
447 | 575 | ||
448 | if (sp) | 576 | if (sp) |
449 | link_shadow_page(it.sptep, sp); | 577 | link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); |
450 | } | 578 | } |
451 | 579 | ||
452 | for (; | 580 | for (; |
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
466 | 594 | ||
467 | sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, | 595 | sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, |
468 | true, direct_access, it.sptep); | 596 | true, direct_access, it.sptep); |
469 | link_shadow_page(it.sptep, sp); | 597 | link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); |
470 | } | 598 | } |
471 | 599 | ||
472 | clear_sp_write_flooding_count(it.sptep); | 600 | clear_sp_write_flooding_count(it.sptep); |
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, | |||
727 | return gpa; | 855 | return gpa; |
728 | } | 856 | } |
729 | 857 | ||
858 | #if PTTYPE != PTTYPE_EPT | ||
730 | static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | 859 | static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, |
731 | u32 access, | 860 | u32 access, |
732 | struct x86_exception *exception) | 861 | struct x86_exception *exception) |
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
745 | 874 | ||
746 | return gpa; | 875 | return gpa; |
747 | } | 876 | } |
877 | #endif | ||
748 | 878 | ||
749 | /* | 879 | /* |
750 | * Using the cached information from sp->gfns is safe because: | 880 | * Using the cached information from sp->gfns is safe because: |
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
785 | sizeof(pt_element_t))) | 915 | sizeof(pt_element_t))) |
786 | return -EINVAL; | 916 | return -EINVAL; |
787 | 917 | ||
788 | if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { | 918 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { |
789 | vcpu->kvm->tlbs_dirty++; | 919 | vcpu->kvm->tlbs_dirty++; |
790 | continue; | 920 | continue; |
791 | } | 921 | } |
792 | 922 | ||
793 | gfn = gpte_to_gfn(gpte); | 923 | gfn = gpte_to_gfn(gpte); |
794 | pte_access = sp->role.access; | 924 | pte_access = sp->role.access; |
795 | pte_access &= gpte_access(vcpu, gpte); | 925 | pte_access &= FNAME(gpte_access)(vcpu, gpte); |
796 | protect_clean_gpte(&pte_access, gpte); | 926 | FNAME(protect_clean_gpte)(&pte_access, gpte); |
797 | 927 | ||
798 | if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, | 928 | if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, |
799 | &nr_present)) | 929 | &nr_present)) |
@@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
830 | #undef gpte_to_gfn | 960 | #undef gpte_to_gfn |
831 | #undef gpte_to_gfn_lvl | 961 | #undef gpte_to_gfn_lvl |
832 | #undef CMPXCHG | 962 | #undef CMPXCHG |
963 | #undef PT_GUEST_ACCESSED_MASK | ||
964 | #undef PT_GUEST_DIRTY_MASK | ||
965 | #undef PT_GUEST_DIRTY_SHIFT | ||
966 | #undef PT_GUEST_ACCESSED_SHIFT | ||
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c53e797e7369..5c4f63151b4d 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c | |||
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc) | |||
160 | 160 | ||
161 | static void reprogram_counter(struct kvm_pmc *pmc, u32 type, | 161 | static void reprogram_counter(struct kvm_pmc *pmc, u32 type, |
162 | unsigned config, bool exclude_user, bool exclude_kernel, | 162 | unsigned config, bool exclude_user, bool exclude_kernel, |
163 | bool intr) | 163 | bool intr, bool in_tx, bool in_tx_cp) |
164 | { | 164 | { |
165 | struct perf_event *event; | 165 | struct perf_event *event; |
166 | struct perf_event_attr attr = { | 166 | struct perf_event_attr attr = { |
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, | |||
173 | .exclude_kernel = exclude_kernel, | 173 | .exclude_kernel = exclude_kernel, |
174 | .config = config, | 174 | .config = config, |
175 | }; | 175 | }; |
176 | if (in_tx) | ||
177 | attr.config |= HSW_IN_TX; | ||
178 | if (in_tx_cp) | ||
179 | attr.config |= HSW_IN_TX_CHECKPOINTED; | ||
176 | 180 | ||
177 | attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); | 181 | attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); |
178 | 182 | ||
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) | |||
226 | 230 | ||
227 | if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | | 231 | if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | |
228 | ARCH_PERFMON_EVENTSEL_INV | | 232 | ARCH_PERFMON_EVENTSEL_INV | |
229 | ARCH_PERFMON_EVENTSEL_CMASK))) { | 233 | ARCH_PERFMON_EVENTSEL_CMASK | |
234 | HSW_IN_TX | | ||
235 | HSW_IN_TX_CHECKPOINTED))) { | ||
230 | config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, | 236 | config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, |
231 | unit_mask); | 237 | unit_mask); |
232 | if (config != PERF_COUNT_HW_MAX) | 238 | if (config != PERF_COUNT_HW_MAX) |
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) | |||
239 | reprogram_counter(pmc, type, config, | 245 | reprogram_counter(pmc, type, config, |
240 | !(eventsel & ARCH_PERFMON_EVENTSEL_USR), | 246 | !(eventsel & ARCH_PERFMON_EVENTSEL_USR), |
241 | !(eventsel & ARCH_PERFMON_EVENTSEL_OS), | 247 | !(eventsel & ARCH_PERFMON_EVENTSEL_OS), |
242 | eventsel & ARCH_PERFMON_EVENTSEL_INT); | 248 | eventsel & ARCH_PERFMON_EVENTSEL_INT, |
249 | (eventsel & HSW_IN_TX), | ||
250 | (eventsel & HSW_IN_TX_CHECKPOINTED)); | ||
243 | } | 251 | } |
244 | 252 | ||
245 | static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) | 253 | static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) |
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) | |||
256 | arch_events[fixed_pmc_events[idx]].event_type, | 264 | arch_events[fixed_pmc_events[idx]].event_type, |
257 | !(en & 0x2), /* exclude user */ | 265 | !(en & 0x2), /* exclude user */ |
258 | !(en & 0x1), /* exclude kernel */ | 266 | !(en & 0x1), /* exclude kernel */ |
259 | pmi); | 267 | pmi, false, false); |
260 | } | 268 | } |
261 | 269 | ||
262 | static inline u8 fixed_en_pmi(u64 ctrl, int idx) | 270 | static inline u8 fixed_en_pmi(u64 ctrl, int idx) |
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | |||
408 | } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { | 416 | } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { |
409 | if (data == pmc->eventsel) | 417 | if (data == pmc->eventsel) |
410 | return 0; | 418 | return 0; |
411 | if (!(data & 0xffffffff00200000ull)) { | 419 | if (!(data & pmu->reserved_bits)) { |
412 | reprogram_gp_counter(pmc, data); | 420 | reprogram_gp_counter(pmc, data); |
413 | return 0; | 421 | return 0; |
414 | } | 422 | } |
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) | |||
450 | pmu->counter_bitmask[KVM_PMC_GP] = 0; | 458 | pmu->counter_bitmask[KVM_PMC_GP] = 0; |
451 | pmu->counter_bitmask[KVM_PMC_FIXED] = 0; | 459 | pmu->counter_bitmask[KVM_PMC_FIXED] = 0; |
452 | pmu->version = 0; | 460 | pmu->version = 0; |
461 | pmu->reserved_bits = 0xffffffff00200000ull; | ||
453 | 462 | ||
454 | entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); | 463 | entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); |
455 | if (!entry) | 464 | if (!entry) |
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) | |||
478 | pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | | 487 | pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | |
479 | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); | 488 | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); |
480 | pmu->global_ctrl_mask = ~pmu->global_ctrl; | 489 | pmu->global_ctrl_mask = ~pmu->global_ctrl; |
490 | |||
491 | entry = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
492 | if (entry && | ||
493 | (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && | ||
494 | (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) | ||
495 | pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; | ||
481 | } | 496 | } |
482 | 497 | ||
483 | void kvm_pmu_init(struct kvm_vcpu *vcpu) | 498 | void kvm_pmu_init(struct kvm_vcpu *vcpu) |
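
The PMU hunks let a guest set the Haswell TSX eventsel bits only when CPUID advertises HLE or RTM: reserved_bits starts with those bits marked reserved, and the XOR clears them when the feature is present, so the existing `data & pmu->reserved_bits` test keeps rejecting them otherwise. A standalone sketch of that gate (HSW_IN_TX and HSW_IN_TX_CHECKPOINTED are the architectural eventsel bits 32/33 named in the hunk; the struct and helpers are illustrative).

/* Reserved-bit gate on PERFEVTSEL writes. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HSW_IN_TX               (1ull << 32)
#define HSW_IN_TX_CHECKPOINTED  (1ull << 33)

struct pmu {
        uint64_t reserved_bits;
};

static void cpuid_update(struct pmu *pmu, bool has_hle_or_rtm)
{
        pmu->reserved_bits = 0xffffffff00200000ull;
        if (has_hle_or_rtm)
                /* XOR clears the two TSX bits that the default sets */
                pmu->reserved_bits ^= HSW_IN_TX | HSW_IN_TX_CHECKPOINTED;
}

static bool eventsel_write_ok(const struct pmu *pmu, uint64_t data)
{
        return (data & pmu->reserved_bits) == 0;
}

int main(void)
{
        struct pmu pmu;

        cpuid_update(&pmu, false);
        printf("no TSX : %d\n", eventsel_write_ok(&pmu, HSW_IN_TX | 0xc0));

        cpuid_update(&pmu, true);
        printf("TSX    : %d\n", eventsel_write_ok(&pmu, HSW_IN_TX | 0xc0));
        return 0;
}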
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 064d0be67ecc..1f1da43ff2a2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -373,6 +373,7 @@ struct nested_vmx { | |||
373 | * we must keep them pinned while L2 runs. | 373 | * we must keep them pinned while L2 runs. |
374 | */ | 374 | */ |
375 | struct page *apic_access_page; | 375 | struct page *apic_access_page; |
376 | u64 msr_ia32_feature_control; | ||
376 | }; | 377 | }; |
377 | 378 | ||
378 | #define POSTED_INTR_ON 0 | 379 | #define POSTED_INTR_ON 0 |
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page) | |||
711 | kvm_release_page_clean(page); | 712 | kvm_release_page_clean(page); |
712 | } | 713 | } |
713 | 714 | ||
715 | static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); | ||
714 | static u64 construct_eptp(unsigned long root_hpa); | 716 | static u64 construct_eptp(unsigned long root_hpa); |
715 | static void kvm_cpu_vmxon(u64 addr); | 717 | static void kvm_cpu_vmxon(u64 addr); |
716 | static void kvm_cpu_vmxoff(void); | 718 | static void kvm_cpu_vmxoff(void); |
717 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
718 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); | 719 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); |
719 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | 720 | static void vmx_set_segment(struct kvm_vcpu *vcpu, |
720 | struct kvm_segment *var, int seg); | 721 | struct kvm_segment *var, int seg); |
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) | |||
1039 | (vmcs12->secondary_vm_exec_control & bit); | 1040 | (vmcs12->secondary_vm_exec_control & bit); |
1040 | } | 1041 | } |
1041 | 1042 | ||
1042 | static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, | 1043 | static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) |
1043 | struct kvm_vcpu *vcpu) | ||
1044 | { | 1044 | { |
1045 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; | 1045 | return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; |
1046 | } | 1046 | } |
1047 | 1047 | ||
1048 | static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) | ||
1049 | { | ||
1050 | return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); | ||
1051 | } | ||
1052 | |||
1048 | static inline bool is_exception(u32 intr_info) | 1053 | static inline bool is_exception(u32 intr_info) |
1049 | { | 1054 | { |
1050 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | 1055 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) |
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; | |||
2155 | static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; | 2160 | static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; |
2156 | static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; | 2161 | static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; |
2157 | static u32 nested_vmx_misc_low, nested_vmx_misc_high; | 2162 | static u32 nested_vmx_misc_low, nested_vmx_misc_high; |
2163 | static u32 nested_vmx_ept_caps; | ||
2158 | static __init void nested_vmx_setup_ctls_msrs(void) | 2164 | static __init void nested_vmx_setup_ctls_msrs(void) |
2159 | { | 2165 | { |
2160 | /* | 2166 | /* |
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2190 | * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and | 2196 | * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and |
2191 | * 17 must be 1. | 2197 | * 17 must be 1. |
2192 | */ | 2198 | */ |
2199 | rdmsr(MSR_IA32_VMX_EXIT_CTLS, | ||
2200 | nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); | ||
2193 | nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | 2201 | nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; |
2194 | /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ | 2202 | /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ |
2203 | nested_vmx_exit_ctls_high &= | ||
2195 | #ifdef CONFIG_X86_64 | 2204 | #ifdef CONFIG_X86_64 |
2196 | nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; | 2205 | VM_EXIT_HOST_ADDR_SPACE_SIZE | |
2197 | #else | ||
2198 | nested_vmx_exit_ctls_high = 0; | ||
2199 | #endif | 2206 | #endif |
2200 | nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | 2207 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; |
2208 | nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | ||
2209 | VM_EXIT_LOAD_IA32_EFER); | ||
2201 | 2210 | ||
2202 | /* entry controls */ | 2211 | /* entry controls */ |
2203 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | 2212 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, |
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2205 | /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ | 2214 | /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ |
2206 | nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | 2215 | nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; |
2207 | nested_vmx_entry_ctls_high &= | 2216 | nested_vmx_entry_ctls_high &= |
2208 | VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; | 2217 | #ifdef CONFIG_X86_64 |
2209 | nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; | 2218 | VM_ENTRY_IA32E_MODE | |
2219 | #endif | ||
2220 | VM_ENTRY_LOAD_IA32_PAT; | ||
2221 | nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | | ||
2222 | VM_ENTRY_LOAD_IA32_EFER); | ||
2210 | 2223 | ||
2211 | /* cpu-based controls */ | 2224 | /* cpu-based controls */ |
2212 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, | 2225 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, |
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2241 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 2254 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
2242 | SECONDARY_EXEC_WBINVD_EXITING; | 2255 | SECONDARY_EXEC_WBINVD_EXITING; |
2243 | 2256 | ||
2257 | if (enable_ept) { | ||
2258 | /* nested EPT: emulate EPT also to L1 */ | ||
2259 | nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; | ||
2260 | nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | | ||
2261 | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; | ||
2262 | nested_vmx_ept_caps &= vmx_capability.ept; | ||
2263 | /* | ||
2264 | * Since invept is completely emulated, we support both global | ||
2265 | * and context invalidation independently of what the host cpu | ||
2266 | * supports. | ||
2267 | */ | ||
2268 | nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | | ||
2269 | VMX_EPT_EXTENT_CONTEXT_BIT; | ||
2270 | } else | ||
2271 | nested_vmx_ept_caps = 0; | ||
2272 | |||
2244 | /* miscellaneous data */ | 2273 | /* miscellaneous data */ |
2245 | rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); | 2274 | rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); |
2246 | nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | | 2275 | nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
2282 | 2311 | ||
2283 | switch (msr_index) { | 2312 | switch (msr_index) { |
2284 | case MSR_IA32_FEATURE_CONTROL: | 2313 | case MSR_IA32_FEATURE_CONTROL: |
2285 | *pdata = 0; | 2314 | if (nested_vmx_allowed(vcpu)) { |
2286 | break; | 2315 | *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control; |
2316 | break; | ||
2317 | } | ||
2318 | return 0; | ||
2287 | case MSR_IA32_VMX_BASIC: | 2319 | case MSR_IA32_VMX_BASIC: |
2288 | /* | 2320 | /* |
2289 | * This MSR reports some information about VMX support. We | 2321 | * This MSR reports some information about VMX support. We |
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
2346 | nested_vmx_secondary_ctls_high); | 2378 | nested_vmx_secondary_ctls_high); |
2347 | break; | 2379 | break; |
2348 | case MSR_IA32_VMX_EPT_VPID_CAP: | 2380 | case MSR_IA32_VMX_EPT_VPID_CAP: |
2349 | /* Currently, no nested ept or nested vpid */ | 2381 | /* Currently, no nested vpid support */ |
2350 | *pdata = 0; | 2382 | *pdata = nested_vmx_ept_caps; |
2351 | break; | 2383 | break; |
2352 | default: | 2384 | default: |
2353 | return 0; | 2385 | return 0; |
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
2356 | return 1; | 2388 | return 1; |
2357 | } | 2389 | } |
2358 | 2390 | ||
2359 | static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | 2391 | static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
2360 | { | 2392 | { |
2393 | u32 msr_index = msr_info->index; | ||
2394 | u64 data = msr_info->data; | ||
2395 | bool host_initialized = msr_info->host_initiated; | ||
2396 | |||
2361 | if (!nested_vmx_allowed(vcpu)) | 2397 | if (!nested_vmx_allowed(vcpu)) |
2362 | return 0; | 2398 | return 0; |
2363 | 2399 | ||
2364 | if (msr_index == MSR_IA32_FEATURE_CONTROL) | 2400 | if (msr_index == MSR_IA32_FEATURE_CONTROL) { |
2365 | /* TODO: the right thing. */ | 2401 | if (!host_initialized && |
2402 | to_vmx(vcpu)->nested.msr_ia32_feature_control | ||
2403 | & FEATURE_CONTROL_LOCKED) | ||
2404 | return 0; | ||
2405 | to_vmx(vcpu)->nested.msr_ia32_feature_control = data; | ||
2366 | return 1; | 2406 | return 1; |
2407 | } | ||
2408 | |||
2367 | /* | 2409 | /* |
2368 | * No need to treat VMX capability MSRs specially: If we don't handle | 2410 | * No need to treat VMX capability MSRs specially: If we don't handle |
2369 | * them, handle_wrmsr will #GP(0), which is correct (they are readonly) | 2411 | * them, handle_wrmsr will #GP(0), which is correct (they are readonly) |
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | |||
2494 | return 1; | 2536 | return 1; |
2495 | /* Otherwise falls through */ | 2537 | /* Otherwise falls through */ |
2496 | default: | 2538 | default: |
2497 | if (vmx_set_vmx_msr(vcpu, msr_index, data)) | 2539 | if (vmx_set_vmx_msr(vcpu, msr_info)) |
2498 | break; | 2540 | break; |
2499 | msr = find_msr_entry(vmx, msr_index); | 2541 | msr = find_msr_entry(vmx, msr_index); |
2500 | if (msr) { | 2542 | if (msr) { |
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
5302 | 5344 | ||
5303 | /* It is a write fault? */ | 5345 | /* It is a write fault? */ |
5304 | error_code = exit_qualification & (1U << 1); | 5346 | error_code = exit_qualification & (1U << 1); |
5347 | /* It is a fetch fault? */ | ||
5348 | error_code |= (exit_qualification & (1U << 2)) << 2; | ||
5305 | /* ept page table is present? */ | 5349 | /* ept page table is present? */ |
5306 | error_code |= (exit_qualification >> 3) & 0x1; | 5350 | error_code |= (exit_qualification >> 3) & 0x1; |
5307 | 5351 | ||
5352 | vcpu->arch.exit_qualification = exit_qualification; | ||
5353 | |||
5308 | return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); | 5354 | return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); |
5309 | } | 5355 | } |
5310 | 5356 | ||
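
handle_ept_violation() now also folds the instruction-fetch bit of the exit qualification into the synthesized page-fault error code: qualification bit 1 (write) stays at bit 1, bit 2 (fetch) is shifted up to the fetch position (bit 4), and bit 3 (translation present) lands in bit 0. A standalone restatement of that mapping.

/* Fold an EPT-violation exit qualification into #PF-style error-code bits. */
#include <stdint.h>
#include <stdio.h>

static uint32_t ept_exitqual_to_error_code(uint64_t exit_qualification)
{
        uint32_t error_code;

        error_code  = exit_qualification & (1u << 1);         /* write fault */
        error_code |= (exit_qualification & (1u << 2)) << 2;  /* fetch -> bit 4 */
        error_code |= (exit_qualification >> 3) & 0x1;        /* page present */
        return error_code;
}

int main(void)
{
        /* fetch fault (bit 2) on a present translation (bit 3) */
        printf("0x%x\n", ept_exitqual_to_error_code((1u << 2) | (1u << 3)));
        return 0;
}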
@@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
5438 | 5484 | ||
5439 | err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); | 5485 | err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); |
5440 | 5486 | ||
5441 | if (err == EMULATE_DO_MMIO) { | 5487 | if (err == EMULATE_USER_EXIT) { |
5488 | ++vcpu->stat.mmio_exits; | ||
5442 | ret = 0; | 5489 | ret = 0; |
5443 | goto out; | 5490 | goto out; |
5444 | } | 5491 | } |
@@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) | |||
5567 | free_loaded_vmcs(&vmx->vmcs01); | 5614 | free_loaded_vmcs(&vmx->vmcs01); |
5568 | } | 5615 | } |
5569 | 5616 | ||
5617 | /* | ||
5618 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), | ||
5619 | * set the success or error code of an emulated VMX instruction, as specified | ||
5620 | * by Vol 2B, VMX Instruction Reference, "Conventions". | ||
5621 | */ | ||
5622 | static void nested_vmx_succeed(struct kvm_vcpu *vcpu) | ||
5623 | { | ||
5624 | vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | ||
5625 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5626 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); | ||
5627 | } | ||
5628 | |||
5629 | static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) | ||
5630 | { | ||
5631 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5632 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | | ||
5633 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5634 | | X86_EFLAGS_CF); | ||
5635 | } | ||
5636 | |||
5570 | static void nested_vmx_failValid(struct kvm_vcpu *vcpu, | 5637 | static void nested_vmx_failValid(struct kvm_vcpu *vcpu, |
5571 | u32 vm_instruction_error); | 5638 | u32 vm_instruction_error) |
5639 | { | ||
5640 | if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { | ||
5641 | /* | ||
5642 | * failValid writes the error number to the current VMCS, which | ||
5643 | * can't be done if there isn't a current VMCS. | ||
5644 | */ | ||
5645 | nested_vmx_failInvalid(vcpu); | ||
5646 | return; | ||
5647 | } | ||
5648 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5649 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5650 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5651 | | X86_EFLAGS_ZF); | ||
5652 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; | ||
5653 | /* | ||
5654 | * We don't need to force a shadow sync because | ||
5655 | * VM_INSTRUCTION_ERROR is not shadowed | ||
5656 | */ | ||
5657 | } | ||
5572 | 5658 | ||
5573 | /* | 5659 | /* |
5574 | * Emulate the VMXON instruction. | 5660 | * Emulate the VMXON instruction. |
@@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | |||
5583 | struct kvm_segment cs; | 5669 | struct kvm_segment cs; |
5584 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 5670 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
5585 | struct vmcs *shadow_vmcs; | 5671 | struct vmcs *shadow_vmcs; |
5672 | const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | ||
5673 | | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; | ||
5586 | 5674 | ||
5587 | /* The Intel VMX Instruction Reference lists a bunch of bits that | 5675 | /* The Intel VMX Instruction Reference lists a bunch of bits that |
5588 | * are prerequisite to running VMXON, most notably cr4.VMXE must be | 5676 | * are prerequisite to running VMXON, most notably cr4.VMXE must be |
@@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | |||
5611 | skip_emulated_instruction(vcpu); | 5699 | skip_emulated_instruction(vcpu); |
5612 | return 1; | 5700 | return 1; |
5613 | } | 5701 | } |
5702 | |||
5703 | if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) | ||
5704 | != VMXON_NEEDED_FEATURES) { | ||
5705 | kvm_inject_gp(vcpu, 0); | ||
5706 | return 1; | ||
5707 | } | ||
5708 | |||
5614 | if (enable_shadow_vmcs) { | 5709 | if (enable_shadow_vmcs) { |
5615 | shadow_vmcs = alloc_vmcs(); | 5710 | shadow_vmcs = alloc_vmcs(); |
5616 | if (!shadow_vmcs) | 5711 | if (!shadow_vmcs) |
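
Taken together, the vmx_set_vmx_msr() and handle_vmon() hunks model IA32_FEATURE_CONTROL for L1: guest writes are refused once the lock bit is set (unless host-initiated), and VMXON fails unless both the lock bit and the "VMX outside SMX" enable bit are present, matching the #GP real hardware raises. A hedged standalone model of that state machine; the bit positions follow the architectural MSR layout, everything else is illustrative.

/* Toy model of the IA32_FEATURE_CONTROL gating added above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FC_LOCKED               (1ull << 0)
#define FC_VMXON_OUTSIDE_SMX    (1ull << 2)

struct nested_state {
        uint64_t feature_control;
};

/* guest WRMSR: refused once the lock bit is set; host writes may override */
static bool wrmsr_feature_control(struct nested_state *n, uint64_t data,
                                  bool host_initiated)
{
        if (!host_initiated && (n->feature_control & FC_LOCKED))
                return false;   /* real hardware would #GP here */
        n->feature_control = data;
        return true;
}

/* VMXON: needs both the lock bit and the outside-SMX enable bit */
static bool vmxon_allowed(const struct nested_state *n)
{
        const uint64_t needed = FC_LOCKED | FC_VMXON_OUTSIDE_SMX;

        return (n->feature_control & needed) == needed;
}

int main(void)
{
        struct nested_state n = { 0 };

        printf("before wrmsr: %d\n", vmxon_allowed(&n));
        wrmsr_feature_control(&n, FC_LOCKED | FC_VMXON_OUTSIDE_SMX, false);
        printf("after wrmsr : %d\n", vmxon_allowed(&n));
        printf("relock write: %d\n", wrmsr_feature_control(&n, 0, false));
        return 0;
}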
@@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | |||
5628 | vmx->nested.vmxon = true; | 5723 | vmx->nested.vmxon = true; |
5629 | 5724 | ||
5630 | skip_emulated_instruction(vcpu); | 5725 | skip_emulated_instruction(vcpu); |
5726 | nested_vmx_succeed(vcpu); | ||
5631 | return 1; | 5727 | return 1; |
5632 | } | 5728 | } |
5633 | 5729 | ||
@@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) | |||
5712 | return 1; | 5808 | return 1; |
5713 | free_nested(to_vmx(vcpu)); | 5809 | free_nested(to_vmx(vcpu)); |
5714 | skip_emulated_instruction(vcpu); | 5810 | skip_emulated_instruction(vcpu); |
5811 | nested_vmx_succeed(vcpu); | ||
5715 | return 1; | 5812 | return 1; |
5716 | } | 5813 | } |
5717 | 5814 | ||
@@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, | |||
5768 | return 0; | 5865 | return 0; |
5769 | } | 5866 | } |
5770 | 5867 | ||
5771 | /* | ||
5772 | * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), | ||
5773 | * set the success or error code of an emulated VMX instruction, as specified | ||
5774 | * by Vol 2B, VMX Instruction Reference, "Conventions". | ||
5775 | */ | ||
5776 | static void nested_vmx_succeed(struct kvm_vcpu *vcpu) | ||
5777 | { | ||
5778 | vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | ||
5779 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5780 | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); | ||
5781 | } | ||
5782 | |||
5783 | static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) | ||
5784 | { | ||
5785 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5786 | & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | | ||
5787 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5788 | | X86_EFLAGS_CF); | ||
5789 | } | ||
5790 | |||
5791 | static void nested_vmx_failValid(struct kvm_vcpu *vcpu, | ||
5792 | u32 vm_instruction_error) | ||
5793 | { | ||
5794 | if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { | ||
5795 | /* | ||
5796 | * failValid writes the error number to the current VMCS, which | ||
5797 | * can't be done there isn't a current VMCS. | ||
5798 | */ | ||
5799 | nested_vmx_failInvalid(vcpu); | ||
5800 | return; | ||
5801 | } | ||
5802 | vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) | ||
5803 | & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | | ||
5804 | X86_EFLAGS_SF | X86_EFLAGS_OF)) | ||
5805 | | X86_EFLAGS_ZF); | ||
5806 | get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; | ||
5807 | /* | ||
5808 | * We don't need to force a shadow sync because | ||
5809 | * VM_INSTRUCTION_ERROR is not shadowed | ||
5810 | */ | ||
5811 | } | ||
5812 | |||
5813 | /* Emulate the VMCLEAR instruction */ | 5868 | /* Emulate the VMCLEAR instruction */ |
5814 | static int handle_vmclear(struct kvm_vcpu *vcpu) | 5869 | static int handle_vmclear(struct kvm_vcpu *vcpu) |
5815 | { | 5870 | { |
@@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) | |||
5972 | unsigned long field; | 6027 | unsigned long field; |
5973 | u64 field_value; | 6028 | u64 field_value; |
5974 | struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; | 6029 | struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; |
5975 | unsigned long *fields = (unsigned long *)shadow_read_write_fields; | 6030 | const unsigned long *fields = shadow_read_write_fields; |
5976 | int num_fields = max_shadow_read_write_fields; | 6031 | const int num_fields = max_shadow_read_write_fields; |
5977 | 6032 | ||
5978 | vmcs_load(shadow_vmcs); | 6033 | vmcs_load(shadow_vmcs); |
5979 | 6034 | ||
@@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) | |||
6002 | 6057 | ||
6003 | static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) | 6058 | static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) |
6004 | { | 6059 | { |
6005 | unsigned long *fields[] = { | 6060 | const unsigned long *fields[] = { |
6006 | (unsigned long *)shadow_read_write_fields, | 6061 | shadow_read_write_fields, |
6007 | (unsigned long *)shadow_read_only_fields | 6062 | shadow_read_only_fields |
6008 | }; | 6063 | }; |
6009 | int num_lists = ARRAY_SIZE(fields); | 6064 | const int max_fields[] = { |
6010 | int max_fields[] = { | ||
6011 | max_shadow_read_write_fields, | 6065 | max_shadow_read_write_fields, |
6012 | max_shadow_read_only_fields | 6066 | max_shadow_read_only_fields |
6013 | }; | 6067 | }; |
@@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) | |||
6018 | 6072 | ||
6019 | vmcs_load(shadow_vmcs); | 6073 | vmcs_load(shadow_vmcs); |
6020 | 6074 | ||
6021 | for (q = 0; q < num_lists; q++) { | 6075 | for (q = 0; q < ARRAY_SIZE(fields); q++) { |
6022 | for (i = 0; i < max_fields[q]; i++) { | 6076 | for (i = 0; i < max_fields[q]; i++) { |
6023 | field = fields[q][i]; | 6077 | field = fields[q][i]; |
6024 | vmcs12_read_any(&vmx->vcpu, field, &field_value); | 6078 | vmcs12_read_any(&vmx->vcpu, field, &field_value); |
@@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) | |||
6248 | return 1; | 6302 | return 1; |
6249 | } | 6303 | } |
6250 | 6304 | ||
6305 | /* Emulate the INVEPT instruction */ | ||
6306 | static int handle_invept(struct kvm_vcpu *vcpu) | ||
6307 | { | ||
6308 | u32 vmx_instruction_info, types; | ||
6309 | unsigned long type; | ||
6310 | gva_t gva; | ||
6311 | struct x86_exception e; | ||
6312 | struct { | ||
6313 | u64 eptp, gpa; | ||
6314 | } operand; | ||
6315 | u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; | ||
6316 | |||
6317 | if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || | ||
6318 | !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { | ||
6319 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
6320 | return 1; | ||
6321 | } | ||
6322 | |||
6323 | if (!nested_vmx_check_permission(vcpu)) | ||
6324 | return 1; | ||
6325 | |||
6326 | if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { | ||
6327 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
6328 | return 1; | ||
6329 | } | ||
6330 | |||
6331 | vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); | ||
6332 | type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); | ||
6333 | |||
6334 | types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; | ||
6335 | |||
6336 | if (!(types & (1UL << type))) { | ||
6337 | nested_vmx_failValid(vcpu, | ||
6338 | VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); | ||
6339 | return 1; | ||
6340 | } | ||
6341 | |||
6342 | /* According to the Intel VMX instruction reference, the memory | ||
6343 | * operand is read even if it isn't needed (e.g., for type==global) | ||
6344 | */ | ||
6345 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
6346 | vmx_instruction_info, &gva)) | ||
6347 | return 1; | ||
6348 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, | ||
6349 | sizeof(operand), &e)) { | ||
6350 | kvm_inject_page_fault(vcpu, &e); | ||
6351 | return 1; | ||
6352 | } | ||
6353 | |||
6354 | switch (type) { | ||
6355 | case VMX_EPT_EXTENT_CONTEXT: | ||
6356 | if ((operand.eptp & eptp_mask) != | ||
6357 | (nested_ept_get_cr3(vcpu) & eptp_mask)) | ||
6358 | break; | ||
6359 | case VMX_EPT_EXTENT_GLOBAL: | ||
6360 | kvm_mmu_sync_roots(vcpu); | ||
6361 | kvm_mmu_flush_tlb(vcpu); | ||
6362 | nested_vmx_succeed(vcpu); | ||
6363 | break; | ||
6364 | default: | ||
6365 | BUG_ON(1); | ||
6366 | break; | ||
6367 | } | ||
6368 | |||
6369 | skip_emulated_instruction(vcpu); | ||
6370 | return 1; | ||
6371 | } | ||
6372 | |||
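handle_invept() accepts an extent type only when the corresponding capability bit is advertised: the supported-type mask is carved out of the EPT/VPID capability word and indexed by the type the guest supplied in a register. A small stand-alone sketch of just that check; the constant values here are assumptions mirroring the layout this code relies on (single-context and global extents reported at bits 25 and 26 of the capability word, i.e. a shift of 24), not values copied from the headers:

#include <stdio.h>
#include <stdint.h>

/* Assumed values, matching the layout the code above relies on. */
#define VMX_EPT_EXTENT_SHIFT	24
#define VMX_EPT_EXTENT_CONTEXT	1UL
#define VMX_EPT_EXTENT_GLOBAL	2UL

/* Return 1 if the requested INVEPT type is advertised in the caps word. */
static int invept_type_supported(uint32_t ept_caps, unsigned long type)
{
	uint32_t types = (ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	return (types & (1UL << type)) != 0;
}

int main(void)
{
	/* Caps word advertising only global-context INVEPT (bit 26 set). */
	uint32_t caps = 1u << 26;

	printf("single-context supported: %d\n",
	       invept_type_supported(caps, VMX_EPT_EXTENT_CONTEXT));
	printf("global supported:         %d\n",
	       invept_type_supported(caps, VMX_EPT_EXTENT_GLOBAL));
	return 0;
}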
6251 | /* | 6373 | /* |
6252 | * The exit handlers return 1 if the exit was handled fully and guest execution | 6374 | * The exit handlers return 1 if the exit was handled fully and guest execution |
6253 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 6375 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
@@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
6292 | [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, | 6414 | [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, |
6293 | [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, | 6415 | [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, |
6294 | [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, | 6416 | [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, |
6417 | [EXIT_REASON_INVEPT] = handle_invept, | ||
6295 | }; | 6418 | }; |
6296 | 6419 | ||
6297 | static const int kvm_vmx_max_exit_handlers = | 6420 | static const int kvm_vmx_max_exit_handlers = |
@@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
6518 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: | 6641 | case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: |
6519 | case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: | 6642 | case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: |
6520 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: | 6643 | case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: |
6644 | case EXIT_REASON_INVEPT: | ||
6521 | /* | 6645 | /* |
6522 | * VMX instructions trap unconditionally. This allows L1 to | 6646 | * VMX instructions trap unconditionally. This allows L1 to |
6523 | * emulate them for its L2 guest, i.e., allows 3-level nesting! | 6647 | * emulate them for its L2 guest, i.e., allows 3-level nesting! |
@@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
6550 | return nested_cpu_has2(vmcs12, | 6674 | return nested_cpu_has2(vmcs12, |
6551 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | 6675 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); |
6552 | case EXIT_REASON_EPT_VIOLATION: | 6676 | case EXIT_REASON_EPT_VIOLATION: |
6677 | /* | ||
6678 | * L0 always deals with the EPT violation. If nested EPT is | ||
6679 | * used, and the nested mmu code discovers that the address is | ||
6680 | * missing in the guest EPT table (EPT12), the EPT violation | ||
6681 | * will be injected with nested_ept_inject_page_fault() | ||
6682 | */ | ||
6683 | return 0; | ||
6553 | case EXIT_REASON_EPT_MISCONFIG: | 6684 | case EXIT_REASON_EPT_MISCONFIG: |
6685 | /* | ||
6686 | * L2 never directly uses L1's EPT, but rather L0's own EPT | ||
6687 | * table (shadow on EPT) or a merged EPT table that L0 built | ||
6688 | * (EPT on EPT). So any problems with the structure of the | ||
6689 | * table are L0's fault. | ||
6690 | */ | ||
6554 | return 0; | 6691 | return 0; |
6555 | case EXIT_REASON_PREEMPTION_TIMER: | 6692 | case EXIT_REASON_PREEMPTION_TIMER: |
6556 | return vmcs12->pin_based_vm_exec_control & | 6693 | return vmcs12->pin_based_vm_exec_control & |
@@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
6638 | 6775 | ||
6639 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && | 6776 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && |
6640 | !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( | 6777 | !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( |
6641 | get_vmcs12(vcpu), vcpu)))) { | 6778 | get_vmcs12(vcpu))))) { |
6642 | if (vmx_interrupt_allowed(vcpu)) { | 6779 | if (vmx_interrupt_allowed(vcpu)) { |
6643 | vmx->soft_vnmi_blocked = 0; | 6780 | vmx->soft_vnmi_blocked = 0; |
6644 | } else if (vmx->vnmi_blocked_time > 1000000000LL && | 6781 | } else if (vmx->vnmi_blocked_time > 1000000000LL && |
@@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | |||
7326 | entry->ecx |= bit(X86_FEATURE_VMX); | 7463 | entry->ecx |= bit(X86_FEATURE_VMX); |
7327 | } | 7464 | } |
7328 | 7465 | ||
7466 | static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, | ||
7467 | struct x86_exception *fault) | ||
7468 | { | ||
7469 | struct vmcs12 *vmcs12; | ||
7470 | nested_vmx_vmexit(vcpu); | ||
7471 | vmcs12 = get_vmcs12(vcpu); | ||
7472 | |||
7473 | if (fault->error_code & PFERR_RSVD_MASK) | ||
7474 | vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; | ||
7475 | else | ||
7476 | vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION; | ||
7477 | vmcs12->exit_qualification = vcpu->arch.exit_qualification; | ||
7478 | vmcs12->guest_physical_address = fault->address; | ||
7479 | } | ||
7480 | |||
7481 | /* Callbacks for nested_ept_init_mmu_context: */ | ||
7482 | |||
7483 | static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) | ||
7484 | { | ||
7485 | /* return the page table to be shadowed - in our case, EPT12 */ | ||
7486 | return get_vmcs12(vcpu)->ept_pointer; | ||
7487 | } | ||
7488 | |||
7489 | static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) | ||
7490 | { | ||
7491 | int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, | ||
7492 | nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); | ||
7493 | |||
7494 | vcpu->arch.mmu.set_cr3 = vmx_set_cr3; | ||
7495 | vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; | ||
7496 | vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; | ||
7497 | |||
7498 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; | ||
7499 | |||
7500 | return r; | ||
7501 | } | ||
7502 | |||
7503 | static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) | ||
7504 | { | ||
7505 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | ||
7506 | } | ||
7507 | |||
7329 | /* | 7508 | /* |
7330 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | 7509 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested |
7331 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | 7510 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it |
@@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7388 | vmcs12->guest_interruptibility_info); | 7567 | vmcs12->guest_interruptibility_info); |
7389 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); | 7568 | vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); |
7390 | kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); | 7569 | kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); |
7391 | vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); | 7570 | vmx_set_rflags(vcpu, vmcs12->guest_rflags); |
7392 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, | 7571 | vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, |
7393 | vmcs12->guest_pending_dbg_exceptions); | 7572 | vmcs12->guest_pending_dbg_exceptions); |
7394 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); | 7573 | vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); |
@@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7508 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; | 7687 | vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; |
7509 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); | 7688 | vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); |
7510 | 7689 | ||
7511 | /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ | 7690 | /* L2->L1 exit controls are emulated - the hardware exit is to L0 so |
7512 | vmcs_write32(VM_EXIT_CONTROLS, | 7691 | * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER |
7513 | vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); | 7692 | * bits are further modified by vmx_set_efer() below. |
7514 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | | 7693 | */ |
7694 | vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); | ||
7695 | |||
7696 | /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are | ||
7697 | * emulated by vmx_set_efer(), below. | ||
7698 | */ | ||
7699 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
7700 | (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & | ||
7701 | ~VM_ENTRY_IA32E_MODE) | | ||
7515 | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); | 7702 | (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); |
7516 | 7703 | ||
7517 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) | 7704 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { |
7518 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); | 7705 | vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); |
7519 | else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) | 7706 | vcpu->arch.pat = vmcs12->guest_ia32_pat; |
7707 | } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) | ||
7520 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); | 7708 | vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); |
7521 | 7709 | ||
7522 | 7710 | ||
@@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7538 | vmx_flush_tlb(vcpu); | 7726 | vmx_flush_tlb(vcpu); |
7539 | } | 7727 | } |
7540 | 7728 | ||
7729 | if (nested_cpu_has_ept(vmcs12)) { | ||
7730 | kvm_mmu_unload(vcpu); | ||
7731 | nested_ept_init_mmu_context(vcpu); | ||
7732 | } | ||
7733 | |||
7541 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) | 7734 | if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) |
7542 | vcpu->arch.efer = vmcs12->guest_ia32_efer; | 7735 | vcpu->arch.efer = vmcs12->guest_ia32_efer; |
7543 | else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) | 7736 | else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) |
@@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7565 | kvm_set_cr3(vcpu, vmcs12->guest_cr3); | 7758 | kvm_set_cr3(vcpu, vmcs12->guest_cr3); |
7566 | kvm_mmu_reset_context(vcpu); | 7759 | kvm_mmu_reset_context(vcpu); |
7567 | 7760 | ||
7761 | /* | ||
7762 | * L1 may access L2's PDPTRs, so save them to construct vmcs12 | ||
7763 | */ | ||
7764 | if (enable_ept) { | ||
7765 | vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); | ||
7766 | vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); | ||
7767 | vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); | ||
7768 | vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); | ||
7769 | } | ||
7770 | |||
7568 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); | 7771 | kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); |
7569 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); | 7772 | kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); |
7570 | } | 7773 | } |
@@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7887 | vmcs12->guest_pending_dbg_exceptions = | 8090 | vmcs12->guest_pending_dbg_exceptions = |
7888 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | 8091 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); |
7889 | 8092 | ||
8093 | /* | ||
8094 | * In some cases (usually, nested EPT), L2 is allowed to change its | ||
8095 | * own CR3 without exiting. If it has changed it, we must keep it. | ||
8096 | * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined | ||
8097 | * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. | ||
8098 | * | ||
8099 | * Additionally, restore L2's PDPTR to vmcs12. | ||
8100 | */ | ||
8101 | if (enable_ept) { | ||
8102 | vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); | ||
8103 | vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); | ||
8104 | vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); | ||
8105 | vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); | ||
8106 | vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); | ||
8107 | } | ||
8108 | |||
7890 | vmcs12->vm_entry_controls = | 8109 | vmcs12->vm_entry_controls = |
7891 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | | 8110 | (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | |
7892 | (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); | 8111 | (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); |
@@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7948 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | 8167 | static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, |
7949 | struct vmcs12 *vmcs12) | 8168 | struct vmcs12 *vmcs12) |
7950 | { | 8169 | { |
8170 | struct kvm_segment seg; | ||
8171 | |||
7951 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) | 8172 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) |
7952 | vcpu->arch.efer = vmcs12->host_ia32_efer; | 8173 | vcpu->arch.efer = vmcs12->host_ia32_efer; |
7953 | else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | 8174 | else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) |
@@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |||
7982 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); | 8203 | vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); |
7983 | kvm_set_cr4(vcpu, vmcs12->host_cr4); | 8204 | kvm_set_cr4(vcpu, vmcs12->host_cr4); |
7984 | 8205 | ||
7985 | /* shadow page tables on either EPT or shadow page tables */ | 8206 | if (nested_cpu_has_ept(vmcs12)) |
8207 | nested_ept_uninit_mmu_context(vcpu); | ||
8208 | |||
7986 | kvm_set_cr3(vcpu, vmcs12->host_cr3); | 8209 | kvm_set_cr3(vcpu, vmcs12->host_cr3); |
7987 | kvm_mmu_reset_context(vcpu); | 8210 | kvm_mmu_reset_context(vcpu); |
7988 | 8211 | ||
@@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |||
8001 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); | 8224 | vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); |
8002 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); | 8225 | vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); |
8003 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); | 8226 | vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); |
8004 | vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); | 8227 | |
8005 | vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); | 8228 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { |
8006 | vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); | ||
8007 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); | ||
8008 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); | ||
8009 | vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); | ||
8010 | vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); | ||
8011 | vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); | ||
8012 | vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); | ||
8013 | vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); | ||
8014 | |||
8015 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) | ||
8016 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); | 8229 | vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); |
8230 | vcpu->arch.pat = vmcs12->host_ia32_pat; | ||
8231 | } | ||
8017 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) | 8232 | if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) |
8018 | vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, | 8233 | vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, |
8019 | vmcs12->host_ia32_perf_global_ctrl); | 8234 | vmcs12->host_ia32_perf_global_ctrl); |
8020 | 8235 | ||
8236 | /* Set L1 segment info according to Intel SDM | ||
8237 | 27.5.2 Loading Host Segment and Descriptor-Table Registers */ | ||
8238 | seg = (struct kvm_segment) { | ||
8239 | .base = 0, | ||
8240 | .limit = 0xFFFFFFFF, | ||
8241 | .selector = vmcs12->host_cs_selector, | ||
8242 | .type = 11, | ||
8243 | .present = 1, | ||
8244 | .s = 1, | ||
8245 | .g = 1 | ||
8246 | }; | ||
8247 | if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) | ||
8248 | seg.l = 1; | ||
8249 | else | ||
8250 | seg.db = 1; | ||
8251 | vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); | ||
8252 | seg = (struct kvm_segment) { | ||
8253 | .base = 0, | ||
8254 | .limit = 0xFFFFFFFF, | ||
8255 | .type = 3, | ||
8256 | .present = 1, | ||
8257 | .s = 1, | ||
8258 | .db = 1, | ||
8259 | .g = 1 | ||
8260 | }; | ||
8261 | seg.selector = vmcs12->host_ds_selector; | ||
8262 | vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); | ||
8263 | seg.selector = vmcs12->host_es_selector; | ||
8264 | vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); | ||
8265 | seg.selector = vmcs12->host_ss_selector; | ||
8266 | vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); | ||
8267 | seg.selector = vmcs12->host_fs_selector; | ||
8268 | seg.base = vmcs12->host_fs_base; | ||
8269 | vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); | ||
8270 | seg.selector = vmcs12->host_gs_selector; | ||
8271 | seg.base = vmcs12->host_gs_base; | ||
8272 | vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); | ||
8273 | seg = (struct kvm_segment) { | ||
8274 | .base = vmcs12->host_tr_base, | ||
8275 | .limit = 0x67, | ||
8276 | .selector = vmcs12->host_tr_selector, | ||
8277 | .type = 11, | ||
8278 | .present = 1 | ||
8279 | }; | ||
8280 | vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); | ||
8281 | |||
8021 | kvm_set_dr(vcpu, 7, 0x400); | 8282 | kvm_set_dr(vcpu, 7, 0x400); |
8022 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | 8283 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); |
8023 | } | 8284 | } |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d21bce505315..e5ca72a5cdb6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
682 | */ | 682 | */ |
683 | } | 683 | } |
684 | 684 | ||
685 | /* | ||
686 | * Does the new cr3 value map to physical memory? (Note, we | ||
687 | * catch an invalid cr3 even in real-mode, because it would | ||
688 | * cause trouble later on when we turn on paging anyway.) | ||
689 | * | ||
690 | * A real CPU would silently accept an invalid cr3 and would | ||
691 | * attempt to use it - with largely undefined (and often hard | ||
692 | * to debug) behavior on the guest side. | ||
693 | */ | ||
694 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | ||
695 | return 1; | ||
696 | vcpu->arch.cr3 = cr3; | 685 | vcpu->arch.cr3 = cr3; |
697 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | 686 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); |
698 | vcpu->arch.mmu.new_cr3(vcpu); | 687 | vcpu->arch.mmu.new_cr3(vcpu); |
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = { | |||
850 | #ifdef CONFIG_X86_64 | 839 | #ifdef CONFIG_X86_64 |
851 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | 840 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
852 | #endif | 841 | #endif |
853 | MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA | 842 | MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, |
843 | MSR_IA32_FEATURE_CONTROL | ||
854 | }; | 844 | }; |
855 | 845 | ||
856 | static unsigned num_msrs_to_save; | 846 | static unsigned num_msrs_to_save; |
@@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) | |||
1457 | #endif | 1447 | #endif |
1458 | } | 1448 | } |
1459 | 1449 | ||
1450 | static void kvm_gen_update_masterclock(struct kvm *kvm) | ||
1451 | { | ||
1452 | #ifdef CONFIG_X86_64 | ||
1453 | int i; | ||
1454 | struct kvm_vcpu *vcpu; | ||
1455 | struct kvm_arch *ka = &kvm->arch; | ||
1456 | |||
1457 | spin_lock(&ka->pvclock_gtod_sync_lock); | ||
1458 | kvm_make_mclock_inprogress_request(kvm); | ||
1459 | /* no guest entries from this point */ | ||
1460 | pvclock_update_vm_gtod_copy(kvm); | ||
1461 | |||
1462 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
1463 | set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); | ||
1464 | |||
1465 | /* guest entries allowed */ | ||
1466 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
1467 | clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); | ||
1468 | |||
1469 | spin_unlock(&ka->pvclock_gtod_sync_lock); | ||
1470 | #endif | ||
1471 | } | ||
1472 | |||
1460 | static int kvm_guest_time_update(struct kvm_vcpu *v) | 1473 | static int kvm_guest_time_update(struct kvm_vcpu *v) |
1461 | { | 1474 | { |
1462 | unsigned long flags, this_tsc_khz; | 1475 | unsigned long flags, this_tsc_khz; |
@@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3806 | delta = user_ns.clock - now_ns; | 3819 | delta = user_ns.clock - now_ns; |
3807 | local_irq_enable(); | 3820 | local_irq_enable(); |
3808 | kvm->arch.kvmclock_offset = delta; | 3821 | kvm->arch.kvmclock_offset = delta; |
3822 | kvm_gen_update_masterclock(kvm); | ||
3809 | break; | 3823 | break; |
3810 | } | 3824 | } |
3811 | case KVM_GET_CLOCK: { | 3825 | case KVM_GET_CLOCK: { |
@@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, | |||
4955 | static int complete_emulated_mmio(struct kvm_vcpu *vcpu); | 4969 | static int complete_emulated_mmio(struct kvm_vcpu *vcpu); |
4956 | static int complete_emulated_pio(struct kvm_vcpu *vcpu); | 4970 | static int complete_emulated_pio(struct kvm_vcpu *vcpu); |
4957 | 4971 | ||
4972 | static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, | ||
4973 | unsigned long *db) | ||
4974 | { | ||
4975 | u32 dr6 = 0; | ||
4976 | int i; | ||
4977 | u32 enable, rwlen; | ||
4978 | |||
4979 | enable = dr7; | ||
4980 | rwlen = dr7 >> 16; | ||
4981 | for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4) | ||
4982 | if ((enable & 3) && (rwlen & 15) == type && db[i] == addr) | ||
4983 | dr6 |= (1 << i); | ||
4984 | return dr6; | ||
4985 | } | ||
4986 | |||
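kvm_vcpu_check_hw_bp() walks the four architectural breakpoints: DR7 keeps a 2-bit local/global enable pair per breakpoint in its low byte and a 4-bit R/W+LEN field per breakpoint in its high 16 bits, and a breakpoint contributes its bit to the returned DR6 value only when it is enabled, its type matches, and its address matches. A stand-alone sketch of the same walk with a worked call (the function and array names are local to the example):

#include <stdio.h>
#include <stdint.h>

/*
 * Same walk as kvm_vcpu_check_hw_bp() above: DR7 bits 0-7 carry the
 * local/global enable pair for each of the four breakpoints, bits 16-31
 * carry a 4-bit R/W+LEN field per breakpoint.
 */
static uint32_t check_hw_bp(unsigned long addr, uint32_t type, uint32_t dr7,
			    const unsigned long *db)
{
	uint32_t dr6 = 0, enable = dr7, rwlen = dr7 >> 16;
	int i;

	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
			dr6 |= 1u << i;
	return dr6;
}

int main(void)
{
	unsigned long db[4] = { 0x1000, 0x2000, 0x3000, 0x4000 };
	/* Enable breakpoints 0 and 2 (local enables), type 0 = execute. */
	uint32_t dr7 = (1u << 0) | (1u << 4);

	/* Hits DR0 -> DR6 bit 0 is set. */
	printf("dr6 = %#x\n", check_hw_bp(0x1000, 0, dr7, db));
	/* DR1 matches the address but is not enabled -> no bit set. */
	printf("dr6 = %#x\n", check_hw_bp(0x2000, 0, dr7, db));
	return 0;
}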
4987 | static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) | ||
4988 | { | ||
4989 | struct kvm_run *kvm_run = vcpu->run; | ||
4990 | |||
4991 | /* | ||
4992 | * Use the "raw" value to see if TF was passed to the processor. | ||
4993 | * Note that the new value of the flags has not been saved yet. | ||
4994 | * | ||
4995 | * This is correct even for TF set by the guest, because "the | ||
4996 | * processor will not generate this exception after the instruction | ||
4997 | * that sets the TF flag". | ||
4998 | */ | ||
4999 | unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); | ||
5000 | |||
5001 | if (unlikely(rflags & X86_EFLAGS_TF)) { | ||
5002 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { | ||
5003 | kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; | ||
5004 | kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; | ||
5005 | kvm_run->debug.arch.exception = DB_VECTOR; | ||
5006 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | ||
5007 | *r = EMULATE_USER_EXIT; | ||
5008 | } else { | ||
5009 | vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; | ||
5010 | /* | ||
5011 | * "Certain debug exceptions may clear bit 0-3. The | ||
5012 | * remaining contents of the DR6 register are never | ||
5013 | * cleared by the processor". | ||
5014 | */ | ||
5015 | vcpu->arch.dr6 &= ~15; | ||
5016 | vcpu->arch.dr6 |= DR6_BS; | ||
5017 | kvm_queue_exception(vcpu, DB_VECTOR); | ||
5018 | } | ||
5019 | } | ||
5020 | } | ||
5021 | |||
5022 | static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) | ||
5023 | { | ||
5024 | struct kvm_run *kvm_run = vcpu->run; | ||
5025 | unsigned long eip = vcpu->arch.emulate_ctxt.eip; | ||
5026 | u32 dr6 = 0; | ||
5027 | |||
5028 | if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && | ||
5029 | (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { | ||
5030 | dr6 = kvm_vcpu_check_hw_bp(eip, 0, | ||
5031 | vcpu->arch.guest_debug_dr7, | ||
5032 | vcpu->arch.eff_db); | ||
5033 | |||
5034 | if (dr6 != 0) { | ||
5035 | kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; | ||
5036 | kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + | ||
5037 | get_segment_base(vcpu, VCPU_SREG_CS); | ||
5038 | |||
5039 | kvm_run->debug.arch.exception = DB_VECTOR; | ||
5040 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | ||
5041 | *r = EMULATE_USER_EXIT; | ||
5042 | return true; | ||
5043 | } | ||
5044 | } | ||
5045 | |||
5046 | if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { | ||
5047 | dr6 = kvm_vcpu_check_hw_bp(eip, 0, | ||
5048 | vcpu->arch.dr7, | ||
5049 | vcpu->arch.db); | ||
5050 | |||
5051 | if (dr6 != 0) { | ||
5052 | vcpu->arch.dr6 &= ~15; | ||
5053 | vcpu->arch.dr6 |= dr6; | ||
5054 | kvm_queue_exception(vcpu, DB_VECTOR); | ||
5055 | *r = EMULATE_DONE; | ||
5056 | return true; | ||
5057 | } | ||
5058 | } | ||
5059 | |||
5060 | return false; | ||
5061 | } | ||
5062 | |||
4958 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, | 5063 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, |
4959 | unsigned long cr2, | 5064 | unsigned long cr2, |
4960 | int emulation_type, | 5065 | int emulation_type, |
@@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
4975 | 5080 | ||
4976 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 5081 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
4977 | init_emulate_ctxt(vcpu); | 5082 | init_emulate_ctxt(vcpu); |
5083 | |||
5084 | /* | ||
5085 | * We will reenter on the same instruction since | ||
5086 | * we do not set complete_userspace_io. This does not | ||
5087 | * handle watchpoints yet; those would be handled in | ||
5088 | * the emulate_ops. | ||
5089 | */ | ||
5090 | if (kvm_vcpu_check_breakpoint(vcpu, &r)) | ||
5091 | return r; | ||
5092 | |||
4978 | ctxt->interruptibility = 0; | 5093 | ctxt->interruptibility = 0; |
4979 | ctxt->have_exception = false; | 5094 | ctxt->have_exception = false; |
4980 | ctxt->perm_ok = false; | 5095 | ctxt->perm_ok = false; |
@@ -5031,17 +5146,18 @@ restart: | |||
5031 | inject_emulated_exception(vcpu); | 5146 | inject_emulated_exception(vcpu); |
5032 | r = EMULATE_DONE; | 5147 | r = EMULATE_DONE; |
5033 | } else if (vcpu->arch.pio.count) { | 5148 | } else if (vcpu->arch.pio.count) { |
5034 | if (!vcpu->arch.pio.in) | 5149 | if (!vcpu->arch.pio.in) { |
5150 | /* FIXME: return into emulator if single-stepping. */ | ||
5035 | vcpu->arch.pio.count = 0; | 5151 | vcpu->arch.pio.count = 0; |
5036 | else { | 5152 | } else { |
5037 | writeback = false; | 5153 | writeback = false; |
5038 | vcpu->arch.complete_userspace_io = complete_emulated_pio; | 5154 | vcpu->arch.complete_userspace_io = complete_emulated_pio; |
5039 | } | 5155 | } |
5040 | r = EMULATE_DO_MMIO; | 5156 | r = EMULATE_USER_EXIT; |
5041 | } else if (vcpu->mmio_needed) { | 5157 | } else if (vcpu->mmio_needed) { |
5042 | if (!vcpu->mmio_is_write) | 5158 | if (!vcpu->mmio_is_write) |
5043 | writeback = false; | 5159 | writeback = false; |
5044 | r = EMULATE_DO_MMIO; | 5160 | r = EMULATE_USER_EXIT; |
5045 | vcpu->arch.complete_userspace_io = complete_emulated_mmio; | 5161 | vcpu->arch.complete_userspace_io = complete_emulated_mmio; |
5046 | } else if (r == EMULATION_RESTART) | 5162 | } else if (r == EMULATION_RESTART) |
5047 | goto restart; | 5163 | goto restart; |
@@ -5050,10 +5166,12 @@ restart: | |||
5050 | 5166 | ||
5051 | if (writeback) { | 5167 | if (writeback) { |
5052 | toggle_interruptibility(vcpu, ctxt->interruptibility); | 5168 | toggle_interruptibility(vcpu, ctxt->interruptibility); |
5053 | kvm_set_rflags(vcpu, ctxt->eflags); | ||
5054 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 5169 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5055 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; | 5170 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
5056 | kvm_rip_write(vcpu, ctxt->eip); | 5171 | kvm_rip_write(vcpu, ctxt->eip); |
5172 | if (r == EMULATE_DONE) | ||
5173 | kvm_vcpu_check_singlestep(vcpu, &r); | ||
5174 | kvm_set_rflags(vcpu, ctxt->eflags); | ||
5057 | } else | 5175 | } else |
5058 | vcpu->arch.emulate_regs_need_sync_to_vcpu = true; | 5176 | vcpu->arch.emulate_regs_need_sync_to_vcpu = true; |
5059 | 5177 | ||
@@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = { | |||
5347 | int kvm_arch_init(void *opaque) | 5465 | int kvm_arch_init(void *opaque) |
5348 | { | 5466 | { |
5349 | int r; | 5467 | int r; |
5350 | struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; | 5468 | struct kvm_x86_ops *ops = opaque; |
5351 | 5469 | ||
5352 | if (kvm_x86_ops) { | 5470 | if (kvm_x86_ops) { |
5353 | printk(KERN_ERR "kvm: already loaded the other module\n"); | 5471 | printk(KERN_ERR "kvm: already loaded the other module\n"); |
@@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) | |||
5495 | return 1; | 5613 | return 1; |
5496 | } | 5614 | } |
5497 | 5615 | ||
5616 | /* | ||
5617 | * kvm_pv_kick_cpu_op: Kick a vcpu. | ||
5618 | * | ||
5619 | * @apicid - apicid of vcpu to be kicked. | ||
5620 | */ | ||
5621 | static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) | ||
5622 | { | ||
5623 | struct kvm_lapic_irq lapic_irq; | ||
5624 | |||
5625 | lapic_irq.shorthand = 0; | ||
5626 | lapic_irq.dest_mode = 0; | ||
5627 | lapic_irq.dest_id = apicid; | ||
5628 | |||
5629 | lapic_irq.delivery_mode = APIC_DM_REMRD; | ||
5630 | kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); | ||
5631 | } | ||
5632 | |||
5498 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | 5633 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) |
5499 | { | 5634 | { |
5500 | unsigned long nr, a0, a1, a2, a3, ret; | 5635 | unsigned long nr, a0, a1, a2, a3, ret; |
@@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
5528 | case KVM_HC_VAPIC_POLL_IRQ: | 5663 | case KVM_HC_VAPIC_POLL_IRQ: |
5529 | ret = 0; | 5664 | ret = 0; |
5530 | break; | 5665 | break; |
5666 | case KVM_HC_KICK_CPU: | ||
5667 | kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); | ||
5668 | ret = 0; | ||
5669 | break; | ||
5531 | default: | 5670 | default: |
5532 | ret = -KVM_ENOSYS; | 5671 | ret = -KVM_ENOSYS; |
5533 | break; | 5672 | break; |
@@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) | |||
5689 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 5828 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5690 | } | 5829 | } |
5691 | 5830 | ||
5692 | static void kvm_gen_update_masterclock(struct kvm *kvm) | ||
5693 | { | ||
5694 | #ifdef CONFIG_X86_64 | ||
5695 | int i; | ||
5696 | struct kvm_vcpu *vcpu; | ||
5697 | struct kvm_arch *ka = &kvm->arch; | ||
5698 | |||
5699 | spin_lock(&ka->pvclock_gtod_sync_lock); | ||
5700 | kvm_make_mclock_inprogress_request(kvm); | ||
5701 | /* no guest entries from this point */ | ||
5702 | pvclock_update_vm_gtod_copy(kvm); | ||
5703 | |||
5704 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
5705 | set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); | ||
5706 | |||
5707 | /* guest entries allowed */ | ||
5708 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
5709 | clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); | ||
5710 | |||
5711 | spin_unlock(&ka->pvclock_gtod_sync_lock); | ||
5712 | #endif | ||
5713 | } | ||
5714 | |||
5715 | static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) | 5831 | static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) |
5716 | { | 5832 | { |
5717 | u64 eoi_exit_bitmap[4]; | 5833 | u64 eoi_exit_bitmap[4]; |
@@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5950 | kvm_apic_accept_events(vcpu); | 6066 | kvm_apic_accept_events(vcpu); |
5951 | switch(vcpu->arch.mp_state) { | 6067 | switch(vcpu->arch.mp_state) { |
5952 | case KVM_MP_STATE_HALTED: | 6068 | case KVM_MP_STATE_HALTED: |
6069 | vcpu->arch.pv.pv_unhalted = false; | ||
5953 | vcpu->arch.mp_state = | 6070 | vcpu->arch.mp_state = |
5954 | KVM_MP_STATE_RUNNABLE; | 6071 | KVM_MP_STATE_RUNNABLE; |
5955 | case KVM_MP_STATE_RUNNABLE: | 6072 | case KVM_MP_STATE_RUNNABLE: |
@@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) | |||
6061 | 6178 | ||
6062 | if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { | 6179 | if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { |
6063 | vcpu->mmio_needed = 0; | 6180 | vcpu->mmio_needed = 0; |
6181 | |||
6182 | /* FIXME: return into emulator if single-stepping. */ | ||
6064 | if (vcpu->mmio_is_write) | 6183 | if (vcpu->mmio_is_write) |
6065 | return 1; | 6184 | return 1; |
6066 | vcpu->mmio_read_completed = 1; | 6185 | vcpu->mmio_read_completed = 1; |
@@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, | |||
6249 | struct kvm_mp_state *mp_state) | 6368 | struct kvm_mp_state *mp_state) |
6250 | { | 6369 | { |
6251 | kvm_apic_accept_events(vcpu); | 6370 | kvm_apic_accept_events(vcpu); |
6252 | mp_state->mp_state = vcpu->arch.mp_state; | 6371 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && |
6372 | vcpu->arch.pv.pv_unhalted) | ||
6373 | mp_state->mp_state = KVM_MP_STATE_RUNNABLE; | ||
6374 | else | ||
6375 | mp_state->mp_state = vcpu->arch.mp_state; | ||
6376 | |||
6253 | return 0; | 6377 | return 0; |
6254 | } | 6378 | } |
6255 | 6379 | ||
@@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
6770 | BUG_ON(vcpu->kvm == NULL); | 6894 | BUG_ON(vcpu->kvm == NULL); |
6771 | kvm = vcpu->kvm; | 6895 | kvm = vcpu->kvm; |
6772 | 6896 | ||
6897 | vcpu->arch.pv.pv_unhalted = false; | ||
6773 | vcpu->arch.emulate_ctxt.ops = &emulate_ops; | 6898 | vcpu->arch.emulate_ctxt.ops = &emulate_ops; |
6774 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) | 6899 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) |
6775 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 6900 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
@@ -7019,6 +7144,15 @@ out_free: | |||
7019 | return -ENOMEM; | 7144 | return -ENOMEM; |
7020 | } | 7145 | } |
7021 | 7146 | ||
7147 | void kvm_arch_memslots_updated(struct kvm *kvm) | ||
7148 | { | ||
7149 | /* | ||
7150 | * memslots->generation has been incremented. | ||
7151 | * mmio generation may have reached its maximum value. | ||
7152 | */ | ||
7153 | kvm_mmu_invalidate_mmio_sptes(kvm); | ||
7154 | } | ||
7155 | |||
7022 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 7156 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
7023 | struct kvm_memory_slot *memslot, | 7157 | struct kvm_memory_slot *memslot, |
7024 | struct kvm_userspace_memory_region *mem, | 7158 | struct kvm_userspace_memory_region *mem, |
@@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
7079 | */ | 7213 | */ |
7080 | if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) | 7214 | if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) |
7081 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 7215 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |
7082 | /* | ||
7083 | * If memory slot is created, or moved, we need to clear all | ||
7084 | * mmio sptes. | ||
7085 | */ | ||
7086 | kvm_mmu_invalidate_mmio_sptes(kvm); | ||
7087 | } | 7216 | } |
7088 | 7217 | ||
7089 | void kvm_arch_flush_shadow_all(struct kvm *kvm) | 7218 | void kvm_arch_flush_shadow_all(struct kvm *kvm) |
@@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | |||
7103 | !vcpu->arch.apf.halted) | 7232 | !vcpu->arch.apf.halted) |
7104 | || !list_empty_careful(&vcpu->async_pf.done) | 7233 | || !list_empty_careful(&vcpu->async_pf.done) |
7105 | || kvm_apic_has_events(vcpu) | 7234 | || kvm_apic_has_events(vcpu) |
7235 | || vcpu->arch.pv.pv_unhalted | ||
7106 | || atomic_read(&vcpu->arch.nmi_queued) || | 7236 | || atomic_read(&vcpu->arch.nmi_queued) || |
7107 | (kvm_arch_interrupt_allowed(vcpu) && | 7237 | (kvm_arch_interrupt_allowed(vcpu) && |
7108 | kvm_cpu_has_interrupt(vcpu)); | 7238 | kvm_cpu_has_interrupt(vcpu)); |
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index c74436e687bf..72074d528400 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c | |||
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode) | |||
85 | cycle_t ret; | 85 | cycle_t ret; |
86 | u64 last; | 86 | u64 last; |
87 | u32 version; | 87 | u32 version; |
88 | u32 migrate_count; | ||
89 | u8 flags; | 88 | u8 flags; |
90 | unsigned cpu, cpu1; | 89 | unsigned cpu, cpu1; |
91 | 90 | ||
92 | 91 | ||
93 | /* | 92 | /* |
94 | * When looping to get a consistent (time-info, tsc) pair, we | 93 | * Note: hypervisor must guarantee that: |
95 | * also need to deal with the possibility we can switch vcpus, | 94 | * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. |
96 | * so make sure we always re-fetch time-info for the current vcpu. | 95 | * 2. that per-CPU pvclock time info is updated if the |
96 | * underlying CPU changes. | ||
97 | * 3. that version is increased whenever underlying CPU | ||
98 | * changes. | ||
99 | * | ||
97 | */ | 100 | */ |
98 | do { | 101 | do { |
99 | cpu = __getcpu() & VGETCPU_CPU_MASK; | 102 | cpu = __getcpu() & VGETCPU_CPU_MASK; |
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode) | |||
104 | 107 | ||
105 | pvti = get_pvti(cpu); | 108 | pvti = get_pvti(cpu); |
106 | 109 | ||
107 | migrate_count = pvti->migrate_count; | ||
108 | |||
109 | version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); | 110 | version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); |
110 | 111 | ||
111 | /* | 112 | /* |
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode) | |||
117 | cpu1 = __getcpu() & VGETCPU_CPU_MASK; | 118 | cpu1 = __getcpu() & VGETCPU_CPU_MASK; |
118 | } while (unlikely(cpu != cpu1 || | 119 | } while (unlikely(cpu != cpu1 || |
119 | (pvti->pvti.version & 1) || | 120 | (pvti->pvti.version & 1) || |
120 | pvti->pvti.version != version || | 121 | pvti->pvti.version != version)); |
121 | pvti->migrate_count != migrate_count)); | ||
122 | 122 | ||
123 | if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) | 123 | if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) |
124 | *mode = VCLOCK_NONE; | 124 | *mode = VCLOCK_NONE; |
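
The loop above is the usual version-retry pattern: re-read while the version is odd (an update is in flight) or has changed between the two reads, with the extra CPU re-check forcing another pass if the task migrated mid-read. A minimal single-threaded sketch of that pattern, leaving out the read barriers and the CPU re-check a real reader needs (names and values are illustrative):

#include <stdio.h>
#include <stdint.h>

/* Toy "time info" record; a writer would bump version around each update. */
struct pvti {
	volatile uint32_t version;	/* odd while an update is in flight */
	volatile uint64_t value;
};

static uint64_t read_stable(const struct pvti *p)
{
	uint32_t version;
	uint64_t v;

	do {
		version = p->version;
		v = p->value;
		/* a real reader would also re-check the CPU id here */
	} while ((version & 1) || version != p->version);

	return v;
}

int main(void)
{
	struct pvti p = { .version = 2, .value = 123456789ULL };

	printf("value = %llu\n", (unsigned long long)read_stable(&p));
	return 0;
}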
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 5daa2599ed48..e373671652b0 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig | |||
@@ -200,11 +200,9 @@ config DMA_SHARED_BUFFER | |||
200 | APIs extension; the file's descriptor can then be passed on to other | 200 | APIs extension; the file's descriptor can then be passed on to other |
201 | driver. | 201 | driver. |
202 | 202 | ||
203 | config CMA | 203 | config DMA_CMA |
204 | bool "Contiguous Memory Allocator" | 204 | bool "DMA Contiguous Memory Allocator" |
205 | depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK | 205 | depends on HAVE_DMA_CONTIGUOUS && CMA |
206 | select MIGRATION | ||
207 | select MEMORY_ISOLATION | ||
208 | help | 206 | help |
209 | This enables the Contiguous Memory Allocator which allows drivers | 207 | This enables the Contiguous Memory Allocator which allows drivers |
210 | to allocate big physically-contiguous blocks of memory for use with | 208 | to allocate big physically-contiguous blocks of memory for use with |
@@ -213,17 +211,7 @@ config CMA | |||
213 | For more information see <include/linux/dma-contiguous.h>. | 211 | For more information see <include/linux/dma-contiguous.h>. |
214 | If unsure, say "n". | 212 | If unsure, say "n". |
215 | 213 | ||
216 | if CMA | 214 | if DMA_CMA |
217 | |||
218 | config CMA_DEBUG | ||
219 | bool "CMA debug messages (DEVELOPMENT)" | ||
220 | depends on DEBUG_KERNEL | ||
221 | help | ||
222 | Turns on debug messages in CMA. This produces KERN_DEBUG | ||
223 | messages for every CMA call as well as various messages while | ||
224 | processing calls such as dma_alloc_from_contiguous(). | ||
225 | This option does not affect warning and error messages. | ||
226 | |||
227 | comment "Default contiguous memory area size:" | 215 | comment "Default contiguous memory area size:" |
228 | 216 | ||
229 | config CMA_SIZE_MBYTES | 217 | config CMA_SIZE_MBYTES |
diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 48029aa477d9..94e8a80e87f8 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile | |||
@@ -6,7 +6,7 @@ obj-y := core.o bus.o dd.o syscore.o \ | |||
6 | attribute_container.o transport_class.o \ | 6 | attribute_container.o transport_class.o \ |
7 | topology.o | 7 | topology.o |
8 | obj-$(CONFIG_DEVTMPFS) += devtmpfs.o | 8 | obj-$(CONFIG_DEVTMPFS) += devtmpfs.o |
9 | obj-$(CONFIG_CMA) += dma-contiguous.o | 9 | obj-$(CONFIG_DMA_CMA) += dma-contiguous.o |
10 | obj-y += power/ | 10 | obj-y += power/ |
11 | obj-$(CONFIG_HAS_DMA) += dma-mapping.o | 11 | obj-$(CONFIG_HAS_DMA) += dma-mapping.o |
12 | obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o | 12 | obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o |
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 343744e4809c..7e2d15837b02 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h | |||
@@ -26,7 +26,7 @@ | |||
26 | #include <linux/types.h> | 26 | #include <linux/types.h> |
27 | #include <linux/irqchip/arm-gic.h> | 27 | #include <linux/irqchip/arm-gic.h> |
28 | 28 | ||
29 | #define VGIC_NR_IRQS 128 | 29 | #define VGIC_NR_IRQS 256 |
30 | #define VGIC_NR_SGIS 16 | 30 | #define VGIC_NR_SGIS 16 |
31 | #define VGIC_NR_PPIS 16 | 31 | #define VGIC_NR_PPIS 16 |
32 | #define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) | 32 | #define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) |
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 01b5c84be828..00141d3325fe 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h | |||
@@ -57,7 +57,7 @@ struct cma; | |||
57 | struct page; | 57 | struct page; |
58 | struct device; | 58 | struct device; |
59 | 59 | ||
60 | #ifdef CONFIG_CMA | 60 | #ifdef CONFIG_DMA_CMA |
61 | 61 | ||
62 | /* | 62 | /* |
63 | * There is always at least global CMA area and a few optional device | 63 | * There is always at least global CMA area and a few optional device |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a63d83ebd151..ca645a01d37a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -85,6 +85,12 @@ static inline bool is_noslot_pfn(pfn_t pfn) | |||
85 | return pfn == KVM_PFN_NOSLOT; | 85 | return pfn == KVM_PFN_NOSLOT; |
86 | } | 86 | } |
87 | 87 | ||
88 | /* | ||
89 | * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390) | ||
90 | * provide their own defines and kvm_is_error_hva | ||
91 | */ | ||
92 | #ifndef KVM_HVA_ERR_BAD | ||
93 | |||
88 | #define KVM_HVA_ERR_BAD (PAGE_OFFSET) | 94 | #define KVM_HVA_ERR_BAD (PAGE_OFFSET) |
89 | #define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) | 95 | #define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) |
90 | 96 | ||
@@ -93,6 +99,8 @@ static inline bool kvm_is_error_hva(unsigned long addr) | |||
93 | return addr >= PAGE_OFFSET; | 99 | return addr >= PAGE_OFFSET; |
94 | } | 100 | } |
95 | 101 | ||
102 | #endif | ||
103 | |||
96 | #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) | 104 | #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) |
97 | 105 | ||
98 | static inline bool is_error_page(struct page *page) | 106 | static inline bool is_error_page(struct page *page) |
@@ -160,8 +168,12 @@ enum kvm_bus { | |||
160 | 168 | ||
161 | int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | 169 | int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, |
162 | int len, const void *val); | 170 | int len, const void *val); |
171 | int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | ||
172 | int len, const void *val, long cookie); | ||
163 | int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, | 173 | int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, |
164 | void *val); | 174 | void *val); |
175 | int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | ||
176 | int len, void *val, long cookie); | ||
165 | int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | 177 | int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, |
166 | int len, struct kvm_io_device *dev); | 178 | int len, struct kvm_io_device *dev); |
167 | int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, | 179 | int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, |
@@ -499,6 +511,7 @@ int __kvm_set_memory_region(struct kvm *kvm, | |||
499 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, | 511 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, |
500 | struct kvm_memory_slot *dont); | 512 | struct kvm_memory_slot *dont); |
501 | int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); | 513 | int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); |
514 | void kvm_arch_memslots_updated(struct kvm *kvm); | ||
502 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 515 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
503 | struct kvm_memory_slot *memslot, | 516 | struct kvm_memory_slot *memslot, |
504 | struct kvm_userspace_memory_region *mem, | 517 | struct kvm_userspace_memory_region *mem, |
diff --git a/include/linux/sched.h b/include/linux/sched.h index f79ced719435..ce1e1c0aaa33 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -107,14 +107,6 @@ extern unsigned long this_cpu_load(void); | |||
107 | extern void calc_global_load(unsigned long ticks); | 107 | extern void calc_global_load(unsigned long ticks); |
108 | extern void update_cpu_load_nohz(void); | 108 | extern void update_cpu_load_nohz(void); |
109 | 109 | ||
110 | /* Notifier for when a task gets migrated to a new CPU */ | ||
111 | struct task_migration_notifier { | ||
112 | struct task_struct *task; | ||
113 | int from_cpu; | ||
114 | int to_cpu; | ||
115 | }; | ||
116 | extern void register_task_migration_notifier(struct notifier_block *n); | ||
117 | |||
118 | extern unsigned long get_parent_ip(unsigned long addr); | 110 | extern unsigned long get_parent_ip(unsigned long addr); |
119 | 111 | ||
120 | extern void dump_cpu_task(int cpu); | 112 | extern void dump_cpu_task(int cpu); |
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index acccd08be6c7..99c25338ede8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h | |||
@@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info { | |||
667 | #define KVM_CAP_PPC_RTAS 91 | 667 | #define KVM_CAP_PPC_RTAS 91 |
668 | #define KVM_CAP_IRQ_XICS 92 | 668 | #define KVM_CAP_IRQ_XICS 92 |
669 | #define KVM_CAP_ARM_EL1_32BIT 93 | 669 | #define KVM_CAP_ARM_EL1_32BIT 93 |
670 | #define KVM_CAP_SPAPR_MULTITCE 94 | ||
670 | 671 | ||
671 | #ifdef KVM_CAP_IRQ_ROUTING | 672 | #ifdef KVM_CAP_IRQ_ROUTING |
672 | 673 | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 725aa067ad63..5ac63c9a995a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -978,13 +978,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
978 | rq->skip_clock_update = 1; | 978 | rq->skip_clock_update = 1; |
979 | } | 979 | } |
980 | 980 | ||
981 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
982 | |||
983 | void register_task_migration_notifier(struct notifier_block *n) | ||
984 | { | ||
985 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
986 | } | ||
987 | |||
988 | #ifdef CONFIG_SMP | 981 | #ifdef CONFIG_SMP |
989 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 982 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
990 | { | 983 | { |
@@ -1015,18 +1008,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1015 | trace_sched_migrate_task(p, new_cpu); | 1008 | trace_sched_migrate_task(p, new_cpu); |
1016 | 1009 | ||
1017 | if (task_cpu(p) != new_cpu) { | 1010 | if (task_cpu(p) != new_cpu) { |
1018 | struct task_migration_notifier tmn; | ||
1019 | |||
1020 | if (p->sched_class->migrate_task_rq) | 1011 | if (p->sched_class->migrate_task_rq) |
1021 | p->sched_class->migrate_task_rq(p, new_cpu); | 1012 | p->sched_class->migrate_task_rq(p, new_cpu); |
1022 | p->se.nr_migrations++; | 1013 | p->se.nr_migrations++; |
1023 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 1014 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
1024 | |||
1025 | tmn.task = p; | ||
1026 | tmn.from_cpu = task_cpu(p); | ||
1027 | tmn.to_cpu = new_cpu; | ||
1028 | |||
1029 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
1030 | } | 1015 | } |
1031 | 1016 | ||
1032 | __set_task_cpu(p, new_cpu); | 1017 | __set_task_cpu(p, new_cpu); |
diff --git a/mm/Kconfig b/mm/Kconfig index 8028dcc6615c..6cdd27043303 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -478,6 +478,30 @@ config FRONTSWAP | |||
478 | 478 | ||
479 | If unsure, say Y to enable frontswap. | 479 | If unsure, say Y to enable frontswap. |
480 | 480 | ||
481 | config CMA | ||
482 | bool "Contiguous Memory Allocator" | ||
483 | depends on HAVE_MEMBLOCK | ||
484 | select MIGRATION | ||
485 | select MEMORY_ISOLATION | ||
486 | help | ||
487 | This enables the Contiguous Memory Allocator which allows other | ||
488 | subsystems to allocate big physically-contiguous blocks of memory. | ||
489 | CMA reserves a region of memory and allows only movable pages to | ||
490 | be allocated from it. This way, the kernel can use the memory for | ||
491 | pagecache and when a subsystem requests a contiguous area, the | ||
492 | allocated pages are migrated away to serve the contiguous request. | ||
493 | |||
494 | If unsure, say "n". | ||
495 | |||
496 | config CMA_DEBUG | ||
497 | bool "CMA debug messages (DEVELOPMENT)" | ||
498 | depends on DEBUG_KERNEL && CMA | ||
499 | help | ||
500 | Turns on debug messages in CMA. This produces KERN_DEBUG | ||
501 | messages for every CMA call as well as various messages while | ||
502 | processing calls such as dma_alloc_from_contiguous(). | ||
503 | This option does not affect warning and error messages. | ||
504 | |||
481 | config ZBUD | 505 | config ZBUD |
482 | tristate | 506 | tristate |
483 | default n | 507 | default n |
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 17c5ac7d10ed..685fc72fc751 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c | |||
@@ -149,7 +149,7 @@ static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) | |||
149 | { | 149 | { |
150 | offset >>= 2; | 150 | offset >>= 2; |
151 | BUG_ON(offset > (VGIC_NR_IRQS / 4)); | 151 | BUG_ON(offset > (VGIC_NR_IRQS / 4)); |
152 | if (offset < 4) | 152 | if (offset < 8) |
153 | return x->percpu[cpuid] + offset; | 153 | return x->percpu[cpuid] + offset; |
154 | else | 154 | else |
155 | return x->shared + offset - 8; | 155 | return x->shared + offset - 8; |
@@ -432,19 +432,13 @@ static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu, | |||
432 | static u32 vgic_get_target_reg(struct kvm *kvm, int irq) | 432 | static u32 vgic_get_target_reg(struct kvm *kvm, int irq) |
433 | { | 433 | { |
434 | struct vgic_dist *dist = &kvm->arch.vgic; | 434 | struct vgic_dist *dist = &kvm->arch.vgic; |
435 | struct kvm_vcpu *vcpu; | 435 | int i; |
436 | int i, c; | ||
437 | unsigned long *bmap; | ||
438 | u32 val = 0; | 436 | u32 val = 0; |
439 | 437 | ||
440 | irq -= VGIC_NR_PRIVATE_IRQS; | 438 | irq -= VGIC_NR_PRIVATE_IRQS; |
441 | 439 | ||
442 | kvm_for_each_vcpu(c, vcpu, kvm) { | 440 | for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) |
443 | bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); | 441 | val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8); |
444 | for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) | ||
445 | if (test_bit(irq + i, bmap)) | ||
446 | val |= 1 << (c + i * 8); | ||
447 | } | ||
448 | 442 | ||
449 | return val; | 443 | return val; |
450 | } | 444 | } |
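
The rewritten vgic_get_target_reg() builds a GICD_ITARGETSR word directly from the irq_spi_cpu[] array: each of the four SPIs covered by the register gets a single target bit, placed at position target-CPU + lane*8 inside its 8-bit field. A stand-alone sketch with a worked example; GICD_IRQS_PER_ITARGETSR is assumed to be 4 (the value the GICv2 register layout implies) and the other names are local to the example:

#include <stdio.h>
#include <stdint.h>

#define GICD_IRQS_PER_ITARGETSR 4	/* four 8-bit target fields per register */

/* Build one ITARGETSR word from the per-SPI target-CPU array. */
static uint32_t get_target_reg(const uint8_t *irq_spi_cpu, int first_spi)
{
	uint32_t val = 0;
	int i;

	for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
		val |= 1u << (irq_spi_cpu[first_spi + i] + i * 8);
	return val;
}

int main(void)
{
	/* SPIs 0..3 targeting CPUs 0, 1, 0, 3 respectively. */
	uint8_t irq_spi_cpu[4] = { 0, 1, 0, 3 };

	/* Expect 0x08010201: bits 0, 9, 16 and 27 set. */
	printf("ITARGETSR = %#010x\n", get_target_reg(irq_spi_cpu, 0));
	return 0;
}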
@@ -547,8 +541,12 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, | |||
547 | struct kvm_exit_mmio *mmio, phys_addr_t offset) | 541 | struct kvm_exit_mmio *mmio, phys_addr_t offset) |
548 | { | 542 | { |
549 | u32 val; | 543 | u32 val; |
550 | u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, | 544 | u32 *reg; |
551 | vcpu->vcpu_id, offset >> 1); | 545 | |
546 | offset >>= 1; | ||
547 | reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, | ||
548 | vcpu->vcpu_id, offset); | ||
549 | |||
552 | if (offset & 2) | 550 | if (offset & 2) |
553 | val = *reg >> 16; | 551 | val = *reg >> 16; |
554 | else | 552 | else |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1580dd4ace4e..bf040c4e02b3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -102,28 +102,8 @@ static bool largepages_enabled = true; | |||
102 | 102 | ||
103 | bool kvm_is_mmio_pfn(pfn_t pfn) | 103 | bool kvm_is_mmio_pfn(pfn_t pfn) |
104 | { | 104 | { |
105 | if (pfn_valid(pfn)) { | 105 | if (pfn_valid(pfn)) |
106 | int reserved; | 106 | return PageReserved(pfn_to_page(pfn)); |
107 | struct page *tail = pfn_to_page(pfn); | ||
108 | struct page *head = compound_trans_head(tail); | ||
109 | reserved = PageReserved(head); | ||
110 | if (head != tail) { | ||
111 | /* | ||
112 | * "head" is not a dangling pointer | ||
113 | * (compound_trans_head takes care of that) | ||
114 | * but the hugepage may have been splitted | ||
115 | * from under us (and we may not hold a | ||
116 | * reference count on the head page so it can | ||
117 | * be reused before we run PageReferenced), so | ||
118 | * we've to check PageTail before returning | ||
119 | * what we just read. | ||
120 | */ | ||
121 | smp_rmb(); | ||
122 | if (PageTail(tail)) | ||
123 | return reserved; | ||
124 | } | ||
125 | return PageReserved(tail); | ||
126 | } | ||
127 | 107 | ||
128 | return true; | 108 | return true; |
129 | } | 109 | } |
@@ -731,7 +711,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, | |||
731 | update_memslots(slots, new, kvm->memslots->generation); | 711 | update_memslots(slots, new, kvm->memslots->generation); |
732 | rcu_assign_pointer(kvm->memslots, slots); | 712 | rcu_assign_pointer(kvm->memslots, slots); |
733 | synchronize_srcu_expedited(&kvm->srcu); | 713 | synchronize_srcu_expedited(&kvm->srcu); |
734 | return old_memslots; | 714 | |
715 | kvm_arch_memslots_updated(kvm); | ||
716 | |||
717 | return old_memslots; | ||
735 | } | 718 | } |
736 | 719 | ||
737 | /* | 720 | /* |
@@ -1893,7 +1876,7 @@ static struct file_operations kvm_vcpu_fops = { | |||
1893 | */ | 1876 | */ |
1894 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) | 1877 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) |
1895 | { | 1878 | { |
1896 | return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR); | 1879 | return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); |
1897 | } | 1880 | } |
1898 | 1881 | ||
1899 | /* | 1882 | /* |
@@ -2302,7 +2285,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, | |||
2302 | return ret; | 2285 | return ret; |
2303 | } | 2286 | } |
2304 | 2287 | ||
2305 | ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR); | 2288 | ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); |
2306 | if (ret < 0) { | 2289 | if (ret < 0) { |
2307 | ops->destroy(dev); | 2290 | ops->destroy(dev); |
2308 | return ret; | 2291 | return ret; |
@@ -2586,7 +2569,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) | |||
2586 | return r; | 2569 | return r; |
2587 | } | 2570 | } |
2588 | #endif | 2571 | #endif |
2589 | r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); | 2572 | r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC); |
2590 | if (r < 0) | 2573 | if (r < 0) |
2591 | kvm_put_kvm(kvm); | 2574 | kvm_put_kvm(kvm); |
2592 | 2575 | ||
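
The three anon_inode_getfd() hunks above add O_CLOEXEC, so the VM, vcpu and device file descriptors that KVM creates are closed automatically across exec() instead of leaking into child processes. A minimal userspace sketch that checks the flag on a freshly created VM fd, assuming /dev/kvm is accessible and the running kernel includes this change:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd, vm_fd, fdflags;

	kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm_fd < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* The returned fd comes from anon_inode_getfd() in the kernel. */
	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0UL);
	if (vm_fd < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}

	fdflags = fcntl(vm_fd, F_GETFD);
	printf("vm fd %d: FD_CLOEXEC %s\n", vm_fd,
	       (fdflags & FD_CLOEXEC) ? "set" : "not set");

	close(vm_fd);
	close(kvm_fd);
	return 0;
}
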
@@ -2812,11 +2795,9 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) | |||
2812 | kfree(bus); | 2795 | kfree(bus); |
2813 | } | 2796 | } |
2814 | 2797 | ||
2815 | static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) | 2798 | static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, |
2799 | const struct kvm_io_range *r2) | ||
2816 | { | 2800 | { |
2817 | const struct kvm_io_range *r1 = p1; | ||
2818 | const struct kvm_io_range *r2 = p2; | ||
2819 | |||
2820 | if (r1->addr < r2->addr) | 2801 | if (r1->addr < r2->addr) |
2821 | return -1; | 2802 | return -1; |
2822 | if (r1->addr + r1->len > r2->addr + r2->len) | 2803 | if (r1->addr + r1->len > r2->addr + r2->len) |
@@ -2824,6 +2805,11 @@ static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) | |||
2824 | return 0; | 2805 | return 0; |
2825 | } | 2806 | } |
2826 | 2807 | ||
2808 | static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) | ||
2809 | { | ||
2810 | return kvm_io_bus_cmp(p1, p2); | ||
2811 | } | ||
2812 | |||
2827 | static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, | 2813 | static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, |
2828 | gpa_t addr, int len) | 2814 | gpa_t addr, int len) |
2829 | { | 2815 | { |
@@ -2857,17 +2843,54 @@ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, | |||
2857 | 2843 | ||
2858 | off = range - bus->range; | 2844 | off = range - bus->range; |
2859 | 2845 | ||
2860 | while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0) | 2846 | while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) |
2861 | off--; | 2847 | off--; |
2862 | 2848 | ||
2863 | return off; | 2849 | return off; |
2864 | } | 2850 | } |
2865 | 2851 | ||
2852 | static int __kvm_io_bus_write(struct kvm_io_bus *bus, | ||
2853 | struct kvm_io_range *range, const void *val) | ||
2854 | { | ||
2855 | int idx; | ||
2856 | |||
2857 | idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); | ||
2858 | if (idx < 0) | ||
2859 | return -EOPNOTSUPP; | ||
2860 | |||
2861 | while (idx < bus->dev_count && | ||
2862 | kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { | ||
2863 | if (!kvm_iodevice_write(bus->range[idx].dev, range->addr, | ||
2864 | range->len, val)) | ||
2865 | return idx; | ||
2866 | idx++; | ||
2867 | } | ||
2868 | |||
2869 | return -EOPNOTSUPP; | ||
2870 | } | ||
2871 | |||
2866 | /* kvm_io_bus_write - called under kvm->slots_lock */ | 2872 | /* kvm_io_bus_write - called under kvm->slots_lock */ |
2867 | int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | 2873 | int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, |
2868 | int len, const void *val) | 2874 | int len, const void *val) |
2869 | { | 2875 | { |
2870 | int idx; | 2876 | struct kvm_io_bus *bus; |
2877 | struct kvm_io_range range; | ||
2878 | int r; | ||
2879 | |||
2880 | range = (struct kvm_io_range) { | ||
2881 | .addr = addr, | ||
2882 | .len = len, | ||
2883 | }; | ||
2884 | |||
2885 | bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); | ||
2886 | r = __kvm_io_bus_write(bus, &range, val); | ||
2887 | return r < 0 ? r : 0; | ||
2888 | } | ||
2889 | |||
2890 | /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ | ||
2891 | int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | ||
2892 | int len, const void *val, long cookie) | ||
2893 | { | ||
2871 | struct kvm_io_bus *bus; | 2894 | struct kvm_io_bus *bus; |
2872 | struct kvm_io_range range; | 2895 | struct kvm_io_range range; |
2873 | 2896 | ||
@@ -2877,14 +2900,35 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | |||
2877 | }; | 2900 | }; |
2878 | 2901 | ||
2879 | bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); | 2902 | bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); |
2880 | idx = kvm_io_bus_get_first_dev(bus, addr, len); | 2903 | |
2904 | /* First try the device referenced by cookie. */ | ||
2905 | if ((cookie >= 0) && (cookie < bus->dev_count) && | ||
2906 | (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) | ||
2907 | if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len, | ||
2908 | val)) | ||
2909 | return cookie; | ||
2910 | |||
2911 | /* | ||
2912 | * cookie contained garbage; fall back to search and return the | ||
2913 | * correct cookie value. | ||
2914 | */ | ||
2915 | return __kvm_io_bus_write(bus, &range, val); | ||
2916 | } | ||
2917 | |||
2918 | static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range, | ||
2919 | void *val) | ||
2920 | { | ||
2921 | int idx; | ||
2922 | |||
2923 | idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); | ||
2881 | if (idx < 0) | 2924 | if (idx < 0) |
2882 | return -EOPNOTSUPP; | 2925 | return -EOPNOTSUPP; |
2883 | 2926 | ||
2884 | while (idx < bus->dev_count && | 2927 | while (idx < bus->dev_count && |
2885 | kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { | 2928 | kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { |
2886 | if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val)) | 2929 | if (!kvm_iodevice_read(bus->range[idx].dev, range->addr, |
2887 | return 0; | 2930 | range->len, val)) |
2931 | return idx; | ||
2888 | idx++; | 2932 | idx++; |
2889 | } | 2933 | } |
2890 | 2934 | ||
@@ -2895,9 +2939,9 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | |||
2895 | int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | 2939 | int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, |
2896 | int len, void *val) | 2940 | int len, void *val) |
2897 | { | 2941 | { |
2898 | int idx; | ||
2899 | struct kvm_io_bus *bus; | 2942 | struct kvm_io_bus *bus; |
2900 | struct kvm_io_range range; | 2943 | struct kvm_io_range range; |
2944 | int r; | ||
2901 | 2945 | ||
2902 | range = (struct kvm_io_range) { | 2946 | range = (struct kvm_io_range) { |
2903 | .addr = addr, | 2947 | .addr = addr, |
@@ -2905,18 +2949,36 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, | |||
2905 | }; | 2949 | }; |
2906 | 2950 | ||
2907 | bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); | 2951 | bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); |
2908 | idx = kvm_io_bus_get_first_dev(bus, addr, len); | 2952 | r = __kvm_io_bus_read(bus, &range, val); |
2909 | if (idx < 0) | 2953 | return r < 0 ? r : 0; |
2910 | return -EOPNOTSUPP; | 2954 | } |
2911 | 2955 | ||
2912 | while (idx < bus->dev_count && | 2956 | /* kvm_io_bus_read_cookie - called under kvm->slots_lock */ |
2913 | kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) { | 2957 | int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, |
2914 | if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val)) | 2958 | int len, void *val, long cookie) |
2915 | return 0; | 2959 | { |
2916 | idx++; | 2960 | struct kvm_io_bus *bus; |
2917 | } | 2961 | struct kvm_io_range range; |
2918 | 2962 | ||
2919 | return -EOPNOTSUPP; | 2963 | range = (struct kvm_io_range) { |
2964 | .addr = addr, | ||
2965 | .len = len, | ||
2966 | }; | ||
2967 | |||
2968 | bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); | ||
2969 | |||
2970 | /* First try the device referenced by cookie. */ | ||
2971 | if ((cookie >= 0) && (cookie < bus->dev_count) && | ||
2972 | (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) | ||
2973 | if (!kvm_iodevice_read(bus->range[cookie].dev, addr, len, | ||
2974 | val)) | ||
2975 | return cookie; | ||
2976 | |||
2977 | /* | ||
2978 | * cookie contained garbage; fall back to search and return the | ||
2979 | * correct cookie value. | ||
2980 | */ | ||
2981 | return __kvm_io_bus_read(bus, &range, val); | ||
2920 | } | 2982 | } |
2921 | 2983 | ||
2922 | /* Caller must hold slots_lock. */ | 2984 | /* Caller must hold slots_lock. */ |
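
Taken together, the io-bus hunks make __kvm_io_bus_write()/__kvm_io_bus_read() return the index of the device that handled the access rather than 0, and the new kvm_io_bus_write_cookie()/kvm_io_bus_read_cookie() entry points retry that cached index before falling back to the full lookup, which in turn hands back a fresh cookie. A minimal sketch of that caching idea over a plain array; the device table, match helper and function names are invented for illustration and only the lookup pattern mirrors the kernel code above.

#include <stdint.h>
#include <stdio.h>

struct dev { uint64_t addr; int len; };

static struct dev bus[] = { { 0x1000, 4 }, { 0x2000, 4 }, { 0x3000, 4 } };
#define DEV_COUNT 3

static int dev_match(const struct dev *d, uint64_t addr, int len)
{
	return addr >= d->addr && addr + len <= d->addr + d->len;
}

/* Full scan; returns the matching index (the new cookie) or -1. */
static long bus_write_slow(uint64_t addr, int len)
{
	long idx;

	for (idx = 0; idx < DEV_COUNT; idx++)
		if (dev_match(&bus[idx], addr, len))
			return idx;	/* caller may cache this as a cookie */
	return -1;
}

/* Fast path: trust the cookie if it still points at a matching device. */
static long bus_write_cookie(uint64_t addr, int len, long cookie)
{
	if (cookie >= 0 && cookie < DEV_COUNT &&
	    dev_match(&bus[cookie], addr, len))
		return cookie;
	/* cookie contained garbage; fall back to the full search */
	return bus_write_slow(addr, len);
}

int main(void)
{
	long cookie = bus_write_slow(0x2000, 4);		/* first access */
	long again = bus_write_cookie(0x2000, 4, cookie);	/* fast path hit */
	long stale = bus_write_cookie(0x3000, 4, cookie);	/* cookie misses */

	printf("cookie=%ld again=%ld stale=%ld\n", cookie, again, stale);
	return 0;
}
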