author    Linus Torvalds <torvalds@linux-foundation.org>  2013-09-04 21:15:06 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-09-04 21:15:06 -0400
commit    ae7a835cc546fc67df90edaaa0c48ae2b22a29fe
tree      b1235437fde066ab0f272f164d75dc1b98a244cf
parent    cf39c8e5352b4fb9efedfe7e9acb566a85ed847c
parent    6b9e4fa07443f5baf5bbd7ab043abd6976f8d7bc

Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Gleb Natapov:
 "The highlights of the release are nested EPT and pv-ticketlocks
  support (hypervisor part, guest part, which is most of the code, goes
  through tip tree).  Apart of that there are many fixes for all arches"

Fix up semantic conflicts as discussed in the pull request thread.

* 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (88 commits)
  ARM: KVM: Add newlines to panic strings
  ARM: KVM: Work around older compiler bug
  ARM: KVM: Simplify tracepoint text
  ARM: KVM: Fix kvm_set_pte assignment
  ARM: KVM: vgic: Bump VGIC_NR_IRQS to 256
  ARM: KVM: Bugfix: vgic_bytemap_get_reg per cpu regs
  ARM: KVM: vgic: fix GICD_ICFGRn access
  ARM: KVM: vgic: simplify vgic_get_target_reg
  KVM: MMU: remove unused parameter
  KVM: PPC: Book3S PR: Rework kvmppc_mmu_book3s_64_xlate()
  KVM: PPC: Book3S PR: Make instruction fetch fallback work for system calls
  KVM: PPC: Book3S PR: Don't corrupt guest state when kernel uses VMX
  KVM: x86: update masterclock when kvmclock_offset is calculated (v2)
  KVM: PPC: Book3S: Fix compile error in XICS emulation
  KVM: PPC: Book3S PR: return appropriate error when allocation fails
  arch: powerpc: kvm: add signed type cast for comparation
  KVM: x86: add comments where MMIO does not return to the emulator
  KVM: vmx: count exits to userspace during invalid guest emulation
  KVM: rename __kvm_io_bus_sort_cmp to kvm_io_bus_cmp
  kvm: optimize away THP checks in kvm_is_mmio_pfn()
  ...

-rw-r--r--  Documentation/virtual/kvm/cpuid.txt | 4
-rw-r--r--  Documentation/virtual/kvm/hypercalls.txt | 14
-rw-r--r--  arch/arm/configs/keystone_defconfig | 1
-rw-r--r--  arch/arm/configs/omap2plus_defconfig | 1
-rw-r--r--  arch/arm/configs/tegra_defconfig | 1
-rw-r--r--  arch/arm/include/asm/dma-contiguous.h | 2
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 2
-rw-r--r--  arch/arm/kvm/arm.c | 4
-rw-r--r--  arch/arm/kvm/interrupts.S | 8
-rw-r--r--  arch/arm/kvm/reset.c | 2
-rw-r--r--  arch/arm/kvm/trace.h | 7
-rw-r--r--  arch/arm/mm/dma-mapping.c | 6
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 4
-rw-r--r--  arch/mips/kvm/kvm_locore.S | 969
-rw-r--r--  arch/mips/kvm/kvm_mips.c | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 38
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 14
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 25
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 1
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 4
-rw-r--r--  arch/powerpc/kvm/Kconfig | 1
-rw-r--r--  arch/powerpc/kvm/Makefile | 1
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 150
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 42
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 36
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 246
-rw-r--r--  arch/powerpc/kvm/book3s_hv_cma.c | 240
-rw-r--r--  arch/powerpc/kvm/book3s_hv_cma.h | 27
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 139
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2
-rw-r--r--  arch/powerpc/kvm/book3s_interrupts.S | 14
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 35
-rw-r--r--  arch/powerpc/kvm/book3s_xics.c | 1
-rw-r--r--  arch/powerpc/kvm/booke.c | 6
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 6
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 8
-rw-r--r--  arch/s390/include/asm/mmu.h | 2
-rw-r--r--  arch/s390/include/asm/mmu_context.h | 19
-rw-r--r--  arch/s390/include/asm/pgtable.h | 11
-rw-r--r--  arch/s390/include/asm/processor.h | 2
-rw-r--r--  arch/s390/kvm/diag.c | 17
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 27
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 10
-rw-r--r--  arch/s390/kvm/priv.c | 32
-rw-r--r--  arch/s390/mm/pgtable.c | 183
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 14
-rw-r--r--  arch/x86/include/asm/pvclock.h | 1
-rw-r--r--  arch/x86/include/asm/vmx.h | 2
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 6
-rw-r--r--  arch/x86/kernel/pvclock.c | 44
-rw-r--r--  arch/x86/kvm/cpuid.c | 3
-rw-r--r--  arch/x86/kvm/lapic.c | 38
-rw-r--r--  arch/x86/kvm/mmu.c | 181
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 178
-rw-r--r--  arch/x86/kvm/pmu.c | 25
-rw-r--r--  arch/x86/kvm/vmx.c | 441
-rw-r--r--  arch/x86/kvm/x86.c | 224
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 16
-rw-r--r--  drivers/base/Kconfig | 20
-rw-r--r--  drivers/base/Makefile | 2
-rw-r--r--  include/kvm/arm_vgic.h | 2
-rw-r--r--  include/linux/dma-contiguous.h | 2
-rw-r--r--  include/linux/kvm_host.h | 13
-rw-r--r--  include/linux/sched.h | 8
-rw-r--r--  include/uapi/linux/kvm.h | 1
-rw-r--r--  kernel/sched/core.c | 15
-rw-r--r--  mm/Kconfig | 24
-rw-r--r--  virt/kvm/arm/vgic.c | 22
-rw-r--r--  virt/kvm/kvm_main.c | 156
73 files changed, 2413 insertions, 1403 deletions
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 83afe65d4966..22ff659bc0fb 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -43,6 +43,10 @@ KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
 KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by
 || || writing to msr 0x4b564d02
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit
+ || || before enabling paravirtualized
+ || || spinlock support.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
 || || per-cpu warps are expected in
 || || kvmclock.
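
A guest discovers this bit through the KVM paravirtual CPUID leaf before it
enables pv-spinlocks; inside a Linux guest that check is normally just
kvm_para_has_feature(KVM_FEATURE_PV_UNHALT). A minimal freestanding sketch of
the same probe, assuming the conventional KVM_CPUID_FEATURES leaf 0x40000001
and GCC's <cpuid.h> helpers (the function name here is made up for
illustration):

    #include <stdbool.h>
    #include <cpuid.h>                      /* GCC's __cpuid() macro */

    #define KVM_CPUID_FEATURES    0x40000001
    #define KVM_FEATURE_PV_UNHALT 7         /* bit number from the table above */

    /* Hypothetical helper: true if the hypervisor advertises PV_UNHALT.
     * A real implementation would first verify the "KVMKVMKVM" signature
     * at leaf 0x40000000. */
    static bool have_pv_unhalt(void)
    {
            unsigned int eax, ebx, ecx, edx;

            __cpuid(KVM_CPUID_FEATURES, eax, ebx, ecx, edx);
            return eax & (1u << KVM_FEATURE_PV_UNHALT);
    }
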
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index ea113b5d87a4..022198e389d7 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -64,3 +64,17 @@ Purpose: To enable communication between the hypervisor and guest there is a
 shared page that contains parts of supervisor visible register state.
 The guest can map this shared page to access its supervisor register through
 memory using this hypercall.
+
+5. KVM_HC_KICK_CPU
+------------------------
+Architecture: x86
+Status: active
+Purpose: Hypercall used to wakeup a vcpu from HLT state
+Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest
+kernel mode for an event to occur (ex: a spinlock to become available) can
+execute HLT instruction once it has busy-waited for more than a threshold
+time-interval. Execution of HLT instruction would cause the hypervisor to put
+the vcpu to sleep until occurence of an appropriate event. Another vcpu of the
+same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall,
+specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0)
+is used in the hypercall for future use.
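
For illustration, the calling convention this document describes (hypercall
number in %rax, arguments a0/a1 in %rbx/%rcx, vmcall as the trap instruction
on Intel, vmmcall on AMD) makes the guest-side kick roughly the sketch below.
KVM_HC_KICK_CPU is 5 in include/uapi/linux/kvm_para.h at this point; verify
against your headers, and note the a0 value is a placeholder since that
argument is reserved for future use:

    #define KVM_HC_KICK_CPU 5

    static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
                                      unsigned long p2)
    {
            long ret;

            asm volatile("vmcall"           /* vmmcall on AMD hosts */
                         : "=a"(ret)
                         : "a"(nr), "b"(p1), "c"(p2)
                         : "memory");
            return ret;
    }

    static void kvm_kick_cpu(int apicid)
    {
            /* a0 = 0 (reserved), a1 = APIC ID of the halted vcpu */
            kvm_hypercall2(KVM_HC_KICK_CPU, 0, apicid);
    }
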
diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig
index 62e968cac9dc..1f36b823905f 100644
--- a/arch/arm/configs/keystone_defconfig
+++ b/arch/arm/configs/keystone_defconfig
@@ -104,6 +104,7 @@ CONFIG_IP_SCTP=y
 CONFIG_VLAN_8021Q=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_MTD=y
 CONFIG_MTD_CMDLINE_PARTS=y
 CONFIG_MTD_BLOCK=y
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 5339e6a4d639..5465f564fdf3 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -78,6 +78,7 @@ CONFIG_MAC80211_RC_PID=y
 CONFIG_MAC80211_RC_DEFAULT_PID=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_CONNECTOR=y
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
diff --git a/arch/arm/configs/tegra_defconfig b/arch/arm/configs/tegra_defconfig
index 1effb43dab80..92d0a149aeb5 100644
--- a/arch/arm/configs/tegra_defconfig
+++ b/arch/arm/configs/tegra_defconfig
@@ -79,6 +79,7 @@ CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 # CONFIG_FIRMWARE_IN_KERNEL is not set
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_MTD=y
 CONFIG_MTD_M25P80=y
 CONFIG_PROC_DEVICETREE=y
diff --git a/arch/arm/include/asm/dma-contiguous.h b/arch/arm/include/asm/dma-contiguous.h
index 3ed37b4d93da..e072bb2ba1b1 100644
--- a/arch/arm/include/asm/dma-contiguous.h
+++ b/arch/arm/include/asm/dma-contiguous.h
@@ -2,7 +2,7 @@
 #define ASMARM_DMA_CONTIGUOUS_H
 
 #ifdef __KERNEL__
-#ifdef CONFIG_CMA
+#ifdef CONFIG_DMA_CMA
 
 #include <linux/types.h>
 #include <asm-generic/dma-contiguous.h>
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 472ac7091003..9b28c41f4ba9 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -64,7 +64,7 @@ void kvm_clear_hyp_idmap(void);
 
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
-        pte_val(*pte) = new_pte;
+        *pte = new_pte;
         /*
          * flush_pmd_entry just takes a void pointer and cleans the necessary
          * cache entries, so we can reuse the function for ptes.
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 741f66a2edbd..9c697db2787e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -219,6 +219,10 @@ long kvm_arch_dev_ioctl(struct file *filp,
         return -EINVAL;
 }
 
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
                                    struct kvm_userspace_memory_region *mem,
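
The empty stub here (and the matching ones added for ia64 and MIPS below)
satisfies a new arch hook: generic KVM now notifies the architecture once a
new memslot array has been published so that, e.g., x86 can refresh its MMIO
generation. A simplified sketch of the generic-side caller, not the literal
virt/kvm/kvm_main.c code:

    /* Simplified: publish the new memslots, wait out readers, notify arch. */
    static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
                                                     struct kvm_memslots *slots)
    {
            struct kvm_memslots *old_memslots = kvm->memslots;

            rcu_assign_pointer(kvm->memslots, slots);
            synchronize_srcu_expedited(&kvm->srcu);

            kvm_arch_memslots_updated(kvm);         /* the new hook */

            return old_memslots;
    }
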
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 16cd4ba5d7fd..85dd84b10687 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -492,10 +492,10 @@ __kvm_hyp_code_end:
         .section ".rodata"
 
 und_die_str:
-        .ascii  "unexpected undefined exception in Hyp mode at: %#08x"
+        .ascii  "unexpected undefined exception in Hyp mode at: %#08x\n"
 pabt_die_str:
-        .ascii  "unexpected prefetch abort in Hyp mode at: %#08x"
+        .ascii  "unexpected prefetch abort in Hyp mode at: %#08x\n"
 dabt_die_str:
-        .ascii  "unexpected data abort in Hyp mode at: %#08x"
+        .ascii  "unexpected data abort in Hyp mode at: %#08x\n"
 svc_die_str:
-        .ascii  "unexpected HVC/SVC trap in Hyp mode at: %#08x"
+        .ascii  "unexpected HVC/SVC trap in Hyp mode at: %#08x\n"
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index b7840e7aa452..71e08baee209 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -40,7 +40,7 @@ static struct kvm_regs a15_regs_reset = {
 };
 
 static const struct kvm_irq_level a15_vtimer_irq = {
-        .irq = 27,
+        { .irq = 27 },
         .level = 1,
 };
 
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index a8e73ed5ad5b..b1d640f78623 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -59,10 +59,9 @@ TRACE_EVENT(kvm_guest_fault,
                 __entry->ipa = ipa;
         ),
 
-        TP_printk("guest fault at PC %#08lx (hxfar %#08lx, "
-                  "ipa %#16llx, hsr %#08lx",
-                  __entry->vcpu_pc, __entry->hxfar,
-                  __entry->ipa, __entry->hsr)
+        TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+                  __entry->ipa, __entry->hsr,
+                  __entry->hxfar, __entry->vcpu_pc)
 );
 
 TRACE_EVENT(kvm_irq_line,
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7f9b1798c6cf..dbddc07a3bbd 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -358,7 +358,7 @@ static int __init atomic_pool_init(void)
         if (!pages)
                 goto no_pages;
 
-        if (IS_ENABLED(CONFIG_CMA))
+        if (IS_ENABLED(CONFIG_DMA_CMA))
                 ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page,
                                               atomic_pool_init);
         else
@@ -670,7 +670,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                 addr = __alloc_simple_buffer(dev, size, gfp, &page);
         else if (!(gfp & __GFP_WAIT))
                 addr = __alloc_from_pool(size, &page);
-        else if (!IS_ENABLED(CONFIG_CMA))
+        else if (!IS_ENABLED(CONFIG_DMA_CMA))
                 addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller);
         else
                 addr = __alloc_from_contiguous(dev, size, prot, &page, caller);
@@ -759,7 +759,7 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
                 __dma_free_buffer(page, size);
         } else if (__free_from_pool(cpu_addr, size)) {
                 return;
-        } else if (!IS_ENABLED(CONFIG_CMA)) {
+        } else if (!IS_ENABLED(CONFIG_DMA_CMA)) {
                 __dma_free_remap(cpu_addr, size);
                 __dma_free_buffer(page, size);
         } else {
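
The CONFIG_CMA -> CONFIG_DMA_CMA switch in these branches is purely mechanical
because IS_ENABLED() evaluates to a compile-time 0 or 1, letting the compiler
discard the untaken allocation path while still type-checking it. A
stand-alone sketch of the pattern with hypothetical helpers (not from this
patch):

    #include <linux/kconfig.h>      /* IS_ENABLED() */
    #include <linux/device.h>
    #include <linux/gfp.h>

    static void *alloc_dma_buffer(struct device *dev, size_t size, gfp_t gfp)
    {
            if (IS_ENABLED(CONFIG_DMA_CMA))
                    return alloc_from_contiguous_area(dev, size);   /* hypothetical */

            return alloc_by_remapping(dev, size, gfp);              /* hypothetical */
    }
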
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5b2dc0d10c8f..bdfd8789b376 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1560,6 +1560,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
         return 0;
 }
 
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                 struct kvm_memory_slot *memslot,
                 struct kvm_userspace_memory_region *mem,
diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index dca2aa665993..bbace092ad0a 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -1,13 +1,13 @@
 /*
-* This file is subject to the terms and conditions of the GNU General Public
-* License. See the file "COPYING" in the main directory of this archive
-* for more details.
-*
-* Main entry point for the guest, exception handling.
-*
-* Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
-* Authors: Sanjay Lal <sanjayl@kymasys.com>
-*/
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
 
 #include <asm/asm.h>
 #include <asm/asmmacro.h>
@@ -55,195 +55,193 @@
55 * a0: run 55 * a0: run
56 * a1: vcpu 56 * a1: vcpu
57 */ 57 */
58 .set noreorder
59 .set noat
58 60
59FEXPORT(__kvm_mips_vcpu_run) 61FEXPORT(__kvm_mips_vcpu_run)
60 .set push 62 /* k0/k1 not being used in host kernel context */
61 .set noreorder 63 INT_ADDIU k1, sp, -PT_SIZE
62 .set noat 64 LONG_S $0, PT_R0(k1)
63 65 LONG_S $1, PT_R1(k1)
64 /* k0/k1 not being used in host kernel context */ 66 LONG_S $2, PT_R2(k1)
65 addiu k1,sp, -PT_SIZE 67 LONG_S $3, PT_R3(k1)
66 LONG_S $0, PT_R0(k1) 68
67 LONG_S $1, PT_R1(k1) 69 LONG_S $4, PT_R4(k1)
68 LONG_S $2, PT_R2(k1) 70 LONG_S $5, PT_R5(k1)
69 LONG_S $3, PT_R3(k1) 71 LONG_S $6, PT_R6(k1)
70 72 LONG_S $7, PT_R7(k1)
71 LONG_S $4, PT_R4(k1) 73
72 LONG_S $5, PT_R5(k1) 74 LONG_S $8, PT_R8(k1)
73 LONG_S $6, PT_R6(k1) 75 LONG_S $9, PT_R9(k1)
74 LONG_S $7, PT_R7(k1) 76 LONG_S $10, PT_R10(k1)
75 77 LONG_S $11, PT_R11(k1)
76 LONG_S $8, PT_R8(k1) 78 LONG_S $12, PT_R12(k1)
77 LONG_S $9, PT_R9(k1) 79 LONG_S $13, PT_R13(k1)
78 LONG_S $10, PT_R10(k1) 80 LONG_S $14, PT_R14(k1)
79 LONG_S $11, PT_R11(k1) 81 LONG_S $15, PT_R15(k1)
80 LONG_S $12, PT_R12(k1) 82 LONG_S $16, PT_R16(k1)
81 LONG_S $13, PT_R13(k1) 83 LONG_S $17, PT_R17(k1)
82 LONG_S $14, PT_R14(k1) 84
83 LONG_S $15, PT_R15(k1) 85 LONG_S $18, PT_R18(k1)
84 LONG_S $16, PT_R16(k1) 86 LONG_S $19, PT_R19(k1)
85 LONG_S $17, PT_R17(k1) 87 LONG_S $20, PT_R20(k1)
86 88 LONG_S $21, PT_R21(k1)
87 LONG_S $18, PT_R18(k1) 89 LONG_S $22, PT_R22(k1)
88 LONG_S $19, PT_R19(k1) 90 LONG_S $23, PT_R23(k1)
89 LONG_S $20, PT_R20(k1) 91 LONG_S $24, PT_R24(k1)
90 LONG_S $21, PT_R21(k1) 92 LONG_S $25, PT_R25(k1)
91 LONG_S $22, PT_R22(k1)
92 LONG_S $23, PT_R23(k1)
93 LONG_S $24, PT_R24(k1)
94 LONG_S $25, PT_R25(k1)
95 93
96 /* XXXKYMA k0/k1 not saved, not being used if we got here through an ioctl() */ 94 /* XXXKYMA k0/k1 not saved, not being used if we got here through an ioctl() */
97 95
98 LONG_S $28, PT_R28(k1) 96 LONG_S $28, PT_R28(k1)
99 LONG_S $29, PT_R29(k1) 97 LONG_S $29, PT_R29(k1)
100 LONG_S $30, PT_R30(k1) 98 LONG_S $30, PT_R30(k1)
101 LONG_S $31, PT_R31(k1) 99 LONG_S $31, PT_R31(k1)
102 100
103 /* Save hi/lo */ 101 /* Save hi/lo */
104 mflo v0 102 mflo v0
105 LONG_S v0, PT_LO(k1) 103 LONG_S v0, PT_LO(k1)
106 mfhi v1 104 mfhi v1
107 LONG_S v1, PT_HI(k1) 105 LONG_S v1, PT_HI(k1)
108 106
109 /* Save host status */ 107 /* Save host status */
110 mfc0 v0, CP0_STATUS 108 mfc0 v0, CP0_STATUS
111 LONG_S v0, PT_STATUS(k1) 109 LONG_S v0, PT_STATUS(k1)
112 110
113 /* Save host ASID, shove it into the BVADDR location */ 111 /* Save host ASID, shove it into the BVADDR location */
114 mfc0 v1,CP0_ENTRYHI 112 mfc0 v1, CP0_ENTRYHI
115 andi v1, 0xff 113 andi v1, 0xff
116 LONG_S v1, PT_HOST_ASID(k1) 114 LONG_S v1, PT_HOST_ASID(k1)
117 115
118 /* Save DDATA_LO, will be used to store pointer to vcpu */ 116 /* Save DDATA_LO, will be used to store pointer to vcpu */
119 mfc0 v1, CP0_DDATA_LO 117 mfc0 v1, CP0_DDATA_LO
120 LONG_S v1, PT_HOST_USERLOCAL(k1) 118 LONG_S v1, PT_HOST_USERLOCAL(k1)
121 119
122 /* DDATA_LO has pointer to vcpu */ 120 /* DDATA_LO has pointer to vcpu */
123 mtc0 a1,CP0_DDATA_LO 121 mtc0 a1, CP0_DDATA_LO
124 122
125 /* Offset into vcpu->arch */ 123 /* Offset into vcpu->arch */
126 addiu k1, a1, VCPU_HOST_ARCH 124 INT_ADDIU k1, a1, VCPU_HOST_ARCH
127 125
128 /* Save the host stack to VCPU, used for exception processing when we exit from the Guest */ 126 /*
129 LONG_S sp, VCPU_HOST_STACK(k1) 127 * Save the host stack to VCPU, used for exception processing
128 * when we exit from the Guest
129 */
130 LONG_S sp, VCPU_HOST_STACK(k1)
130 131
131 /* Save the kernel gp as well */ 132 /* Save the kernel gp as well */
132 LONG_S gp, VCPU_HOST_GP(k1) 133 LONG_S gp, VCPU_HOST_GP(k1)
133 134
134 /* Setup status register for running the guest in UM, interrupts are disabled */ 135 /* Setup status register for running the guest in UM, interrupts are disabled */
135 li k0,(ST0_EXL | KSU_USER| ST0_BEV) 136 li k0, (ST0_EXL | KSU_USER | ST0_BEV)
136 mtc0 k0,CP0_STATUS 137 mtc0 k0, CP0_STATUS
137 ehb 138 ehb
138 139
139 /* load up the new EBASE */ 140 /* load up the new EBASE */
140 LONG_L k0, VCPU_GUEST_EBASE(k1) 141 LONG_L k0, VCPU_GUEST_EBASE(k1)
141 mtc0 k0,CP0_EBASE 142 mtc0 k0, CP0_EBASE
142 143
143 /* Now that the new EBASE has been loaded, unset BEV, set interrupt mask as it was 144 /*
144 * but make sure that timer interrupts are enabled 145 * Now that the new EBASE has been loaded, unset BEV, set
145 */ 146 * interrupt mask as it was but make sure that timer interrupts
146 li k0,(ST0_EXL | KSU_USER | ST0_IE) 147 * are enabled
147 andi v0, v0, ST0_IM 148 */
148 or k0, k0, v0 149 li k0, (ST0_EXL | KSU_USER | ST0_IE)
149 mtc0 k0,CP0_STATUS 150 andi v0, v0, ST0_IM
150 ehb 151 or k0, k0, v0
152 mtc0 k0, CP0_STATUS
153 ehb
151 154
152 155
153 /* Set Guest EPC */ 156 /* Set Guest EPC */
154 LONG_L t0, VCPU_PC(k1) 157 LONG_L t0, VCPU_PC(k1)
155 mtc0 t0, CP0_EPC 158 mtc0 t0, CP0_EPC
156 159
157FEXPORT(__kvm_mips_load_asid) 160FEXPORT(__kvm_mips_load_asid)
158 /* Set the ASID for the Guest Kernel */ 161 /* Set the ASID for the Guest Kernel */
159 sll t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */ 162 INT_SLL t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */
160 /* addresses shift to 0x80000000 */ 163 /* addresses shift to 0x80000000 */
161 bltz t0, 1f /* If kernel */ 164 bltz t0, 1f /* If kernel */
162 addiu t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ 165 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */
163 addiu t1, k1, VCPU_GUEST_USER_ASID /* else user */ 166 INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */
1641: 1671:
165 /* t1: contains the base of the ASID array, need to get the cpu id */ 168 /* t1: contains the base of the ASID array, need to get the cpu id */
166 LONG_L t2, TI_CPU($28) /* smp_processor_id */ 169 LONG_L t2, TI_CPU($28) /* smp_processor_id */
167 sll t2, t2, 2 /* x4 */ 170 INT_SLL t2, t2, 2 /* x4 */
168 addu t3, t1, t2 171 REG_ADDU t3, t1, t2
169 LONG_L k0, (t3) 172 LONG_L k0, (t3)
170 andi k0, k0, 0xff 173 andi k0, k0, 0xff
171 mtc0 k0,CP0_ENTRYHI 174 mtc0 k0, CP0_ENTRYHI
172 ehb 175 ehb
173 176
174 /* Disable RDHWR access */ 177 /* Disable RDHWR access */
175 mtc0 zero, CP0_HWRENA 178 mtc0 zero, CP0_HWRENA
176 179
177 /* Now load up the Guest Context from VCPU */ 180 /* Now load up the Guest Context from VCPU */
178 LONG_L $1, VCPU_R1(k1) 181 LONG_L $1, VCPU_R1(k1)
179 LONG_L $2, VCPU_R2(k1) 182 LONG_L $2, VCPU_R2(k1)
180 LONG_L $3, VCPU_R3(k1) 183 LONG_L $3, VCPU_R3(k1)
181 184
182 LONG_L $4, VCPU_R4(k1) 185 LONG_L $4, VCPU_R4(k1)
183 LONG_L $5, VCPU_R5(k1) 186 LONG_L $5, VCPU_R5(k1)
184 LONG_L $6, VCPU_R6(k1) 187 LONG_L $6, VCPU_R6(k1)
185 LONG_L $7, VCPU_R7(k1) 188 LONG_L $7, VCPU_R7(k1)
186 189
187 LONG_L $8, VCPU_R8(k1) 190 LONG_L $8, VCPU_R8(k1)
188 LONG_L $9, VCPU_R9(k1) 191 LONG_L $9, VCPU_R9(k1)
189 LONG_L $10, VCPU_R10(k1) 192 LONG_L $10, VCPU_R10(k1)
190 LONG_L $11, VCPU_R11(k1) 193 LONG_L $11, VCPU_R11(k1)
191 LONG_L $12, VCPU_R12(k1) 194 LONG_L $12, VCPU_R12(k1)
192 LONG_L $13, VCPU_R13(k1) 195 LONG_L $13, VCPU_R13(k1)
193 LONG_L $14, VCPU_R14(k1) 196 LONG_L $14, VCPU_R14(k1)
194 LONG_L $15, VCPU_R15(k1) 197 LONG_L $15, VCPU_R15(k1)
195 LONG_L $16, VCPU_R16(k1) 198 LONG_L $16, VCPU_R16(k1)
196 LONG_L $17, VCPU_R17(k1) 199 LONG_L $17, VCPU_R17(k1)
197 LONG_L $18, VCPU_R18(k1) 200 LONG_L $18, VCPU_R18(k1)
198 LONG_L $19, VCPU_R19(k1) 201 LONG_L $19, VCPU_R19(k1)
199 LONG_L $20, VCPU_R20(k1) 202 LONG_L $20, VCPU_R20(k1)
200 LONG_L $21, VCPU_R21(k1) 203 LONG_L $21, VCPU_R21(k1)
201 LONG_L $22, VCPU_R22(k1) 204 LONG_L $22, VCPU_R22(k1)
202 LONG_L $23, VCPU_R23(k1) 205 LONG_L $23, VCPU_R23(k1)
203 LONG_L $24, VCPU_R24(k1) 206 LONG_L $24, VCPU_R24(k1)
204 LONG_L $25, VCPU_R25(k1) 207 LONG_L $25, VCPU_R25(k1)
205 208
206 /* k0/k1 loaded up later */ 209 /* k0/k1 loaded up later */
207 210
208 LONG_L $28, VCPU_R28(k1) 211 LONG_L $28, VCPU_R28(k1)
209 LONG_L $29, VCPU_R29(k1) 212 LONG_L $29, VCPU_R29(k1)
210 LONG_L $30, VCPU_R30(k1) 213 LONG_L $30, VCPU_R30(k1)
211 LONG_L $31, VCPU_R31(k1) 214 LONG_L $31, VCPU_R31(k1)
212 215
213 /* Restore hi/lo */ 216 /* Restore hi/lo */
214 LONG_L k0, VCPU_LO(k1) 217 LONG_L k0, VCPU_LO(k1)
215 mtlo k0 218 mtlo k0
216 219
217 LONG_L k0, VCPU_HI(k1) 220 LONG_L k0, VCPU_HI(k1)
218 mthi k0 221 mthi k0
219 222
220FEXPORT(__kvm_mips_load_k0k1) 223FEXPORT(__kvm_mips_load_k0k1)
221 /* Restore the guest's k0/k1 registers */ 224 /* Restore the guest's k0/k1 registers */
222 LONG_L k0, VCPU_R26(k1) 225 LONG_L k0, VCPU_R26(k1)
223 LONG_L k1, VCPU_R27(k1) 226 LONG_L k1, VCPU_R27(k1)
224 227
225 /* Jump to guest */ 228 /* Jump to guest */
226 eret 229 eret
227 .set pop
228 230
229VECTOR(MIPSX(exception), unknown) 231VECTOR(MIPSX(exception), unknown)
230/* 232/*
231 * Find out what mode we came from and jump to the proper handler. 233 * Find out what mode we came from and jump to the proper handler.
232 */ 234 */
233 .set push 235 mtc0 k0, CP0_ERROREPC #01: Save guest k0
234 .set noat 236 ehb #02:
235 .set noreorder 237
236 mtc0 k0, CP0_ERROREPC #01: Save guest k0 238 mfc0 k0, CP0_EBASE #02: Get EBASE
237 ehb #02: 239 INT_SRL k0, k0, 10 #03: Get rid of CPUNum
238 240 INT_SLL k0, k0, 10 #04
239 mfc0 k0, CP0_EBASE #02: Get EBASE 241 LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000
240 srl k0, k0, 10 #03: Get rid of CPUNum 242 INT_ADDIU k0, k0, 0x2000 #06: Exception handler is installed @ offset 0x2000
241 sll k0, k0, 10 #04 243 j k0 #07: jump to the function
242 LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000 244 nop #08: branch delay slot
243 addiu k0, k0, 0x2000 #06: Exception handler is installed @ offset 0x2000
244 j k0 #07: jump to the function
245 nop #08: branch delay slot
246 .set push
247VECTOR_END(MIPSX(exceptionEnd)) 245VECTOR_END(MIPSX(exceptionEnd))
248.end MIPSX(exception) 246.end MIPSX(exception)
249 247
@@ -253,329 +251,327 @@ VECTOR_END(MIPSX(exceptionEnd))
253 * 251 *
254 */ 252 */
255NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) 253NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
256 .set push 254 /* Get the VCPU pointer from DDTATA_LO */
257 .set noat 255 mfc0 k1, CP0_DDATA_LO
258 .set noreorder 256 INT_ADDIU k1, k1, VCPU_HOST_ARCH
259 257
260 /* Get the VCPU pointer from DDTATA_LO */ 258 /* Start saving Guest context to VCPU */
261 mfc0 k1, CP0_DDATA_LO 259 LONG_S $0, VCPU_R0(k1)
262 addiu k1, k1, VCPU_HOST_ARCH 260 LONG_S $1, VCPU_R1(k1)
263 261 LONG_S $2, VCPU_R2(k1)
264 /* Start saving Guest context to VCPU */ 262 LONG_S $3, VCPU_R3(k1)
265 LONG_S $0, VCPU_R0(k1) 263 LONG_S $4, VCPU_R4(k1)
266 LONG_S $1, VCPU_R1(k1) 264 LONG_S $5, VCPU_R5(k1)
267 LONG_S $2, VCPU_R2(k1) 265 LONG_S $6, VCPU_R6(k1)
268 LONG_S $3, VCPU_R3(k1) 266 LONG_S $7, VCPU_R7(k1)
269 LONG_S $4, VCPU_R4(k1) 267 LONG_S $8, VCPU_R8(k1)
270 LONG_S $5, VCPU_R5(k1) 268 LONG_S $9, VCPU_R9(k1)
271 LONG_S $6, VCPU_R6(k1) 269 LONG_S $10, VCPU_R10(k1)
272 LONG_S $7, VCPU_R7(k1) 270 LONG_S $11, VCPU_R11(k1)
273 LONG_S $8, VCPU_R8(k1) 271 LONG_S $12, VCPU_R12(k1)
274 LONG_S $9, VCPU_R9(k1) 272 LONG_S $13, VCPU_R13(k1)
275 LONG_S $10, VCPU_R10(k1) 273 LONG_S $14, VCPU_R14(k1)
276 LONG_S $11, VCPU_R11(k1) 274 LONG_S $15, VCPU_R15(k1)
277 LONG_S $12, VCPU_R12(k1) 275 LONG_S $16, VCPU_R16(k1)
278 LONG_S $13, VCPU_R13(k1) 276 LONG_S $17, VCPU_R17(k1)
279 LONG_S $14, VCPU_R14(k1) 277 LONG_S $18, VCPU_R18(k1)
280 LONG_S $15, VCPU_R15(k1) 278 LONG_S $19, VCPU_R19(k1)
281 LONG_S $16, VCPU_R16(k1) 279 LONG_S $20, VCPU_R20(k1)
282 LONG_S $17,VCPU_R17(k1) 280 LONG_S $21, VCPU_R21(k1)
283 LONG_S $18, VCPU_R18(k1) 281 LONG_S $22, VCPU_R22(k1)
284 LONG_S $19, VCPU_R19(k1) 282 LONG_S $23, VCPU_R23(k1)
285 LONG_S $20, VCPU_R20(k1) 283 LONG_S $24, VCPU_R24(k1)
286 LONG_S $21, VCPU_R21(k1) 284 LONG_S $25, VCPU_R25(k1)
287 LONG_S $22, VCPU_R22(k1) 285
288 LONG_S $23, VCPU_R23(k1) 286 /* Guest k0/k1 saved later */
289 LONG_S $24, VCPU_R24(k1) 287
290 LONG_S $25, VCPU_R25(k1) 288 LONG_S $28, VCPU_R28(k1)
291 289 LONG_S $29, VCPU_R29(k1)
292 /* Guest k0/k1 saved later */ 290 LONG_S $30, VCPU_R30(k1)
293 291 LONG_S $31, VCPU_R31(k1)
294 LONG_S $28, VCPU_R28(k1) 292
295 LONG_S $29, VCPU_R29(k1) 293 /* We need to save hi/lo and restore them on
296 LONG_S $30, VCPU_R30(k1) 294 * the way out
297 LONG_S $31, VCPU_R31(k1) 295 */
298 296 mfhi t0
299 /* We need to save hi/lo and restore them on 297 LONG_S t0, VCPU_HI(k1)
300 * the way out 298
301 */ 299 mflo t0
302 mfhi t0 300 LONG_S t0, VCPU_LO(k1)
303 LONG_S t0, VCPU_HI(k1) 301
304 302 /* Finally save guest k0/k1 to VCPU */
305 mflo t0 303 mfc0 t0, CP0_ERROREPC
306 LONG_S t0, VCPU_LO(k1) 304 LONG_S t0, VCPU_R26(k1)
307 305
308 /* Finally save guest k0/k1 to VCPU */ 306 /* Get GUEST k1 and save it in VCPU */
309 mfc0 t0, CP0_ERROREPC 307 PTR_LI t1, ~0x2ff
310 LONG_S t0, VCPU_R26(k1) 308 mfc0 t0, CP0_EBASE
311 309 and t0, t0, t1
312 /* Get GUEST k1 and save it in VCPU */ 310 LONG_L t0, 0x3000(t0)
313 la t1, ~0x2ff 311 LONG_S t0, VCPU_R27(k1)
314 mfc0 t0, CP0_EBASE 312
315 and t0, t0, t1 313 /* Now that context has been saved, we can use other registers */
316 LONG_L t0, 0x3000(t0) 314
317 LONG_S t0, VCPU_R27(k1) 315 /* Restore vcpu */
318 316 mfc0 a1, CP0_DDATA_LO
319 /* Now that context has been saved, we can use other registers */ 317 move s1, a1
320 318
321 /* Restore vcpu */ 319 /* Restore run (vcpu->run) */
322 mfc0 a1, CP0_DDATA_LO 320 LONG_L a0, VCPU_RUN(a1)
323 move s1, a1 321 /* Save pointer to run in s0, will be saved by the compiler */
324 322 move s0, a0
325 /* Restore run (vcpu->run) */ 323
326 LONG_L a0, VCPU_RUN(a1) 324 /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to
327 /* Save pointer to run in s0, will be saved by the compiler */ 325 * process the exception */
328 move s0, a0 326 mfc0 k0,CP0_EPC
329 327 LONG_S k0, VCPU_PC(k1)
330 328
331 /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to process the exception */ 329 mfc0 k0, CP0_BADVADDR
332 mfc0 k0,CP0_EPC 330 LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1)
333 LONG_S k0, VCPU_PC(k1) 331
334 332 mfc0 k0, CP0_CAUSE
335 mfc0 k0, CP0_BADVADDR 333 LONG_S k0, VCPU_HOST_CP0_CAUSE(k1)
336 LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) 334
337 335 mfc0 k0, CP0_ENTRYHI
338 mfc0 k0, CP0_CAUSE 336 LONG_S k0, VCPU_HOST_ENTRYHI(k1)
339 LONG_S k0, VCPU_HOST_CP0_CAUSE(k1) 337
340 338 /* Now restore the host state just enough to run the handlers */
341 mfc0 k0, CP0_ENTRYHI 339
342 LONG_S k0, VCPU_HOST_ENTRYHI(k1) 340 /* Swtich EBASE to the one used by Linux */
343 341 /* load up the host EBASE */
344 /* Now restore the host state just enough to run the handlers */ 342 mfc0 v0, CP0_STATUS
345 343
346 /* Swtich EBASE to the one used by Linux */ 344 .set at
347 /* load up the host EBASE */ 345 or k0, v0, ST0_BEV
348 mfc0 v0, CP0_STATUS 346 .set noat
349 347
350 .set at 348 mtc0 k0, CP0_STATUS
351 or k0, v0, ST0_BEV 349 ehb
352 .set noat 350
353 351 LONG_L k0, VCPU_HOST_EBASE(k1)
354 mtc0 k0, CP0_STATUS 352 mtc0 k0,CP0_EBASE
355 ehb 353
356
357 LONG_L k0, VCPU_HOST_EBASE(k1)
358 mtc0 k0,CP0_EBASE
359
360
361 /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
362 .set at
363 and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
364 or v0, v0, ST0_CU0
365 .set noat
366 mtc0 v0, CP0_STATUS
367 ehb
368
369 /* Load up host GP */
370 LONG_L gp, VCPU_HOST_GP(k1)
371
372 /* Need a stack before we can jump to "C" */
373 LONG_L sp, VCPU_HOST_STACK(k1)
374
375 /* Saved host state */
376 addiu sp,sp, -PT_SIZE
377 354
378 /* XXXKYMA do we need to load the host ASID, maybe not because the 355 /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
379 * kernel entries are marked GLOBAL, need to verify 356 .set at
380 */ 357 and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
358 or v0, v0, ST0_CU0
359 .set noat
360 mtc0 v0, CP0_STATUS
361 ehb
362
363 /* Load up host GP */
364 LONG_L gp, VCPU_HOST_GP(k1)
365
366 /* Need a stack before we can jump to "C" */
367 LONG_L sp, VCPU_HOST_STACK(k1)
368
369 /* Saved host state */
370 INT_ADDIU sp, sp, -PT_SIZE
381 371
382 /* Restore host DDATA_LO */ 372 /* XXXKYMA do we need to load the host ASID, maybe not because the
383 LONG_L k0, PT_HOST_USERLOCAL(sp) 373 * kernel entries are marked GLOBAL, need to verify
384 mtc0 k0, CP0_DDATA_LO 374 */
385 375
386 /* Restore RDHWR access */ 376 /* Restore host DDATA_LO */
387 la k0, 0x2000000F 377 LONG_L k0, PT_HOST_USERLOCAL(sp)
388 mtc0 k0, CP0_HWRENA 378 mtc0 k0, CP0_DDATA_LO
389 379
390 /* Jump to handler */ 380 /* Restore RDHWR access */
381 PTR_LI k0, 0x2000000F
382 mtc0 k0, CP0_HWRENA
383
384 /* Jump to handler */
391FEXPORT(__kvm_mips_jump_to_handler) 385FEXPORT(__kvm_mips_jump_to_handler)
392 /* XXXKYMA: not sure if this is safe, how large is the stack?? */ 386 /* XXXKYMA: not sure if this is safe, how large is the stack??
393 /* Now jump to the kvm_mips_handle_exit() to see if we can deal with this in the kernel */ 387 * Now jump to the kvm_mips_handle_exit() to see if we can deal
394 la t9,kvm_mips_handle_exit 388 * with this in the kernel */
395 jalr.hb t9 389 PTR_LA t9, kvm_mips_handle_exit
396 addiu sp,sp, -CALLFRAME_SIZ /* BD Slot */ 390 jalr.hb t9
397 391 INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */
398 /* Return from handler Make sure interrupts are disabled */ 392
399 di 393 /* Return from handler Make sure interrupts are disabled */
400 ehb 394 di
401 395 ehb
402 /* XXXKYMA: k0/k1 could have been blown away if we processed an exception 396
403 * while we were handling the exception from the guest, reload k1 397 /* XXXKYMA: k0/k1 could have been blown away if we processed
404 */ 398 * an exception while we were handling the exception from the
405 move k1, s1 399 * guest, reload k1
406 addiu k1, k1, VCPU_HOST_ARCH 400 */
407 401
408 /* Check return value, should tell us if we are returning to the host (handle I/O etc) 402 move k1, s1
409 * or resuming the guest 403 INT_ADDIU k1, k1, VCPU_HOST_ARCH
410 */ 404
411 andi t0, v0, RESUME_HOST 405 /* Check return value, should tell us if we are returning to the
412 bnez t0, __kvm_mips_return_to_host 406 * host (handle I/O etc)or resuming the guest
413 nop 407 */
408 andi t0, v0, RESUME_HOST
409 bnez t0, __kvm_mips_return_to_host
410 nop
414 411
415__kvm_mips_return_to_guest: 412__kvm_mips_return_to_guest:
416 /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ 413 /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
417 mtc0 s1, CP0_DDATA_LO 414 mtc0 s1, CP0_DDATA_LO
418
419 /* Load up the Guest EBASE to minimize the window where BEV is set */
420 LONG_L t0, VCPU_GUEST_EBASE(k1)
421
422 /* Switch EBASE back to the one used by KVM */
423 mfc0 v1, CP0_STATUS
424 .set at
425 or k0, v1, ST0_BEV
426 .set noat
427 mtc0 k0, CP0_STATUS
428 ehb
429 mtc0 t0,CP0_EBASE
430
431 /* Setup status register for running guest in UM */
432 .set at
433 or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
434 and v1, v1, ~ST0_CU0
435 .set noat
436 mtc0 v1, CP0_STATUS
437 ehb
438 415
416 /* Load up the Guest EBASE to minimize the window where BEV is set */
417 LONG_L t0, VCPU_GUEST_EBASE(k1)
418
419 /* Switch EBASE back to the one used by KVM */
420 mfc0 v1, CP0_STATUS
421 .set at
422 or k0, v1, ST0_BEV
423 .set noat
424 mtc0 k0, CP0_STATUS
425 ehb
426 mtc0 t0, CP0_EBASE
427
428 /* Setup status register for running guest in UM */
429 .set at
430 or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
431 and v1, v1, ~ST0_CU0
432 .set noat
433 mtc0 v1, CP0_STATUS
434 ehb
439 435
440 /* Set Guest EPC */ 436 /* Set Guest EPC */
441 LONG_L t0, VCPU_PC(k1) 437 LONG_L t0, VCPU_PC(k1)
442 mtc0 t0, CP0_EPC 438 mtc0 t0, CP0_EPC
443 439
444 /* Set the ASID for the Guest Kernel */ 440 /* Set the ASID for the Guest Kernel */
445 sll t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */ 441 INT_SLL t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */
446 /* addresses shift to 0x80000000 */ 442 /* addresses shift to 0x80000000 */
447 bltz t0, 1f /* If kernel */ 443 bltz t0, 1f /* If kernel */
448 addiu t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ 444 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */
449 addiu t1, k1, VCPU_GUEST_USER_ASID /* else user */ 445 INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */
4501: 4461:
451 /* t1: contains the base of the ASID array, need to get the cpu id */ 447 /* t1: contains the base of the ASID array, need to get the cpu id */
452 LONG_L t2, TI_CPU($28) /* smp_processor_id */ 448 LONG_L t2, TI_CPU($28) /* smp_processor_id */
453 sll t2, t2, 2 /* x4 */ 449 INT_SLL t2, t2, 2 /* x4 */
454 addu t3, t1, t2 450 REG_ADDU t3, t1, t2
455 LONG_L k0, (t3) 451 LONG_L k0, (t3)
456 andi k0, k0, 0xff 452 andi k0, k0, 0xff
457 mtc0 k0,CP0_ENTRYHI 453 mtc0 k0,CP0_ENTRYHI
458 ehb 454 ehb
459 455
460 /* Disable RDHWR access */ 456 /* Disable RDHWR access */
461 mtc0 zero, CP0_HWRENA 457 mtc0 zero, CP0_HWRENA
462 458
463 /* load the guest context from VCPU and return */ 459 /* load the guest context from VCPU and return */
464 LONG_L $0, VCPU_R0(k1) 460 LONG_L $0, VCPU_R0(k1)
465 LONG_L $1, VCPU_R1(k1) 461 LONG_L $1, VCPU_R1(k1)
466 LONG_L $2, VCPU_R2(k1) 462 LONG_L $2, VCPU_R2(k1)
467 LONG_L $3, VCPU_R3(k1) 463 LONG_L $3, VCPU_R3(k1)
468 LONG_L $4, VCPU_R4(k1) 464 LONG_L $4, VCPU_R4(k1)
469 LONG_L $5, VCPU_R5(k1) 465 LONG_L $5, VCPU_R5(k1)
470 LONG_L $6, VCPU_R6(k1) 466 LONG_L $6, VCPU_R6(k1)
471 LONG_L $7, VCPU_R7(k1) 467 LONG_L $7, VCPU_R7(k1)
472 LONG_L $8, VCPU_R8(k1) 468 LONG_L $8, VCPU_R8(k1)
473 LONG_L $9, VCPU_R9(k1) 469 LONG_L $9, VCPU_R9(k1)
474 LONG_L $10, VCPU_R10(k1) 470 LONG_L $10, VCPU_R10(k1)
475 LONG_L $11, VCPU_R11(k1) 471 LONG_L $11, VCPU_R11(k1)
476 LONG_L $12, VCPU_R12(k1) 472 LONG_L $12, VCPU_R12(k1)
477 LONG_L $13, VCPU_R13(k1) 473 LONG_L $13, VCPU_R13(k1)
478 LONG_L $14, VCPU_R14(k1) 474 LONG_L $14, VCPU_R14(k1)
479 LONG_L $15, VCPU_R15(k1) 475 LONG_L $15, VCPU_R15(k1)
480 LONG_L $16, VCPU_R16(k1) 476 LONG_L $16, VCPU_R16(k1)
481 LONG_L $17, VCPU_R17(k1) 477 LONG_L $17, VCPU_R17(k1)
482 LONG_L $18, VCPU_R18(k1) 478 LONG_L $18, VCPU_R18(k1)
483 LONG_L $19, VCPU_R19(k1) 479 LONG_L $19, VCPU_R19(k1)
484 LONG_L $20, VCPU_R20(k1) 480 LONG_L $20, VCPU_R20(k1)
485 LONG_L $21, VCPU_R21(k1) 481 LONG_L $21, VCPU_R21(k1)
486 LONG_L $22, VCPU_R22(k1) 482 LONG_L $22, VCPU_R22(k1)
487 LONG_L $23, VCPU_R23(k1) 483 LONG_L $23, VCPU_R23(k1)
488 LONG_L $24, VCPU_R24(k1) 484 LONG_L $24, VCPU_R24(k1)
489 LONG_L $25, VCPU_R25(k1) 485 LONG_L $25, VCPU_R25(k1)
490 486
491 /* $/k1 loaded later */ 487 /* $/k1 loaded later */
492 LONG_L $28, VCPU_R28(k1) 488 LONG_L $28, VCPU_R28(k1)
493 LONG_L $29, VCPU_R29(k1) 489 LONG_L $29, VCPU_R29(k1)
494 LONG_L $30, VCPU_R30(k1) 490 LONG_L $30, VCPU_R30(k1)
495 LONG_L $31, VCPU_R31(k1) 491 LONG_L $31, VCPU_R31(k1)
496 492
497FEXPORT(__kvm_mips_skip_guest_restore) 493FEXPORT(__kvm_mips_skip_guest_restore)
498 LONG_L k0, VCPU_HI(k1) 494 LONG_L k0, VCPU_HI(k1)
499 mthi k0 495 mthi k0
500 496
501 LONG_L k0, VCPU_LO(k1) 497 LONG_L k0, VCPU_LO(k1)
502 mtlo k0 498 mtlo k0
503 499
504 LONG_L k0, VCPU_R26(k1) 500 LONG_L k0, VCPU_R26(k1)
505 LONG_L k1, VCPU_R27(k1) 501 LONG_L k1, VCPU_R27(k1)
506 502
507 eret 503 eret
508 504
509__kvm_mips_return_to_host: 505__kvm_mips_return_to_host:
510 /* EBASE is already pointing to Linux */ 506 /* EBASE is already pointing to Linux */
511 LONG_L k1, VCPU_HOST_STACK(k1) 507 LONG_L k1, VCPU_HOST_STACK(k1)
512 addiu k1,k1, -PT_SIZE 508 INT_ADDIU k1,k1, -PT_SIZE
513 509
514 /* Restore host DDATA_LO */ 510 /* Restore host DDATA_LO */
515 LONG_L k0, PT_HOST_USERLOCAL(k1) 511 LONG_L k0, PT_HOST_USERLOCAL(k1)
516 mtc0 k0, CP0_DDATA_LO 512 mtc0 k0, CP0_DDATA_LO
517 513
518 /* Restore host ASID */ 514 /* Restore host ASID */
519 LONG_L k0, PT_HOST_ASID(sp) 515 LONG_L k0, PT_HOST_ASID(sp)
520 andi k0, 0xff 516 andi k0, 0xff
521 mtc0 k0,CP0_ENTRYHI 517 mtc0 k0,CP0_ENTRYHI
522 ehb 518 ehb
523 519
524 /* Load context saved on the host stack */ 520 /* Load context saved on the host stack */
525 LONG_L $0, PT_R0(k1) 521 LONG_L $0, PT_R0(k1)
526 LONG_L $1, PT_R1(k1) 522 LONG_L $1, PT_R1(k1)
527 523
528 /* r2/v0 is the return code, shift it down by 2 (arithmetic) to recover the err code */ 524 /* r2/v0 is the return code, shift it down by 2 (arithmetic)
529 sra k0, v0, 2 525 * to recover the err code */
530 move $2, k0 526 INT_SRA k0, v0, 2
531 527 move $2, k0
532 LONG_L $3, PT_R3(k1) 528
533 LONG_L $4, PT_R4(k1) 529 LONG_L $3, PT_R3(k1)
534 LONG_L $5, PT_R5(k1) 530 LONG_L $4, PT_R4(k1)
535 LONG_L $6, PT_R6(k1) 531 LONG_L $5, PT_R5(k1)
536 LONG_L $7, PT_R7(k1) 532 LONG_L $6, PT_R6(k1)
537 LONG_L $8, PT_R8(k1) 533 LONG_L $7, PT_R7(k1)
538 LONG_L $9, PT_R9(k1) 534 LONG_L $8, PT_R8(k1)
539 LONG_L $10, PT_R10(k1) 535 LONG_L $9, PT_R9(k1)
540 LONG_L $11, PT_R11(k1) 536 LONG_L $10, PT_R10(k1)
541 LONG_L $12, PT_R12(k1) 537 LONG_L $11, PT_R11(k1)
542 LONG_L $13, PT_R13(k1) 538 LONG_L $12, PT_R12(k1)
543 LONG_L $14, PT_R14(k1) 539 LONG_L $13, PT_R13(k1)
544 LONG_L $15, PT_R15(k1) 540 LONG_L $14, PT_R14(k1)
545 LONG_L $16, PT_R16(k1) 541 LONG_L $15, PT_R15(k1)
546 LONG_L $17, PT_R17(k1) 542 LONG_L $16, PT_R16(k1)
547 LONG_L $18, PT_R18(k1) 543 LONG_L $17, PT_R17(k1)
548 LONG_L $19, PT_R19(k1) 544 LONG_L $18, PT_R18(k1)
549 LONG_L $20, PT_R20(k1) 545 LONG_L $19, PT_R19(k1)
550 LONG_L $21, PT_R21(k1) 546 LONG_L $20, PT_R20(k1)
551 LONG_L $22, PT_R22(k1) 547 LONG_L $21, PT_R21(k1)
552 LONG_L $23, PT_R23(k1) 548 LONG_L $22, PT_R22(k1)
553 LONG_L $24, PT_R24(k1) 549 LONG_L $23, PT_R23(k1)
554 LONG_L $25, PT_R25(k1) 550 LONG_L $24, PT_R24(k1)
555 551 LONG_L $25, PT_R25(k1)
556 /* Host k0/k1 were not saved */ 552
557 553 /* Host k0/k1 were not saved */
558 LONG_L $28, PT_R28(k1) 554
559 LONG_L $29, PT_R29(k1) 555 LONG_L $28, PT_R28(k1)
560 LONG_L $30, PT_R30(k1) 556 LONG_L $29, PT_R29(k1)
561 557 LONG_L $30, PT_R30(k1)
562 LONG_L k0, PT_HI(k1) 558
563 mthi k0 559 LONG_L k0, PT_HI(k1)
564 560 mthi k0
565 LONG_L k0, PT_LO(k1) 561
566 mtlo k0 562 LONG_L k0, PT_LO(k1)
567 563 mtlo k0
568 /* Restore RDHWR access */ 564
569 la k0, 0x2000000F 565 /* Restore RDHWR access */
570 mtc0 k0, CP0_HWRENA 566 PTR_LI k0, 0x2000000F
571 567 mtc0 k0, CP0_HWRENA
572 568
573 /* Restore RA, which is the address we will return to */ 569
574 LONG_L ra, PT_R31(k1) 570 /* Restore RA, which is the address we will return to */
575 j ra 571 LONG_L ra, PT_R31(k1)
576 nop 572 j ra
577 573 nop
578 .set pop 574
579VECTOR_END(MIPSX(GuestExceptionEnd)) 575VECTOR_END(MIPSX(GuestExceptionEnd))
580.end MIPSX(GuestException) 576.end MIPSX(GuestException)
581 577
@@ -627,24 +623,23 @@ MIPSX(exceptions):
 
 #define HW_SYNCI_Step  $1
 LEAF(MIPSX(SyncICache))
         .set    push
         .set    mips32r2
         beq     a1, zero, 20f
         nop
-        addu    a1, a0, a1
+        REG_ADDU a1, a0, a1
         rdhwr   v0, HW_SYNCI_Step
         beq     v0, zero, 20f
         nop
-
 10:
         synci   0(a0)
-        addu    a0, a0, v0
+        REG_ADDU a0, a0, v0
         sltu    v1, a0, a1
         bne     v1, zero, 10b
         nop
         sync
 20:
         jr.hb   ra
         nop
         .set    pop
 END(MIPSX(SyncICache))
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index dd203e59e6fd..a7b044536de4 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -208,6 +208,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
         return 0;
 }
 
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot,
                                 struct kvm_userspace_memory_region *mem,
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 08891d07aeb6..fa19e2f1a874 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -334,6 +334,27 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
         return r;
 }
 
+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+        ulong pc = kvmppc_get_pc(vcpu) - 4;
+        struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+        u32 r;
+
+        /* Load the instruction manually if it failed to do so in the
+         * exit path */
+        if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
+                kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
+
+        r = svcpu->last_inst;
+        svcpu_put(svcpu);
+        return r;
+}
+
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
         struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
@@ -446,6 +467,23 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
         return vcpu->arch.last_inst;
 }
 
+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+        ulong pc = kvmppc_get_pc(vcpu) - 4;
+
+        /* Load the instruction manually if it failed to do so in the
+         * exit path */
+        if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+                kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+        return vcpu->arch.last_inst;
+}
+
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
         return vcpu->arch.fault_dar;
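
A sketch of how the exit path can use this helper: on a system-call interrupt
the guest's SRR0 (and therefore kvmppc_get_pc()) already points past the sc,
so the handler fetches the trapping instruction with kvmppc_get_last_sc()
rather than kvmppc_get_last_inst(). The surrounding function is illustrative,
not the literal book3s_pr.c code:

    /* Illustrative: decide what to do with the sc that just trapped. */
    static int handle_guest_syscall(struct kvm_vcpu *vcpu)
    {
            u32 last_sc = kvmppc_get_last_sc(vcpu); /* instruction at pc - 4 */

            if (last_sc == KVM_INST_FETCH_FAILED)
                    return EMULATE_AGAIN;   /* retry after the page is mapped in */

            /* decode the sc level / hypercall number from last_sc here ... */
            return EMULATE_DONE;
    }
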
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index a1ecb14e4442..86d638a3b359 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -37,7 +37,7 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 #define KVM_DEFAULT_HPT_ORDER  24      /* 16MB HPT by default */
-extern int kvm_hpt_order;              /* order of preallocated HPTs */
+extern unsigned long kvm_rma_pages;
 #endif
 
 #define VRMA_VSID      0x1ffffffUL     /* 1TB VSID reserved for VRMA */
@@ -100,7 +100,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                         /* (masks depend on page size) */
                         rb |= 0x1000;           /* page encoding in LP field */
                         rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-                        rb |= (va_low & 0xfe);  /* AVAL field (P7 doesn't seem to care) */
+                        rb |= ((va_low << 4) & 0xf0);   /* AVAL field (P7 doesn't seem to care) */
                 }
         } else {
                 /* 4kB page */
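
The one-line change moves different va_low bits into the AVAL byte of rb,
which is easier to see with a concrete value. A tiny self-check using only the
two expressions from the hunk:

    #include <stdio.h>

    int main(void)
    {
            unsigned long va_low = 0x5678;  /* arbitrary example */

            /* old: keep va_low bits 1..7 in place */
            unsigned long old_aval = va_low & 0xfe;         /* = 0x78 */

            /* new: shift va_low bits 0..3 up into rb bits 4..7 */
            unsigned long new_aval = (va_low << 4) & 0xf0;  /* = 0x80 */

            printf("old=%#lx new=%#lx\n", old_aval, new_aval);
            return 0;
    }
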
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index af326cde7cb6..33283532e9d8 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -183,13 +183,9 @@ struct kvmppc_spapr_tce_table {
         struct page *pages[0];
 };
 
-struct kvmppc_linear_info {
-        void            *base_virt;
+struct kvm_rma_info {
+        atomic_t         use_count;
         unsigned long    base_pfn;
-        unsigned long    npages;
-        struct list_head list;
-        atomic_t         use_count;
-        int              type;
 };
 
 /* XICS components, defined in book3s_xics.c */
@@ -246,7 +242,7 @@ struct kvm_arch {
         int tlbie_lock;
         unsigned long lpcr;
         unsigned long rmor;
-        struct kvmppc_linear_info *rma;
+        struct kvm_rma_info *rma;
         unsigned long vrma_slb_v;
         int rma_setup_done;
         int using_mmu_notifiers;
@@ -259,7 +255,7 @@ struct kvm_arch {
         spinlock_t slot_phys_lock;
         cpumask_t need_tlb_flush;
         struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
-        struct kvmppc_linear_info *hpt_li;
+        int hpt_cma_alloc;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #ifdef CONFIG_PPC_BOOK3S_64
         struct list_head spapr_tce_tables;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a5287fe03d77..b15554a26c20 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -137,10 +137,10 @@ extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                                 unsigned long ioba, unsigned long tce);
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
                                 struct kvm_allocate_rma *rma);
-extern struct kvmppc_linear_info *kvm_alloc_rma(void);
-extern void kvm_release_rma(struct kvmppc_linear_info *ri);
-extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
-extern void kvm_release_hpt(struct kvmppc_linear_info *li);
+extern struct kvm_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvm_rma_info *ri);
+extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
+extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
@@ -261,6 +261,7 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 struct openpic;
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
+extern void kvm_cma_reserve(void) __init;
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
         paca[cpu].kvm_hstate.xics_phys = addr;
@@ -281,13 +282,12 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
 }
 
 extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
-extern void kvm_linear_init(void);
 
 #else
-static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+static inline void __init kvm_cma_reserve(void)
 {}
 
-static inline void kvm_linear_init(void)
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}
 
 static inline u32 kvmppc_get_xics_latch(void)
@@ -394,10 +394,15 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
         }
 }
 
-/* Please call after prepare_to_enter. This function puts the lazy ee state
-   back to normal mode, without actually enabling interrupts. */
-static inline void kvmppc_lazy_ee_enable(void)
+/*
+ * Please call after prepare_to_enter. This function puts the lazy ee and irq
+ * disabled tracking state back to normal mode, without actually enabling
+ * interrupts.
+ */
+static inline void kvmppc_fix_ee_before_entry(void)
 {
+        trace_hardirqs_on();
+
 #ifdef CONFIG_PPC64
         /* Only need to enable IRQs by hard enabling them after this */
         local_paca->irq_happened = 0;
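
The rename also documents the calling convention: kvmppc_prepare_to_enter()
returns with interrupts hard-disabled, and the entry paths then call this
helper so the lazy-EE bookkeeping and irq-trace state match what the guest
will effectively run with. Roughly (an illustrative sequence, not the literal
booke.c or book3s_pr.c code):

    /* Illustrative guest-entry sequence; error handling trimmed. */
    static int enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu)
    {
            int r;

            r = kvmppc_prepare_to_enter(vcpu);      /* hard-disables interrupts */
            if (r <= 0)
                    return r;                       /* pending signal/work: bail out */

            kvmppc_fix_ee_before_entry();           /* fix lazy-EE/irq-trace state */
            return __kvmppc_vcpu_run(run, vcpu);    /* low-level entry */
    }
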
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8207459efe56..d8958be5f31a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -454,6 +454,7 @@ int main(void)
         DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
         DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
 #endif
+        DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
         DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
         DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
         DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 389fb8077cc9..fe6a58c9f0b7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -229,6 +229,8 @@ void __init early_setup(unsigned long dt_ptr)
         /* Initialize the hash table or TLB handling */
         early_init_mmu();
 
+        kvm_cma_reserve();
+
         /*
          * Reserve any gigantic pages requested on the command line.
          * memblock needs to have been initialized by the time this is
@@ -609,8 +611,6 @@ void __init setup_arch(char **cmdline_p)
         /* Initialize the MMU context management stuff */
         mmu_context_init();
 
-        kvm_linear_init();
-
         /* Interrupt code needs to be 64K-aligned */
         if ((unsigned long)_stext & 0xffff)
                 panic("Kernelbase not 64K-aligned (0x%lx)!\n",
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index eb643f862579..ffaef2cb101a 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -72,6 +72,7 @@ config KVM_BOOK3S_64_HV
         bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
         depends on KVM_BOOK3S_64
         select MMU_NOTIFIER
+        select CMA
         ---help---
           Support running unmodified book3s_64 guest kernels in
           virtual machines on POWER7 and PPC970 processors that have
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 008cd856c5b5..6646c952c5e3 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -81,6 +81,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
         book3s_64_vio_hv.o \
         book3s_hv_ras.o \
         book3s_hv_builtin.o \
+        book3s_hv_cma.o \
         $(kvm-book3s_64-builtin-xics-objs-y)
 
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 739bfbadb85e..7e345e00661a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -182,10 +182,13 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
182 hva_t ptegp; 182 hva_t ptegp;
183 u64 pteg[16]; 183 u64 pteg[16];
184 u64 avpn = 0; 184 u64 avpn = 0;
185 u64 v, r;
186 u64 v_val, v_mask;
187 u64 eaddr_mask;
185 int i; 188 int i;
186 u8 key = 0; 189 u8 pp, key = 0;
187 bool found = false; 190 bool found = false;
188 int second = 0; 191 bool second = false;
189 ulong mp_ea = vcpu->arch.magic_page_ea; 192 ulong mp_ea = vcpu->arch.magic_page_ea;
190 193
191 /* Magic page override */ 194 /* Magic page override */
@@ -208,8 +211,16 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
208 goto no_seg_found; 211 goto no_seg_found;
209 212
210 avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); 213 avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
214 v_val = avpn & HPTE_V_AVPN;
215
211 if (slbe->tb) 216 if (slbe->tb)
212 avpn |= SLB_VSID_B_1T; 217 v_val |= SLB_VSID_B_1T;
218 if (slbe->large)
219 v_val |= HPTE_V_LARGE;
220 v_val |= HPTE_V_VALID;
221
222 v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID |
223 HPTE_V_SECONDARY;
213 224
214do_second: 225do_second:
215 ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); 226 ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
@@ -227,91 +238,74 @@ do_second:
227 key = 4; 238 key = 4;
228 239
229 for (i=0; i<16; i+=2) { 240 for (i=0; i<16; i+=2) {
230 u64 v = pteg[i]; 241 /* Check all relevant fields of 1st dword */
231 u64 r = pteg[i+1]; 242 if ((pteg[i] & v_mask) == v_val) {
232
233 /* Valid check */
234 if (!(v & HPTE_V_VALID))
235 continue;
236 /* Hash check */
237 if ((v & HPTE_V_SECONDARY) != second)
238 continue;
239
240 /* AVPN compare */
241 if (HPTE_V_COMPARE(avpn, v)) {
242 u8 pp = (r & HPTE_R_PP) | key;
243 int eaddr_mask = 0xFFF;
244
245 gpte->eaddr = eaddr;
246 gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu,
247 eaddr,
248 data);
249 if (slbe->large)
250 eaddr_mask = 0xFFFFFF;
251 gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask);
252 gpte->may_execute = ((r & HPTE_R_N) ? false : true);
253 gpte->may_read = false;
254 gpte->may_write = false;
255
256 switch (pp) {
257 case 0:
258 case 1:
259 case 2:
260 case 6:
261 gpte->may_write = true;
262 /* fall through */
263 case 3:
264 case 5:
265 case 7:
266 gpte->may_read = true;
267 break;
268 }
269
270 dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
271 "-> 0x%lx\n",
272 eaddr, avpn, gpte->vpage, gpte->raddr);
273 found = true; 243 found = true;
274 break; 244 break;
275 } 245 }
276 } 246 }
277 247
278 /* Update PTE R and C bits, so the guest's swapper knows we used the 248 if (!found) {
279 * page */ 249 if (second)
280 if (found) { 250 goto no_page_found;
281 u32 oldr = pteg[i+1]; 251 v_val |= HPTE_V_SECONDARY;
252 second = true;
253 goto do_second;
254 }
282 255
283 if (gpte->may_read) { 256 v = pteg[i];
284 /* Set the accessed flag */ 257 r = pteg[i+1];
285 pteg[i+1] |= HPTE_R_R; 258 pp = (r & HPTE_R_PP) | key;
286 } 259 eaddr_mask = 0xFFF;
287 if (gpte->may_write) { 260
288 /* Set the dirty flag */ 261 gpte->eaddr = eaddr;
289 pteg[i+1] |= HPTE_R_C; 262 gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
290 } else { 263 if (slbe->large)
291 dprintk("KVM: Mapping read-only page!\n"); 264 eaddr_mask = 0xFFFFFF;
292 } 265 gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask);
266 gpte->may_execute = ((r & HPTE_R_N) ? false : true);
267 gpte->may_read = false;
268 gpte->may_write = false;
269
270 switch (pp) {
271 case 0:
272 case 1:
273 case 2:
274 case 6:
275 gpte->may_write = true;
276 /* fall through */
277 case 3:
278 case 5:
279 case 7:
280 gpte->may_read = true;
281 break;
282 }
293 283
294 /* Write back into the PTEG */ 284 dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
295 if (pteg[i+1] != oldr) 285 "-> 0x%lx\n",
296 copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); 286 eaddr, avpn, gpte->vpage, gpte->raddr);
297 287
298 if (!gpte->may_read) 288 /* Update PTE R and C bits, so the guest's swapper knows we used the
299 return -EPERM; 289 * page */
300 return 0; 290 if (gpte->may_read) {
301 } else { 291 /* Set the accessed flag */
302 dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " 292 r |= HPTE_R_R;
303 "ptegp=0x%lx)\n", 293 }
304 eaddr, to_book3s(vcpu)->sdr1, ptegp); 294 if (data && gpte->may_write) {
305 for (i = 0; i < 16; i += 2) 295 /* Set the dirty flag -- XXX even if not writing */
306 dprintk(" %02d: 0x%llx - 0x%llx (0x%llx)\n", 296 r |= HPTE_R_C;
307 i, pteg[i], pteg[i+1], avpn); 297 }
308 298
309 if (!second) { 299 /* Write back into the PTEG */
310 second = HPTE_V_SECONDARY; 300 if (pteg[i+1] != r) {
311 goto do_second; 301 pteg[i+1] = r;
312 } 302 copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
313 } 303 }
314 304
305 if (!gpte->may_read)
306 return -EPERM;
307 return 0;
308
315no_page_found: 309no_page_found:
316 return -ENOENT; 310 return -ENOENT;
317 311
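
The rewritten translation loop above now locates a matching HPTE first (comparing the whole first doubleword against v_val/v_mask) and only then decodes permissions from the second doubleword; the pp value it switches on is the HPTE PP field OR-ed with the key bit (0 or 4). A minimal, userspace-compilable sketch of just that decode, mirroring the switch in the hunk; the harness around it is illustrative, not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* pp = (r & HPTE_R_PP) | key, exactly as computed in the hunk above */
static void decode_pp(unsigned int pp, bool *may_read, bool *may_write)
{
        *may_read = false;
        *may_write = false;

        switch (pp) {
        case 0:
        case 1:
        case 2:
        case 6:
                *may_write = true;
                /* fall through */
        case 3:
        case 5:
        case 7:
                *may_read = true;
                break;
        }
}

int main(void)
{
        for (unsigned int pp = 0; pp < 8; pp++) {
                bool r, w;

                decode_pp(pp, &r, &w);
                printf("pp=%u read=%d write=%d\n", pp, r, w);
        }
        return 0;
}

pp = 4 matches no case, so it decodes to no access at all, which is why gpte->may_read and gpte->may_write both start out false.
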
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 710d31317d81..043eec8461e7 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,6 +37,8 @@
37#include <asm/ppc-opcode.h> 37#include <asm/ppc-opcode.h>
38#include <asm/cputable.h> 38#include <asm/cputable.h>
39 39
40#include "book3s_hv_cma.h"
41
40/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ 42/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
41#define MAX_LPID_970 63 43#define MAX_LPID_970 63
42 44
@@ -52,8 +54,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
52{ 54{
53 unsigned long hpt; 55 unsigned long hpt;
54 struct revmap_entry *rev; 56 struct revmap_entry *rev;
55 struct kvmppc_linear_info *li; 57 struct page *page = NULL;
56 long order = kvm_hpt_order; 58 long order = KVM_DEFAULT_HPT_ORDER;
57 59
58 if (htab_orderp) { 60 if (htab_orderp) {
59 order = *htab_orderp; 61 order = *htab_orderp;
@@ -61,26 +63,23 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
61 order = PPC_MIN_HPT_ORDER; 63 order = PPC_MIN_HPT_ORDER;
62 } 64 }
63 65
66 kvm->arch.hpt_cma_alloc = 0;
64 /* 67 /*
65 * If the user wants a different size from default,
66 * try first to allocate it from the kernel page allocator. 68 * try first to allocate it from the kernel page allocator.
69 * We keep the CMA reserved for failed allocation.
67 */ 70 */
68 hpt = 0; 71 hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT |
69 if (order != kvm_hpt_order) { 72 __GFP_NOWARN, order - PAGE_SHIFT);
70 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
71 __GFP_NOWARN, order - PAGE_SHIFT);
72 if (!hpt)
73 --order;
74 }
75 73
76 /* Next try to allocate from the preallocated pool */ 74 /* Next try to allocate from the preallocated pool */
77 if (!hpt) { 75 if (!hpt) {
78 li = kvm_alloc_hpt(); 76 VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
79 if (li) { 77 page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
80 hpt = (ulong)li->base_virt; 78 if (page) {
81 kvm->arch.hpt_li = li; 79 hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
82 order = kvm_hpt_order; 80 kvm->arch.hpt_cma_alloc = 1;
83 } 81 } else
82 --order;
84 } 83 }
85 84
86 /* Lastly try successively smaller sizes from the page allocator */ 85 /* Lastly try successively smaller sizes from the page allocator */
@@ -118,8 +117,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
118 return 0; 117 return 0;
119 118
120 out_freehpt: 119 out_freehpt:
121 if (kvm->arch.hpt_li) 120 if (kvm->arch.hpt_cma_alloc)
122 kvm_release_hpt(kvm->arch.hpt_li); 121 kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
123 else 122 else
124 free_pages(hpt, order - PAGE_SHIFT); 123 free_pages(hpt, order - PAGE_SHIFT);
125 return -ENOMEM; 124 return -ENOMEM;
@@ -165,8 +164,9 @@ void kvmppc_free_hpt(struct kvm *kvm)
165{ 164{
166 kvmppc_free_lpid(kvm->arch.lpid); 165 kvmppc_free_lpid(kvm->arch.lpid);
167 vfree(kvm->arch.revmap); 166 vfree(kvm->arch.revmap);
168 if (kvm->arch.hpt_li) 167 if (kvm->arch.hpt_cma_alloc)
169 kvm_release_hpt(kvm->arch.hpt_li); 168 kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
169 1 << (kvm->arch.hpt_order - PAGE_SHIFT));
170 else 170 else
171 free_pages(kvm->arch.hpt_virt, 171 free_pages(kvm->arch.hpt_virt,
172 kvm->arch.hpt_order - PAGE_SHIFT); 172 kvm->arch.hpt_order - PAGE_SHIFT);
@@ -1579,7 +1579,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1579 ctx->first_pass = 1; 1579 ctx->first_pass = 1;
1580 1580
1581 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; 1581 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1582 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag); 1582 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
1583 if (ret < 0) { 1583 if (ret < 0) {
1584 kvm_put_kvm(kvm); 1584 kvm_put_kvm(kvm);
1585 return ret; 1585 return ret;
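
The reworked kvmppc_alloc_hpt() above flips the old policy: it always tries the kernel page allocator at the requested order first, keeps the CMA reserve as the fallback, and only then steps down to smaller orders. A simplified, userspace-compilable model of that ordering (my sketch, not the kernel function); try_buddy() and try_cma() stand in for __get_free_pages() and kvm_alloc_hpt(), and MIN_ORDER is a placeholder for PPC_MIN_HPT_ORDER, whose value is not shown in this hunk:

#include <stdbool.h>
#include <stdio.h>

#define MIN_ORDER 18                    /* placeholder for PPC_MIN_HPT_ORDER */

static bool try_buddy(long order)       /* stands in for __get_free_pages() */
{
        return order <= 20;             /* pretend only smaller blocks are free */
}

static bool try_cma(long order)         /* stands in for kvm_alloc_hpt()/CMA */
{
        (void)order;
        return false;                   /* pretend the CMA pool is exhausted */
}

static long alloc_hpt(long order, bool *cma_alloc)
{
        *cma_alloc = false;

        if (try_buddy(order))           /* 1. page allocator, requested order */
                return order;

        if (try_cma(order)) {           /* 2. CMA reserve kept for this case */
                *cma_alloc = true;
                return order;
        }

        /* 3. successively smaller orders from the page allocator */
        for (--order; order >= MIN_ORDER; --order)
                if (try_buddy(order))
                        return order;

        return -1;                      /* -ENOMEM in the kernel */
}

int main(void)
{
        bool cma;
        long order = alloc_hpt(24, &cma);

        printf("got order %ld (%s)\n", order, cma ? "CMA" : "buddy");
        return 0;
}

The hpt_cma_alloc flag the patch adds plays the role of cma_alloc here: the free path has to know whether to hand the pages back to CMA or to free_pages().
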
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index b2d3f3b2de72..54cf9bc94dad 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -136,7 +136,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
136 mutex_unlock(&kvm->lock); 136 mutex_unlock(&kvm->lock);
137 137
138 return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, 138 return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
139 stt, O_RDWR); 139 stt, O_RDWR | O_CLOEXEC);
140 140
141fail: 141fail:
142 if (stt) { 142 if (stt) {
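
The only functional change to the TCE-table ioctl is O_CLOEXEC on the anon inode descriptor, so the fd no longer leaks into children that exec() (the same flag is added to the kvm-htab and kvm-rma fds elsewhere in this series). A small userspace illustration of the property being bought; the fcntl() route below is the after-the-fact alternative that passing O_CLOEXEC at creation time makes unnecessary:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Mark an already-open descriptor close-on-exec. In a threaded program
 * another thread can fork() and exec() between the open and the F_SETFD,
 * which is the window that O_CLOEXEC at creation time closes. */
static int set_cloexec(int fd)
{
        int flags = fcntl(fd, F_GETFD);

        if (flags < 0)
                return -1;
        return fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}

int main(void)
{
        int fd = open("/dev/null", O_RDWR);     /* stand-in for the anon fd */

        if (fd >= 0 && set_cloexec(fd) == 0)
                printf("fd %d now closes on exec\n", fd);
        if (fd >= 0)
                close(fd);
        return 0;
}
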
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 1f6344c4408d..360ce68c9809 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -458,6 +458,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
458 case SPRN_PMC4_GEKKO: 458 case SPRN_PMC4_GEKKO:
459 case SPRN_WPAR_GEKKO: 459 case SPRN_WPAR_GEKKO:
460 case SPRN_MSSSR0: 460 case SPRN_MSSSR0:
461 case SPRN_DABR:
461 break; 462 break;
462unprivileged: 463unprivileged:
463 default: 464 default:
@@ -555,6 +556,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
555 case SPRN_PMC4_GEKKO: 556 case SPRN_PMC4_GEKKO:
556 case SPRN_WPAR_GEKKO: 557 case SPRN_WPAR_GEKKO:
557 case SPRN_MSSSR0: 558 case SPRN_MSSSR0:
559 case SPRN_DABR:
558 *spr_val = 0; 560 *spr_val = 0;
559 break; 561 break;
560 default: 562 default:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7629cd3eb91a..b0ee3bc9ca76 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -680,13 +680,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
680} 680}
681 681
682int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 682int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
683 struct kvm_sregs *sregs) 683 struct kvm_sregs *sregs)
684{ 684{
685 int i; 685 int i;
686 686
687 sregs->pvr = vcpu->arch.pvr;
688
689 memset(sregs, 0, sizeof(struct kvm_sregs)); 687 memset(sregs, 0, sizeof(struct kvm_sregs));
688 sregs->pvr = vcpu->arch.pvr;
690 for (i = 0; i < vcpu->arch.slb_max; i++) { 689 for (i = 0; i < vcpu->arch.slb_max; i++) {
691 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; 690 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
692 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; 691 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
@@ -696,7 +695,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
696} 695}
697 696
698int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 697int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
699 struct kvm_sregs *sregs) 698 struct kvm_sregs *sregs)
700{ 699{
701 int i, j; 700 int i, j;
702 701
@@ -1511,10 +1510,10 @@ static inline int lpcr_rmls(unsigned long rma_size)
1511 1510
1512static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1511static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1513{ 1512{
1514 struct kvmppc_linear_info *ri = vma->vm_file->private_data;
1515 struct page *page; 1513 struct page *page;
1514 struct kvm_rma_info *ri = vma->vm_file->private_data;
1516 1515
1517 if (vmf->pgoff >= ri->npages) 1516 if (vmf->pgoff >= kvm_rma_pages)
1518 return VM_FAULT_SIGBUS; 1517 return VM_FAULT_SIGBUS;
1519 1518
1520 page = pfn_to_page(ri->base_pfn + vmf->pgoff); 1519 page = pfn_to_page(ri->base_pfn + vmf->pgoff);
@@ -1536,7 +1535,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
1536 1535
1537static int kvm_rma_release(struct inode *inode, struct file *filp) 1536static int kvm_rma_release(struct inode *inode, struct file *filp)
1538{ 1537{
1539 struct kvmppc_linear_info *ri = filp->private_data; 1538 struct kvm_rma_info *ri = filp->private_data;
1540 1539
1541 kvm_release_rma(ri); 1540 kvm_release_rma(ri);
1542 return 0; 1541 return 0;
@@ -1549,18 +1548,27 @@ static const struct file_operations kvm_rma_fops = {
1549 1548
1550long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) 1549long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
1551{ 1550{
1552 struct kvmppc_linear_info *ri;
1553 long fd; 1551 long fd;
1552 struct kvm_rma_info *ri;
1553 /*
1554 * Only do this on PPC970 in HV mode
1555 */
1556 if (!cpu_has_feature(CPU_FTR_HVMODE) ||
1557 !cpu_has_feature(CPU_FTR_ARCH_201))
1558 return -EINVAL;
1559
1560 if (!kvm_rma_pages)
1561 return -EINVAL;
1554 1562
1555 ri = kvm_alloc_rma(); 1563 ri = kvm_alloc_rma();
1556 if (!ri) 1564 if (!ri)
1557 return -ENOMEM; 1565 return -ENOMEM;
1558 1566
1559 fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR); 1567 fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC);
1560 if (fd < 0) 1568 if (fd < 0)
1561 kvm_release_rma(ri); 1569 kvm_release_rma(ri);
1562 1570
1563 ret->rma_size = ri->npages << PAGE_SHIFT; 1571 ret->rma_size = kvm_rma_pages << PAGE_SHIFT;
1564 return fd; 1572 return fd;
1565} 1573}
1566 1574
@@ -1725,7 +1733,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1725{ 1733{
1726 int err = 0; 1734 int err = 0;
1727 struct kvm *kvm = vcpu->kvm; 1735 struct kvm *kvm = vcpu->kvm;
1728 struct kvmppc_linear_info *ri = NULL; 1736 struct kvm_rma_info *ri = NULL;
1729 unsigned long hva; 1737 unsigned long hva;
1730 struct kvm_memory_slot *memslot; 1738 struct kvm_memory_slot *memslot;
1731 struct vm_area_struct *vma; 1739 struct vm_area_struct *vma;
@@ -1803,7 +1811,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1803 1811
1804 } else { 1812 } else {
1805 /* Set up to use an RMO region */ 1813 /* Set up to use an RMO region */
1806 rma_size = ri->npages; 1814 rma_size = kvm_rma_pages;
1807 if (rma_size > memslot->npages) 1815 if (rma_size > memslot->npages)
1808 rma_size = memslot->npages; 1816 rma_size = memslot->npages;
1809 rma_size <<= PAGE_SHIFT; 1817 rma_size <<= PAGE_SHIFT;
@@ -1831,14 +1839,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1831 /* POWER7 */ 1839 /* POWER7 */
1832 lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); 1840 lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
1833 lpcr |= rmls << LPCR_RMLS_SH; 1841 lpcr |= rmls << LPCR_RMLS_SH;
1834 kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; 1842 kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT;
1835 } 1843 }
1836 kvm->arch.lpcr = lpcr; 1844 kvm->arch.lpcr = lpcr;
1837 pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", 1845 pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
1838 ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); 1846 ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
1839 1847
1840 /* Initialize phys addrs of pages in RMO */ 1848 /* Initialize phys addrs of pages in RMO */
1841 npages = ri->npages; 1849 npages = kvm_rma_pages;
1842 porder = __ilog2(npages); 1850 porder = __ilog2(npages);
1843 physp = memslot->arch.slot_phys; 1851 physp = memslot->arch.slot_phys;
1844 if (physp) { 1852 if (physp) {
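
With the RMA now carved out of the CMA pool, its size is fixed by kvm_rma_pages, which defaults to (1 << 27) >> PAGE_SHIFT, i.e. 128 MiB regardless of the page size; that is the value kvm_vm_ioctl_allocate_rma() reports back in rma_size above. A quick arithmetic check, assuming 4 KiB pages for the example:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages assumed; ppc64 also supports 64 KiB */

int main(void)
{
        unsigned long kvm_rma_pages = (1UL << 27) >> PAGE_SHIFT;  /* 32768 pages */
        unsigned long rma_size = kvm_rma_pages << PAGE_SHIFT;     /* bytes */

        printf("kvm_rma_pages=%lu rma_size=%lu MiB\n",
               kvm_rma_pages, rma_size >> 20);
        return 0;
}

The kvm_rma_size= boot parameter can still override the default, but early_parse_rma_size() now rejects any value lpcr_rmls() does not recognize instead of silently accepting it.
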
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index ec0a9e5de100..8cd0daebb82d 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -13,33 +13,34 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/memblock.h>
17#include <linux/sizes.h>
16 18
17#include <asm/cputable.h> 19#include <asm/cputable.h>
18#include <asm/kvm_ppc.h> 20#include <asm/kvm_ppc.h>
19#include <asm/kvm_book3s.h> 21#include <asm/kvm_book3s.h>
20 22
21#define KVM_LINEAR_RMA 0 23#include "book3s_hv_cma.h"
22#define KVM_LINEAR_HPT 1 24/*
 23 25 * Hash page table alignment on newer CPUs (CPU_FTR_ARCH_206)
 24static void __init kvm_linear_init_one(ulong size, int count, int type); 26 * should be a power of 2.
25static struct kvmppc_linear_info *kvm_alloc_linear(int type); 27 */
26static void kvm_release_linear(struct kvmppc_linear_info *ri); 28#define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */
27 29/*
28int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER; 30 * By default we reserve 5% of memory for hash pagetable allocation.
29EXPORT_SYMBOL_GPL(kvm_hpt_order); 31 */
30 32static unsigned long kvm_cma_resv_ratio = 5;
31/*************** RMA *************/
32
33/* 33/*
34 * This maintains a list of RMAs (real mode areas) for KVM guests to use. 34 * We allocate RMAs (real mode areas) for KVM guests from the KVM CMA area.
35 * Each RMA has to be physically contiguous and of a size that the 35 * Each RMA has to be physically contiguous and of a size that the
36 * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, 36 * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB,
 37 * and other larger sizes. Since we are unlikely to be able to allocate that 37 * and other larger sizes. Since we are unlikely to be able to allocate that
38 * much physically contiguous memory after the system is up and running, 38 * much physically contiguous memory after the system is up and running,
39 * we preallocate a set of RMAs in early boot for KVM to use. 39 * we preallocate a set of RMAs in early boot using CMA.
40 * should be power of 2.
40 */ 41 */
41static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ 42unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */
42static unsigned long kvm_rma_count; 43EXPORT_SYMBOL_GPL(kvm_rma_pages);
43 44
44/* Work out RMLS (real mode limit selector) field value for a given RMA size. 45/* Work out RMLS (real mode limit selector) field value for a given RMA size.
45 Assumes POWER7 or PPC970. */ 46 Assumes POWER7 or PPC970. */
@@ -69,165 +70,114 @@ static inline int lpcr_rmls(unsigned long rma_size)
69 70
70static int __init early_parse_rma_size(char *p) 71static int __init early_parse_rma_size(char *p)
71{ 72{
72 if (!p) 73 unsigned long kvm_rma_size;
73 return 1;
74 74
75 pr_debug("%s(%s)\n", __func__, p);
76 if (!p)
77 return -EINVAL;
75 kvm_rma_size = memparse(p, &p); 78 kvm_rma_size = memparse(p, &p);
76 79 /*
80 * Check that the requested size is one supported in hardware
81 */
82 if (lpcr_rmls(kvm_rma_size) < 0) {
83 pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
84 return -EINVAL;
85 }
86 kvm_rma_pages = kvm_rma_size >> PAGE_SHIFT;
77 return 0; 87 return 0;
78} 88}
79early_param("kvm_rma_size", early_parse_rma_size); 89early_param("kvm_rma_size", early_parse_rma_size);
80 90
81static int __init early_parse_rma_count(char *p) 91struct kvm_rma_info *kvm_alloc_rma()
82{ 92{
83 if (!p) 93 struct page *page;
84 return 1; 94 struct kvm_rma_info *ri;
85 95
86 kvm_rma_count = simple_strtoul(p, NULL, 0); 96 ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL);
87 97 if (!ri)
88 return 0; 98 return NULL;
89} 99 page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages);
90early_param("kvm_rma_count", early_parse_rma_count); 100 if (!page)
91 101 goto err_out;
92struct kvmppc_linear_info *kvm_alloc_rma(void) 102 atomic_set(&ri->use_count, 1);
93{ 103 ri->base_pfn = page_to_pfn(page);
94 return kvm_alloc_linear(KVM_LINEAR_RMA); 104 return ri;
105err_out:
106 kfree(ri);
107 return NULL;
95} 108}
96EXPORT_SYMBOL_GPL(kvm_alloc_rma); 109EXPORT_SYMBOL_GPL(kvm_alloc_rma);
97 110
98void kvm_release_rma(struct kvmppc_linear_info *ri) 111void kvm_release_rma(struct kvm_rma_info *ri)
99{ 112{
100 kvm_release_linear(ri); 113 if (atomic_dec_and_test(&ri->use_count)) {
114 kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages);
115 kfree(ri);
116 }
101} 117}
102EXPORT_SYMBOL_GPL(kvm_release_rma); 118EXPORT_SYMBOL_GPL(kvm_release_rma);
103 119
104/*************** HPT *************/ 120static int __init early_parse_kvm_cma_resv(char *p)
105
106/*
107 * This maintains a list of big linear HPT tables that contain the GVA->HPA
108 * memory mappings. If we don't reserve those early on, we might not be able
109 * to get a big (usually 16MB) linear memory region from the kernel anymore.
110 */
111
112static unsigned long kvm_hpt_count;
113
114static int __init early_parse_hpt_count(char *p)
115{ 121{
122 pr_debug("%s(%s)\n", __func__, p);
116 if (!p) 123 if (!p)
117 return 1; 124 return -EINVAL;
118 125 return kstrtoul(p, 0, &kvm_cma_resv_ratio);
119 kvm_hpt_count = simple_strtoul(p, NULL, 0);
120
121 return 0;
122} 126}
123early_param("kvm_hpt_count", early_parse_hpt_count); 127early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
124 128
125struct kvmppc_linear_info *kvm_alloc_hpt(void) 129struct page *kvm_alloc_hpt(unsigned long nr_pages)
126{ 130{
127 return kvm_alloc_linear(KVM_LINEAR_HPT); 131 unsigned long align_pages = HPT_ALIGN_PAGES;
132
 133 /* Old CPUs require the HPT to be aligned on a multiple of its size */
134 if (!cpu_has_feature(CPU_FTR_ARCH_206))
135 align_pages = nr_pages;
136 return kvm_alloc_cma(nr_pages, align_pages);
128} 137}
129EXPORT_SYMBOL_GPL(kvm_alloc_hpt); 138EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
130 139
131void kvm_release_hpt(struct kvmppc_linear_info *li) 140void kvm_release_hpt(struct page *page, unsigned long nr_pages)
132{ 141{
133 kvm_release_linear(li); 142 kvm_release_cma(page, nr_pages);
134} 143}
135EXPORT_SYMBOL_GPL(kvm_release_hpt); 144EXPORT_SYMBOL_GPL(kvm_release_hpt);
136 145
137/*************** generic *************/ 146/**
138 147 * kvm_cma_reserve() - reserve area for kvm hash pagetable
139static LIST_HEAD(free_linears); 148 *
 140static DEFINE_SPINLOCK(linear_lock); 149 * This function reserves memory from the early allocator. It should be
141 150 * called by arch specific code once the early allocator (memblock or bootmem)
142static void __init kvm_linear_init_one(ulong size, int count, int type) 151 * has been activated and all other subsystems have already allocated/reserved
143{ 152 * memory.
144 unsigned long i;
145 unsigned long j, npages;
146 void *linear;
147 struct page *pg;
148 const char *typestr;
149 struct kvmppc_linear_info *linear_info;
150
151 if (!count)
152 return;
153
154 typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT";
155
156 npages = size >> PAGE_SHIFT;
157 linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
158 for (i = 0; i < count; ++i) {
159 linear = alloc_bootmem_align(size, size);
160 pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
161 size >> 20);
162 linear_info[i].base_virt = linear;
163 linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
164 linear_info[i].npages = npages;
165 linear_info[i].type = type;
166 list_add_tail(&linear_info[i].list, &free_linears);
167 atomic_set(&linear_info[i].use_count, 0);
168
169 pg = pfn_to_page(linear_info[i].base_pfn);
170 for (j = 0; j < npages; ++j) {
171 atomic_inc(&pg->_count);
172 ++pg;
173 }
174 }
175}
176
177static struct kvmppc_linear_info *kvm_alloc_linear(int type)
178{
179 struct kvmppc_linear_info *ri, *ret;
180
181 ret = NULL;
182 spin_lock(&linear_lock);
183 list_for_each_entry(ri, &free_linears, list) {
184 if (ri->type != type)
185 continue;
186
187 list_del(&ri->list);
188 atomic_inc(&ri->use_count);
189 memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT);
190 ret = ri;
191 break;
192 }
193 spin_unlock(&linear_lock);
194 return ret;
195}
196
197static void kvm_release_linear(struct kvmppc_linear_info *ri)
198{
199 if (atomic_dec_and_test(&ri->use_count)) {
200 spin_lock(&linear_lock);
201 list_add_tail(&ri->list, &free_linears);
202 spin_unlock(&linear_lock);
203
204 }
205}
206
207/*
208 * Called at boot time while the bootmem allocator is active,
209 * to allocate contiguous physical memory for the hash page
210 * tables for guests.
211 */ 153 */
212void __init kvm_linear_init(void) 154void __init kvm_cma_reserve(void)
213{ 155{
214 /* HPT */ 156 unsigned long align_size;
215 kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT); 157 struct memblock_region *reg;
216 158 phys_addr_t selected_size = 0;
217 /* RMA */ 159 /*
218 /* Only do this on PPC970 in HV mode */ 160 * We cannot use memblock_phys_mem_size() here, because
219 if (!cpu_has_feature(CPU_FTR_HVMODE) || 161 * memblock_analyze() has not been called yet.
220 !cpu_has_feature(CPU_FTR_ARCH_201)) 162 */
221 return; 163 for_each_memblock(memory, reg)
222 164 selected_size += memblock_region_memory_end_pfn(reg) -
223 if (!kvm_rma_size || !kvm_rma_count) 165 memblock_region_memory_base_pfn(reg);
224 return; 166
225 167 selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT;
226 /* Check that the requested size is one supported in hardware */ 168 if (selected_size) {
227 if (lpcr_rmls(kvm_rma_size) < 0) { 169 pr_debug("%s: reserving %ld MiB for global area\n", __func__,
228 pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); 170 (unsigned long)selected_size / SZ_1M);
229 return; 171 /*
 172 * Old CPUs require the HPT to be aligned on a multiple of its size,
 173 * so for them make the alignment the maximum size we could request.
174 */
175 if (!cpu_has_feature(CPU_FTR_ARCH_206))
176 align_size = __rounddown_pow_of_two(selected_size);
177 else
178 align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
179
180 align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size);
181 kvm_cma_declare_contiguous(selected_size, align_size);
230 } 182 }
231
232 kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA);
233} 183}
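
kvm_cma_reserve() above sizes the boot-time pool as kvm_cma_resv_ratio percent of system memory (5% by default) and picks an alignment: 256 KiB worth of pages on CPU_FTR_ARCH_206 machines, or a power of two of the whole reservation on older CPUs, never smaller than one RMA. A standalone model of just that sizing math (my sketch; the memory size and CPU flag are made-up inputs, and 4 KiB pages are assumed):

#include <stdbool.h>
#include <stdio.h>

#define SZ_1M           (1UL << 20)
#define HPT_ALIGN_BYTES (1UL << 18)     /* HPT_ALIGN_PAGES << PAGE_SHIFT = 256 KiB */
#define RMA_BYTES       (1UL << 27)     /* kvm_rma_pages << PAGE_SHIFT = 128 MiB */

static unsigned long rounddown_pow_of_two(unsigned long x)
{
        unsigned long r = 1;

        while (r <= x / 2)
                r <<= 1;
        return r;
}

int main(void)
{
        unsigned long mem = 64UL << 30;                 /* pretend 64 GiB of RAM */
        unsigned long kvm_cma_resv_ratio = 5;           /* default from the patch */
        unsigned long selected_size = mem * kvm_cma_resv_ratio / 100;
        bool has_arch_206 = true;                       /* POWER7-class CPU */
        unsigned long align_size;

        /* Old CPUs need the HPT aligned to a power of two of its size */
        align_size = has_arch_206 ? HPT_ALIGN_BYTES
                                  : rounddown_pow_of_two(selected_size);

        /* The RMAs come out of the same pool, so never align below one RMA */
        if (align_size < RMA_BYTES)
                align_size = RMA_BYTES;

        printf("reserve %lu MiB of CMA, aligned to %lu MiB\n",
               selected_size / SZ_1M, align_size / SZ_1M);
        return 0;
}

The reservation itself then goes through kvm_cma_declare_contiguous() in the new book3s_hv_cma.c below.
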
diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c
new file mode 100644
index 000000000000..d9d3d8553d51
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_cma.c
@@ -0,0 +1,240 @@
1/*
2 * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA
3 * for DMA mapping framework
4 *
5 * Copyright IBM Corporation, 2013
6 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
 11 * License, or (at your option) any later version of the license.
12 *
13 */
14#define pr_fmt(fmt) "kvm_cma: " fmt
15
16#ifdef CONFIG_CMA_DEBUG
17#ifndef DEBUG
18# define DEBUG
19#endif
20#endif
21
22#include <linux/memblock.h>
23#include <linux/mutex.h>
24#include <linux/sizes.h>
25#include <linux/slab.h>
26
27#include "book3s_hv_cma.h"
28
29struct kvm_cma {
30 unsigned long base_pfn;
31 unsigned long count;
32 unsigned long *bitmap;
33};
34
35static DEFINE_MUTEX(kvm_cma_mutex);
36static struct kvm_cma kvm_cma_area;
37
38/**
39 * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling
40 * for kvm hash pagetable
41 * @size: Size of the reserved memory.
42 * @alignment: Alignment for the contiguous memory area
43 *
 44 * This function reserves memory for the kvm cma area. It should be
 45 * called by arch code while the early allocator (memblock or bootmem)
 46 * is still active.
47 */
48long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment)
49{
50 long base_pfn;
51 phys_addr_t addr;
52 struct kvm_cma *cma = &kvm_cma_area;
53
54 pr_debug("%s(size %lx)\n", __func__, (unsigned long)size);
55
56 if (!size)
57 return -EINVAL;
58 /*
59 * Sanitise input arguments.
60 * We should be pageblock aligned for CMA.
61 */
62 alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order));
63 size = ALIGN(size, alignment);
64 /*
65 * Reserve memory
66 * Use __memblock_alloc_base() since
67 * memblock_alloc_base() panic()s.
68 */
69 addr = __memblock_alloc_base(size, alignment, 0);
70 if (!addr) {
71 base_pfn = -ENOMEM;
72 goto err;
73 } else
74 base_pfn = PFN_DOWN(addr);
75
76 /*
77 * Each reserved area must be initialised later, when more kernel
78 * subsystems (like slab allocator) are available.
79 */
80 cma->base_pfn = base_pfn;
81 cma->count = size >> PAGE_SHIFT;
82 pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M);
83 return 0;
84err:
85 pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
86 return base_pfn;
87}
88
89/**
90 * kvm_alloc_cma() - allocate pages from contiguous area
91 * @nr_pages: Requested number of pages.
92 * @align_pages: Requested alignment in number of pages
93 *
 94 * This function allocates a memory buffer for the hash pagetable.
95 */
96struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages)
97{
98 int ret;
99 struct page *page = NULL;
100 struct kvm_cma *cma = &kvm_cma_area;
101 unsigned long chunk_count, nr_chunk;
102 unsigned long mask, pfn, pageno, start = 0;
103
104
105 if (!cma || !cma->count)
106 return NULL;
107
108 pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__,
109 (void *)cma, nr_pages, align_pages);
110
111 if (!nr_pages)
112 return NULL;
113 /*
 114 * Align the mask to the chunk size; each bitmap bit tracks one chunk of pages.
115 */
116 VM_BUG_ON(!is_power_of_2(align_pages));
117 mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1;
118 BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER);
119
120 chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
121 nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
122
123 mutex_lock(&kvm_cma_mutex);
124 for (;;) {
125 pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count,
126 start, nr_chunk, mask);
127 if (pageno >= chunk_count)
128 break;
129
130 pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT));
131 ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA);
132 if (ret == 0) {
133 bitmap_set(cma->bitmap, pageno, nr_chunk);
134 page = pfn_to_page(pfn);
135 memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT);
136 break;
137 } else if (ret != -EBUSY) {
138 break;
139 }
140 pr_debug("%s(): memory range at %p is busy, retrying\n",
141 __func__, pfn_to_page(pfn));
142 /* try again with a bit different memory target */
143 start = pageno + mask + 1;
144 }
145 mutex_unlock(&kvm_cma_mutex);
146 pr_debug("%s(): returned %p\n", __func__, page);
147 return page;
148}
149
150/**
151 * kvm_release_cma() - release allocated pages for hash pagetable
152 * @pages: Allocated pages.
153 * @nr_pages: Number of allocated pages.
154 *
155 * This function releases memory allocated by kvm_alloc_cma().
 156 * It returns false when the provided pages do not belong to the contiguous
 157 * area, and true otherwise.
158 */
159bool kvm_release_cma(struct page *pages, unsigned long nr_pages)
160{
161 unsigned long pfn;
162 unsigned long nr_chunk;
163 struct kvm_cma *cma = &kvm_cma_area;
164
165 if (!cma || !pages)
166 return false;
167
168 pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages);
169
170 pfn = page_to_pfn(pages);
171
172 if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
173 return false;
174
175 VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count);
176 nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
177
178 mutex_lock(&kvm_cma_mutex);
179 bitmap_clear(cma->bitmap,
180 (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT),
181 nr_chunk);
182 free_contig_range(pfn, nr_pages);
183 mutex_unlock(&kvm_cma_mutex);
184
185 return true;
186}
187
188static int __init kvm_cma_activate_area(unsigned long base_pfn,
189 unsigned long count)
190{
191 unsigned long pfn = base_pfn;
192 unsigned i = count >> pageblock_order;
193 struct zone *zone;
194
195 WARN_ON_ONCE(!pfn_valid(pfn));
196 zone = page_zone(pfn_to_page(pfn));
197 do {
198 unsigned j;
199 base_pfn = pfn;
200 for (j = pageblock_nr_pages; j; --j, pfn++) {
201 WARN_ON_ONCE(!pfn_valid(pfn));
202 /*
203 * alloc_contig_range requires the pfn range
204 * specified to be in the same zone. Make this
205 * simple by forcing the entire CMA resv range
206 * to be in the same zone.
207 */
208 if (page_zone(pfn_to_page(pfn)) != zone)
209 return -EINVAL;
210 }
211 init_cma_reserved_pageblock(pfn_to_page(base_pfn));
212 } while (--i);
213 return 0;
214}
215
216static int __init kvm_cma_init_reserved_areas(void)
217{
218 int bitmap_size, ret;
219 unsigned long chunk_count;
220 struct kvm_cma *cma = &kvm_cma_area;
221
222 pr_debug("%s()\n", __func__);
223 if (!cma->count)
224 return 0;
225 chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
226 bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long);
227 cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
228 if (!cma->bitmap)
229 return -ENOMEM;
230
231 ret = kvm_cma_activate_area(cma->base_pfn, cma->count);
232 if (ret)
233 goto error;
234 return 0;
235
236error:
237 kfree(cma->bitmap);
238 return ret;
239}
240core_initcall(kvm_cma_init_reserved_areas);
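
The allocator in this new file tracks the reserved area in KVM_CMA_CHUNK_ORDER (2^18 = 256 KiB) chunks rather than single pages, so both the request and the alignment are shifted down to chunk units before bitmap_find_next_zero_area() runs. A short arithmetic sketch of that conversion, assuming 4 KiB pages; the 16 MiB request is just an example figure:

#include <stdio.h>

#define PAGE_SHIFT              12      /* 4 KiB pages assumed */
#define KVM_CMA_CHUNK_ORDER     18      /* 256 KiB chunks, from the patch */

int main(void)
{
        unsigned long nr_pages = 1UL << (24 - PAGE_SHIFT);      /* a 16 MiB HPT */
        unsigned long align_pages = nr_pages;                   /* pre-POWER7 case */

        unsigned long nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
        unsigned long mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1;

        printf("%lu pages -> %lu chunks, alignment mask 0x%lx\n",
               nr_pages, nr_chunk, mask);
        return 0;
}

Tracking chunks instead of pages keeps the bitmap 64 times smaller (with 4 KiB pages) while still matching the 256 KiB granularity that both the RMA and HPT sizes are multiples of.
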
diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h
new file mode 100644
index 000000000000..655144f75fa5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_cma.h
@@ -0,0 +1,27 @@
1/*
2 * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA
3 * for DMA mapping framework
4 *
5 * Copyright IBM Corporation, 2013
6 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
 11 * License, or (at your option) any later version of the license.
12 *
13 */
14
15#ifndef __POWERPC_KVM_CMA_ALLOC_H__
16#define __POWERPC_KVM_CMA_ALLOC_H__
17/*
18 * Both RMA and Hash page allocation will be multiple of 256K.
19 */
20#define KVM_CMA_CHUNK_ORDER 18
21
22extern struct page *kvm_alloc_cma(unsigned long nr_pages,
23 unsigned long align_pages);
24extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages);
25extern long kvm_cma_declare_contiguous(phys_addr_t size,
26 phys_addr_t alignment) __init;
27#endif
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fc25689a9f35..45e30d6e462b 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -383,6 +383,80 @@ static inline int try_lock_tlbie(unsigned int *lock)
383 return old == 0; 383 return old == 0;
384} 384}
385 385
386/*
387 * tlbie/tlbiel is a bit different on the PPC970 compared to later
388 * processors such as POWER7; the large page bit is in the instruction
389 * not RB, and the top 16 bits and the bottom 12 bits of the VA
390 * in RB must be 0.
391 */
392static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues,
393 long npages, int global, bool need_sync)
394{
395 long i;
396
397 if (global) {
398 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
399 cpu_relax();
400 if (need_sync)
401 asm volatile("ptesync" : : : "memory");
402 for (i = 0; i < npages; ++i) {
403 unsigned long rb = rbvalues[i];
404
405 if (rb & 1) /* large page */
406 asm volatile("tlbie %0,1" : :
407 "r" (rb & 0x0000fffffffff000ul));
408 else
409 asm volatile("tlbie %0,0" : :
410 "r" (rb & 0x0000fffffffff000ul));
411 }
412 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
413 kvm->arch.tlbie_lock = 0;
414 } else {
415 if (need_sync)
416 asm volatile("ptesync" : : : "memory");
417 for (i = 0; i < npages; ++i) {
418 unsigned long rb = rbvalues[i];
419
420 if (rb & 1) /* large page */
421 asm volatile("tlbiel %0,1" : :
422 "r" (rb & 0x0000fffffffff000ul));
423 else
424 asm volatile("tlbiel %0,0" : :
425 "r" (rb & 0x0000fffffffff000ul));
426 }
427 asm volatile("ptesync" : : : "memory");
428 }
429}
430
431static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
432 long npages, int global, bool need_sync)
433{
434 long i;
435
436 if (cpu_has_feature(CPU_FTR_ARCH_201)) {
437 /* PPC970 tlbie instruction is a bit different */
438 do_tlbies_970(kvm, rbvalues, npages, global, need_sync);
439 return;
440 }
441 if (global) {
442 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
443 cpu_relax();
444 if (need_sync)
445 asm volatile("ptesync" : : : "memory");
446 for (i = 0; i < npages; ++i)
447 asm volatile(PPC_TLBIE(%1,%0) : :
448 "r" (rbvalues[i]), "r" (kvm->arch.lpid));
449 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
450 kvm->arch.tlbie_lock = 0;
451 } else {
452 if (need_sync)
453 asm volatile("ptesync" : : : "memory");
454 for (i = 0; i < npages; ++i)
455 asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
456 asm volatile("ptesync" : : : "memory");
457 }
458}
459
386long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, 460long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
387 unsigned long pte_index, unsigned long avpn, 461 unsigned long pte_index, unsigned long avpn,
388 unsigned long *hpret) 462 unsigned long *hpret)
@@ -408,19 +482,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
408 if (v & HPTE_V_VALID) { 482 if (v & HPTE_V_VALID) {
409 hpte[0] &= ~HPTE_V_VALID; 483 hpte[0] &= ~HPTE_V_VALID;
410 rb = compute_tlbie_rb(v, hpte[1], pte_index); 484 rb = compute_tlbie_rb(v, hpte[1], pte_index);
411 if (global_invalidates(kvm, flags)) { 485 do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
412 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
413 cpu_relax();
414 asm volatile("ptesync" : : : "memory");
415 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
416 : : "r" (rb), "r" (kvm->arch.lpid));
417 asm volatile("ptesync" : : : "memory");
418 kvm->arch.tlbie_lock = 0;
419 } else {
420 asm volatile("ptesync" : : : "memory");
421 asm volatile("tlbiel %0" : : "r" (rb));
422 asm volatile("ptesync" : : : "memory");
423 }
424 /* Read PTE low word after tlbie to get final R/C values */ 486 /* Read PTE low word after tlbie to get final R/C values */
425 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); 487 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
426 } 488 }
@@ -448,12 +510,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
448 unsigned long *hp, *hptes[4], tlbrb[4]; 510 unsigned long *hp, *hptes[4], tlbrb[4];
449 long int i, j, k, n, found, indexes[4]; 511 long int i, j, k, n, found, indexes[4];
450 unsigned long flags, req, pte_index, rcbits; 512 unsigned long flags, req, pte_index, rcbits;
451 long int local = 0; 513 int global;
452 long int ret = H_SUCCESS; 514 long int ret = H_SUCCESS;
453 struct revmap_entry *rev, *revs[4]; 515 struct revmap_entry *rev, *revs[4];
454 516
455 if (atomic_read(&kvm->online_vcpus) == 1) 517 global = global_invalidates(kvm, 0);
456 local = 1;
457 for (i = 0; i < 4 && ret == H_SUCCESS; ) { 518 for (i = 0; i < 4 && ret == H_SUCCESS; ) {
458 n = 0; 519 n = 0;
459 for (; i < 4; ++i) { 520 for (; i < 4; ++i) {
@@ -529,22 +590,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
529 break; 590 break;
530 591
531 /* Now that we've collected a batch, do the tlbies */ 592 /* Now that we've collected a batch, do the tlbies */
532 if (!local) { 593 do_tlbies(kvm, tlbrb, n, global, true);
533 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
534 cpu_relax();
535 asm volatile("ptesync" : : : "memory");
536 for (k = 0; k < n; ++k)
537 asm volatile(PPC_TLBIE(%1,%0) : :
538 "r" (tlbrb[k]),
539 "r" (kvm->arch.lpid));
540 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
541 kvm->arch.tlbie_lock = 0;
542 } else {
543 asm volatile("ptesync" : : : "memory");
544 for (k = 0; k < n; ++k)
545 asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
546 asm volatile("ptesync" : : : "memory");
547 }
548 594
549 /* Read PTE low words after tlbie to get final R/C values */ 595 /* Read PTE low words after tlbie to get final R/C values */
550 for (k = 0; k < n; ++k) { 596 for (k = 0; k < n; ++k) {
@@ -603,19 +649,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
603 if (v & HPTE_V_VALID) { 649 if (v & HPTE_V_VALID) {
604 rb = compute_tlbie_rb(v, r, pte_index); 650 rb = compute_tlbie_rb(v, r, pte_index);
605 hpte[0] = v & ~HPTE_V_VALID; 651 hpte[0] = v & ~HPTE_V_VALID;
606 if (global_invalidates(kvm, flags)) { 652 do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
607 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
608 cpu_relax();
609 asm volatile("ptesync" : : : "memory");
610 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
611 : : "r" (rb), "r" (kvm->arch.lpid));
612 asm volatile("ptesync" : : : "memory");
613 kvm->arch.tlbie_lock = 0;
614 } else {
615 asm volatile("ptesync" : : : "memory");
616 asm volatile("tlbiel %0" : : "r" (rb));
617 asm volatile("ptesync" : : : "memory");
618 }
619 /* 653 /*
620 * If the host has this page as readonly but the guest 654 * If the host has this page as readonly but the guest
621 * wants to make it read/write, reduce the permissions. 655 * wants to make it read/write, reduce the permissions.
@@ -686,13 +720,7 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
686 720
687 hptep[0] &= ~HPTE_V_VALID; 721 hptep[0] &= ~HPTE_V_VALID;
688 rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); 722 rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
689 while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) 723 do_tlbies(kvm, &rb, 1, 1, true);
690 cpu_relax();
691 asm volatile("ptesync" : : : "memory");
692 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
693 : : "r" (rb), "r" (kvm->arch.lpid));
694 asm volatile("ptesync" : : : "memory");
695 kvm->arch.tlbie_lock = 0;
696} 724}
697EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); 725EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
698 726
@@ -706,12 +734,7 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
706 rbyte = (hptep[1] & ~HPTE_R_R) >> 8; 734 rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
707 /* modify only the second-last byte, which contains the ref bit */ 735 /* modify only the second-last byte, which contains the ref bit */
708 *((char *)hptep + 14) = rbyte; 736 *((char *)hptep + 14) = rbyte;
709 while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) 737 do_tlbies(kvm, &rb, 1, 1, false);
710 cpu_relax();
711 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
712 : : "r" (rb), "r" (kvm->arch.lpid));
713 asm volatile("ptesync" : : : "memory");
714 kvm->arch.tlbie_lock = 0;
715} 738}
716EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); 739EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);
717 740
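
The do_tlbies() consolidation above replaces four open-coded tlbie/tlbiel sequences with one helper that takes the batch of RB values, whether the invalidation must be global (other cores may hold the translation), and whether a leading ptesync is needed to order a preceding HPTE store. The call sites in the hunks show the pattern: single-entry calls pass &rb with npages = 1, kvmppc_h_bulk_remove() batches up to four RB values, and only kvmppc_clear_ref_hpte() passes need_sync = false because it has no prior HPTE update to order.
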
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b02f91e4c70d..60dce5bfab3f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1381,7 +1381,7 @@ hcall_try_real_mode:
1381 cmpldi r3,hcall_real_table_end - hcall_real_table 1381 cmpldi r3,hcall_real_table_end - hcall_real_table
1382 bge guest_exit_cont 1382 bge guest_exit_cont
1383 LOAD_REG_ADDR(r4, hcall_real_table) 1383 LOAD_REG_ADDR(r4, hcall_real_table)
1384 lwzx r3,r3,r4 1384 lwax r3,r3,r4
1385 cmpwi r3,0 1385 cmpwi r3,0
1386 beq guest_exit_cont 1386 beq guest_exit_cont
1387 add r3,r3,r4 1387 add r3,r3,r4
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index 48cbbf862958..17cfae5497a3 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -92,6 +92,11 @@ kvm_start_lightweight:
92 PPC_LL r3, VCPU_HFLAGS(r4) 92 PPC_LL r3, VCPU_HFLAGS(r4)
93 rldicl r3, r3, 0, 63 /* r3 &= 1 */ 93 rldicl r3, r3, 0, 63 /* r3 &= 1 */
94 stb r3, HSTATE_RESTORE_HID5(r13) 94 stb r3, HSTATE_RESTORE_HID5(r13)
95
96 /* Load up guest SPRG3 value, since it's user readable */
97 ld r3, VCPU_SHARED(r4)
98 ld r3, VCPU_SHARED_SPRG3(r3)
99 mtspr SPRN_SPRG3, r3
95#endif /* CONFIG_PPC_BOOK3S_64 */ 100#endif /* CONFIG_PPC_BOOK3S_64 */
96 101
97 PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ 102 PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */
@@ -123,6 +128,15 @@ kvmppc_handler_highmem:
123 /* R7 = vcpu */ 128 /* R7 = vcpu */
124 PPC_LL r7, GPR4(r1) 129 PPC_LL r7, GPR4(r1)
125 130
131#ifdef CONFIG_PPC_BOOK3S_64
132 /*
133 * Reload kernel SPRG3 value.
134 * No need to save guest value as usermode can't modify SPRG3.
135 */
136 ld r3, PACA_SPRG3(r13)
137 mtspr SPRN_SPRG3, r3
138#endif /* CONFIG_PPC_BOOK3S_64 */
139
126 PPC_STL r14, VCPU_GPR(R14)(r7) 140 PPC_STL r14, VCPU_GPR(R14)(r7)
127 PPC_STL r15, VCPU_GPR(R15)(r7) 141 PPC_STL r15, VCPU_GPR(R15)(r7)
128 PPC_STL r16, VCPU_GPR(R16)(r7) 142 PPC_STL r16, VCPU_GPR(R16)(r7)
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index c6e13d9a9e15..27db1e665959 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -468,7 +468,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
468 * both the traditional FP registers and the added VSX 468 * both the traditional FP registers and the added VSX
469 * registers into thread.fpr[]. 469 * registers into thread.fpr[].
470 */ 470 */
471 giveup_fpu(current); 471 if (current->thread.regs->msr & MSR_FP)
472 giveup_fpu(current);
472 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 473 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
473 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; 474 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
474 475
@@ -483,7 +484,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
483 484
484#ifdef CONFIG_ALTIVEC 485#ifdef CONFIG_ALTIVEC
485 if (msr & MSR_VEC) { 486 if (msr & MSR_VEC) {
486 giveup_altivec(current); 487 if (current->thread.regs->msr & MSR_VEC)
488 giveup_altivec(current);
487 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); 489 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
488 vcpu->arch.vscr = t->vscr; 490 vcpu->arch.vscr = t->vscr;
489 } 491 }
@@ -575,8 +577,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
575 printk(KERN_INFO "Loading up ext 0x%lx\n", msr); 577 printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
576#endif 578#endif
577 579
578 current->thread.regs->msr |= msr;
579
580 if (msr & MSR_FP) { 580 if (msr & MSR_FP) {
581 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 581 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
582 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; 582 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
@@ -598,12 +598,32 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
598#endif 598#endif
599 } 599 }
600 600
601 current->thread.regs->msr |= msr;
601 vcpu->arch.guest_owned_ext |= msr; 602 vcpu->arch.guest_owned_ext |= msr;
602 kvmppc_recalc_shadow_msr(vcpu); 603 kvmppc_recalc_shadow_msr(vcpu);
603 604
604 return RESUME_GUEST; 605 return RESUME_GUEST;
605} 606}
606 607
608/*
609 * Kernel code using FP or VMX could have flushed guest state to
610 * the thread_struct; if so, get it back now.
611 */
612static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
613{
614 unsigned long lost_ext;
615
616 lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr;
617 if (!lost_ext)
618 return;
619
620 if (lost_ext & MSR_FP)
621 kvmppc_load_up_fpu();
622 if (lost_ext & MSR_VEC)
623 kvmppc_load_up_altivec();
624 current->thread.regs->msr |= lost_ext;
625}
626
607int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, 627int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
608 unsigned int exit_nr) 628 unsigned int exit_nr)
609{ 629{
@@ -772,7 +792,7 @@ program_interrupt:
772 } 792 }
773 case BOOK3S_INTERRUPT_SYSCALL: 793 case BOOK3S_INTERRUPT_SYSCALL:
774 if (vcpu->arch.papr_enabled && 794 if (vcpu->arch.papr_enabled &&
775 (kvmppc_get_last_inst(vcpu) == 0x44000022) && 795 (kvmppc_get_last_sc(vcpu) == 0x44000022) &&
776 !(vcpu->arch.shared->msr & MSR_PR)) { 796 !(vcpu->arch.shared->msr & MSR_PR)) {
777 /* SC 1 papr hypercalls */ 797 /* SC 1 papr hypercalls */
778 ulong cmd = kvmppc_get_gpr(vcpu, 3); 798 ulong cmd = kvmppc_get_gpr(vcpu, 3);
@@ -890,8 +910,9 @@ program_interrupt:
890 local_irq_enable(); 910 local_irq_enable();
891 r = s; 911 r = s;
892 } else { 912 } else {
893 kvmppc_lazy_ee_enable(); 913 kvmppc_fix_ee_before_entry();
894 } 914 }
915 kvmppc_handle_lost_ext(vcpu);
895 } 916 }
896 917
897 trace_kvm_book3s_reenter(r, vcpu); 918 trace_kvm_book3s_reenter(r, vcpu);
@@ -1162,7 +1183,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1162 if (vcpu->arch.shared->msr & MSR_FP) 1183 if (vcpu->arch.shared->msr & MSR_FP)
1163 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 1184 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
1164 1185
1165 kvmppc_lazy_ee_enable(); 1186 kvmppc_fix_ee_before_entry();
1166 1187
1167 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 1188 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
1168 1189
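
The new kvmppc_handle_lost_ext() closes a hole where host kernel code used FP or VMX between exits, flushing the guest's register state into the thread struct and clearing the MSR bits while guest_owned_ext still said the guest held them. A small sketch of the mask computation it performs; the MSR bit values below are placeholders rather than the real PowerPC layout:

#include <stdio.h>

#define MSR_FP  (1UL << 13)     /* placeholder bit positions */
#define MSR_VEC (1UL << 25)

int main(void)
{
        unsigned long guest_owned_ext = MSR_FP | MSR_VEC;
        unsigned long thread_msr = MSR_VEC;     /* kernel use of FP cleared MSR_FP */
        unsigned long lost_ext = guest_owned_ext & ~thread_msr;

        if (lost_ext & MSR_FP)
                printf("reload FP state (kvmppc_load_up_fpu)\n");
        if (lost_ext & MSR_VEC)
                printf("reload Altivec state (kvmppc_load_up_altivec)\n");
        return 0;
}
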
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 94c1dd46b83d..a3a5cb8ee7ea 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -19,6 +19,7 @@
19#include <asm/hvcall.h> 19#include <asm/hvcall.h>
20#include <asm/xics.h> 20#include <asm/xics.h>
21#include <asm/debug.h> 21#include <asm/debug.h>
22#include <asm/time.h>
22 23
23#include <linux/debugfs.h> 24#include <linux/debugfs.h>
24#include <linux/seq_file.h> 25#include <linux/seq_file.h>
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index dcc94f016007..17722d82f1d1 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -674,8 +674,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
674 goto out; 674 goto out;
675 } 675 }
676 676
677 kvm_guest_enter();
678
679#ifdef CONFIG_PPC_FPU 677#ifdef CONFIG_PPC_FPU
680 /* Save userspace FPU state in stack */ 678 /* Save userspace FPU state in stack */
681 enable_kernel_fp(); 679 enable_kernel_fp();
@@ -698,7 +696,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
698 kvmppc_load_guest_fp(vcpu); 696 kvmppc_load_guest_fp(vcpu);
699#endif 697#endif
700 698
701 kvmppc_lazy_ee_enable(); 699 kvmppc_fix_ee_before_entry();
702 700
703 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 701 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
704 702
@@ -1168,7 +1166,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
1168 local_irq_enable(); 1166 local_irq_enable();
1169 r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); 1167 r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
1170 } else { 1168 } else {
1171 kvmppc_lazy_ee_enable(); 1169 kvmppc_fix_ee_before_entry();
1172 } 1170 }
1173 } 1171 }
1174 1172
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6316ee336e88..f55e14cd1762 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -117,8 +117,6 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
117 kvm_guest_exit(); 117 kvm_guest_exit();
118 continue; 118 continue;
119 } 119 }
120
121 trace_hardirqs_on();
122#endif 120#endif
123 121
124 kvm_guest_enter(); 122 kvm_guest_enter();
@@ -420,6 +418,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
420 return kvmppc_core_create_memslot(slot, npages); 418 return kvmppc_core_create_memslot(slot, npages);
421} 419}
422 420
421void kvm_arch_memslots_updated(struct kvm *kvm)
422{
423}
424
423int kvm_arch_prepare_memory_region(struct kvm *kvm, 425int kvm_arch_prepare_memory_region(struct kvm *kvm,
424 struct kvm_memory_slot *memslot, 426 struct kvm_memory_slot *memslot,
425 struct kvm_userspace_memory_region *mem, 427 struct kvm_userspace_memory_region *mem,
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 3238d4004e84..e87ecaa2c569 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -274,6 +274,14 @@ struct kvm_arch{
274 int css_support; 274 int css_support;
275}; 275};
276 276
277#define KVM_HVA_ERR_BAD (-1UL)
278#define KVM_HVA_ERR_RO_BAD (-2UL)
279
280static inline bool kvm_is_error_hva(unsigned long addr)
281{
282 return IS_ERR_VALUE(addr);
283}
284
277extern int sie64a(struct kvm_s390_sie_block *, u64 *); 285extern int sie64a(struct kvm_s390_sie_block *, u64 *);
278extern char sie_exit; 286extern char sie_exit;
279#endif 287#endif
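
The new s390 kvm_is_error_hva() works because IS_ERR_VALUE() treats anything in the last page of the address space as an error code, so both KVM_HVA_ERR_BAD (-1UL) and KVM_HVA_ERR_RO_BAD (-2UL) are caught by a single comparison. A userspace demo with the macro re-implemented locally (the kernel's definition minus the unlikely() hint):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ERRNO        4095
#define IS_ERR_VALUE(x)  ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

#define KVM_HVA_ERR_BAD         (-1UL)
#define KVM_HVA_ERR_RO_BAD      (-2UL)

static bool kvm_is_error_hva(unsigned long addr)
{
        return IS_ERR_VALUE(addr);
}

int main(void)
{
        printf("BAD: %d, RO_BAD: %d, ordinary hva: %d\n",
               kvm_is_error_hva(KVM_HVA_ERR_BAD),
               kvm_is_error_hva(KVM_HVA_ERR_RO_BAD),
               kvm_is_error_hva(0x7f0000000000UL));
        return 0;
}
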
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 6340178748bf..ff132ac64ddd 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -12,8 +12,6 @@ typedef struct {
12 unsigned long asce_bits; 12 unsigned long asce_bits;
13 unsigned long asce_limit; 13 unsigned long asce_limit;
14 unsigned long vdso_base; 14 unsigned long vdso_base;
15 /* Cloned contexts will be created with extended page tables. */
16 unsigned int alloc_pgste:1;
17 /* The mmu context has extended page tables. */ 15 /* The mmu context has extended page tables. */
18 unsigned int has_pgste:1; 16 unsigned int has_pgste:1;
19} mm_context_t; 17} mm_context_t;
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 7b7fce4e8469..9f973d8de90e 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -21,24 +21,7 @@ static inline int init_new_context(struct task_struct *tsk,
21#ifdef CONFIG_64BIT 21#ifdef CONFIG_64BIT
22 mm->context.asce_bits |= _ASCE_TYPE_REGION3; 22 mm->context.asce_bits |= _ASCE_TYPE_REGION3;
23#endif 23#endif
24 if (current->mm && current->mm->context.alloc_pgste) { 24 mm->context.has_pgste = 0;
25 /*
26 * alloc_pgste indicates, that any NEW context will be created
27 * with extended page tables. The old context is unchanged. The
28 * page table allocation and the page table operations will
29 * look at has_pgste to distinguish normal and extended page
30 * tables. The only way to create extended page tables is to
31 * set alloc_pgste and then create a new context (e.g. dup_mm).
32 * The page table allocation is called after init_new_context
33 * and if has_pgste is set, it will create extended page
34 * tables.
35 */
36 mm->context.has_pgste = 1;
37 mm->context.alloc_pgste = 1;
38 } else {
39 mm->context.has_pgste = 0;
40 mm->context.alloc_pgste = 0;
41 }
42 mm->context.asce_limit = STACK_TOP_MAX; 25 mm->context.asce_limit = STACK_TOP_MAX;
43 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); 26 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
44 return 0; 27 return 0;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 9f215b40109e..9b60a36c348d 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1442,6 +1442,17 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
1442} 1442}
1443#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ 1443#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
1444 1444
1445static inline void pmdp_flush_lazy(struct mm_struct *mm,
1446 unsigned long address, pmd_t *pmdp)
1447{
1448 int active = (mm == current->active_mm) ? 1 : 0;
1449
1450 if ((atomic_read(&mm->context.attach_count) & 0xffff) > active)
1451 __pmd_idte(address, pmdp);
1452 else
1453 mm->context.flush_mm = 1;
1454}
1455
1445#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1456#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1446 1457
1447#define __HAVE_ARCH_PGTABLE_DEPOSIT 1458#define __HAVE_ARCH_PGTABLE_DEPOSIT
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index b0e6435b2f02..0eb37505cab1 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -43,6 +43,7 @@ extern void execve_tail(void);
43#ifndef CONFIG_64BIT 43#ifndef CONFIG_64BIT
44 44
45#define TASK_SIZE (1UL << 31) 45#define TASK_SIZE (1UL << 31)
46#define TASK_MAX_SIZE (1UL << 31)
46#define TASK_UNMAPPED_BASE (1UL << 30) 47#define TASK_UNMAPPED_BASE (1UL << 30)
47 48
48#else /* CONFIG_64BIT */ 49#else /* CONFIG_64BIT */
@@ -51,6 +52,7 @@ extern void execve_tail(void);
51#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ 52#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \
52 (1UL << 30) : (1UL << 41)) 53 (1UL << 30) : (1UL << 41))
53#define TASK_SIZE TASK_SIZE_OF(current) 54#define TASK_SIZE TASK_SIZE_OF(current)
55#define TASK_MAX_SIZE (1UL << 53)
54 56
55#endif /* CONFIG_64BIT */ 57#endif /* CONFIG_64BIT */
56 58
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 3074475c8ae0..3a74d8af0d69 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -119,12 +119,21 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
119 * The layout is as follows: 119 * The layout is as follows:
120 * - gpr 2 contains the subchannel id (passed as addr) 120 * - gpr 2 contains the subchannel id (passed as addr)
121 * - gpr 3 contains the virtqueue index (passed as datamatch) 121 * - gpr 3 contains the virtqueue index (passed as datamatch)
122 * - gpr 4 contains the index on the bus (optionally)
122 */ 123 */
123 ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS, 124 ret = kvm_io_bus_write_cookie(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
124 vcpu->run->s.regs.gprs[2], 125 vcpu->run->s.regs.gprs[2],
125 8, &vcpu->run->s.regs.gprs[3]); 126 8, &vcpu->run->s.regs.gprs[3],
127 vcpu->run->s.regs.gprs[4]);
126 srcu_read_unlock(&vcpu->kvm->srcu, idx); 128 srcu_read_unlock(&vcpu->kvm->srcu, idx);
127 /* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */ 129
130 /*
131 * Return cookie in gpr 2, but don't overwrite the register if the
132 * diagnose will be handled by userspace.
133 */
134 if (ret != -EOPNOTSUPP)
135 vcpu->run->s.regs.gprs[2] = ret;
136 /* kvm_io_bus_write_cookie returns -EOPNOTSUPP if it found no match. */
128 return ret < 0 ? ret : 0; 137 return ret < 0 ? ret : 0;
129} 138}
130 139
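
The cookie returned in gpr 2 and passed back in gpr 4 lets kvm_io_bus_write_cookie try the previously matched device before searching the notify bus again. A toy sketch of that cached-index lookup, with made-up names rather than the kernel API:

#include <stdio.h>

#define NDEV 4

struct toy_dev { unsigned long addr; const char *name; };

static struct toy_dev bus[NDEV] = {
        { 0x10, "vq-a" }, { 0x20, "vq-b" }, { 0x30, "vq-c" }, { 0x40, "vq-d" },
};

/* Look up a device, trying the caller-supplied cookie (a previous index)
 * before falling back to a linear search.  Returns the index to be used
 * as next time's cookie, or -1 if nothing matched. */
static long bus_notify(unsigned long addr, long cookie)
{
        long i;

        if (cookie >= 0 && cookie < NDEV && bus[cookie].addr == addr) {
                printf("fast path: %s\n", bus[cookie].name);
                return cookie;
        }
        for (i = 0; i < NDEV; i++)
                if (bus[i].addr == addr) {
                        printf("slow path: %s\n", bus[i].name);
                        return i;
                }
        return -1;
}

int main(void)
{
        long cookie = -1;

        cookie = bus_notify(0x30, cookie);      /* slow path, learn cookie 2 */
        cookie = bus_notify(0x30, cookie);      /* fast path */
        return 0;
}
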
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 34c1c9a90be2..776dafe918db 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -28,6 +28,7 @@
28#include <asm/pgtable.h> 28#include <asm/pgtable.h>
29#include <asm/nmi.h> 29#include <asm/nmi.h>
30#include <asm/switch_to.h> 30#include <asm/switch_to.h>
31#include <asm/facility.h>
31#include <asm/sclp.h> 32#include <asm/sclp.h>
32#include "kvm-s390.h" 33#include "kvm-s390.h"
33#include "gaccess.h" 34#include "gaccess.h"
@@ -84,9 +85,15 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
84 { NULL } 85 { NULL }
85}; 86};
86 87
87static unsigned long long *facilities; 88unsigned long *vfacilities;
88static struct gmap_notifier gmap_notifier; 89static struct gmap_notifier gmap_notifier;
89 90
91/* test availability of vfacility */
92static inline int test_vfacility(unsigned long nr)
93{
94 return __test_facility(nr, (void *) vfacilities);
95}
96
90/* Section: not file related */ 97/* Section: not file related */
91int kvm_arch_hardware_enable(void *garbage) 98int kvm_arch_hardware_enable(void *garbage)
92{ 99{
@@ -387,7 +394,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
387 vcpu->arch.sie_block->ecb = 6; 394 vcpu->arch.sie_block->ecb = 6;
388 vcpu->arch.sie_block->ecb2 = 8; 395 vcpu->arch.sie_block->ecb2 = 8;
389 vcpu->arch.sie_block->eca = 0xC1002001U; 396 vcpu->arch.sie_block->eca = 0xC1002001U;
390 vcpu->arch.sie_block->fac = (int) (long) facilities; 397 vcpu->arch.sie_block->fac = (int) (long) vfacilities;
391 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 398 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
392 tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet, 399 tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
393 (unsigned long) vcpu); 400 (unsigned long) vcpu);
@@ -1063,6 +1070,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
1063 return 0; 1070 return 0;
1064} 1071}
1065 1072
1073void kvm_arch_memslots_updated(struct kvm *kvm)
1074{
1075}
1076
1066/* Section: memory related */ 1077/* Section: memory related */
1067int kvm_arch_prepare_memory_region(struct kvm *kvm, 1078int kvm_arch_prepare_memory_region(struct kvm *kvm,
1068 struct kvm_memory_slot *memslot, 1079 struct kvm_memory_slot *memslot,
@@ -1129,20 +1140,20 @@ static int __init kvm_s390_init(void)
1129 * to hold the maximum amount of facilities. On the other hand, we 1140 * to hold the maximum amount of facilities. On the other hand, we
1130 * only set facilities that are known to work in KVM. 1141 * only set facilities that are known to work in KVM.
1131 */ 1142 */
1132 facilities = (unsigned long long *) get_zeroed_page(GFP_KERNEL|GFP_DMA); 1143 vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
1133 if (!facilities) { 1144 if (!vfacilities) {
1134 kvm_exit(); 1145 kvm_exit();
1135 return -ENOMEM; 1146 return -ENOMEM;
1136 } 1147 }
1137 memcpy(facilities, S390_lowcore.stfle_fac_list, 16); 1148 memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
1138 facilities[0] &= 0xff82fff3f47c0000ULL; 1149 vfacilities[0] &= 0xff82fff3f47c0000UL;
1139 facilities[1] &= 0x001c000000000000ULL; 1150 vfacilities[1] &= 0x001c000000000000UL;
1140 return 0; 1151 return 0;
1141} 1152}
1142 1153
1143static void __exit kvm_s390_exit(void) 1154static void __exit kvm_s390_exit(void)
1144{ 1155{
1145 free_page((unsigned long) facilities); 1156 free_page((unsigned long) vfacilities);
1146 kvm_exit(); 1157 kvm_exit();
1147} 1158}
1148 1159
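
vfacilities is a raw STFLE bit string, so facility n lives at the most significant end of byte n/8, and test_vfacility relies on that numbering. An illustrative stand-alone test using the same bit order (not the kernel's __test_facility):

#include <stdio.h>

/* Facility 0 is bit 0x80 of byte 0; facility 9 is bit 0x40 of byte 1, etc. */
static int fac_test(const unsigned char *fac, unsigned long nr, unsigned long maxbytes)
{
        if (nr >= maxbytes * 8)
                return 0;
        return (fac[nr >> 3] & (0x80 >> (nr & 7))) != 0;
}

int main(void)
{
        /* byte view of the 0xff82fff3f47c0000 mask applied to vfacilities[0] */
        unsigned char fac[8] = { 0xff, 0x82, 0xff, 0xf3, 0xf4, 0x7c, 0x00, 0x00 };

        printf("facility 0:  %d\n", fac_test(fac, 0, sizeof(fac)));   /* 1 */
        printf("facility 9:  %d\n", fac_test(fac, 9, sizeof(fac)));   /* 0 */
        printf("facility 76: %d\n", fac_test(fac, 76, sizeof(fac)));  /* 0: out of range */
        return 0;
}
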
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 028ca9fd2158..dc99f1ca4267 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -24,6 +24,9 @@
24 24
25typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); 25typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
26 26
27/* declare vfacilities extern */
28extern unsigned long *vfacilities;
29
27/* negativ values are error codes, positive values for internal conditions */ 30/* negativ values are error codes, positive values for internal conditions */
28#define SIE_INTERCEPT_RERUNVCPU (1<<0) 31#define SIE_INTERCEPT_RERUNVCPU (1<<0)
29#define SIE_INTERCEPT_UCONTROL (1<<1) 32#define SIE_INTERCEPT_UCONTROL (1<<1)
@@ -112,6 +115,13 @@ static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
112 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; 115 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
113} 116}
114 117
118/* Set the condition code in the guest program status word */
119static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc)
120{
121 vcpu->arch.sie_block->gpsw.mask &= ~(3UL << 44);
122 vcpu->arch.sie_block->gpsw.mask |= cc << 44;
123}
124
115int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); 125int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
116enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); 126enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
117void kvm_s390_tasklet(unsigned long parm); 127void kvm_s390_tasklet(unsigned long parm);
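
kvm_s390_set_psw_cc centralizes the clear-then-set update of the two-bit condition-code field at bit position 44 of the PSW mask; its callers are trusted to pass a value in 0..3. A generic sketch of the same read-modify-write, masking the input defensively:

#include <stdint.h>
#include <stdio.h>

#define CC_SHIFT 44
#define CC_MASK  (3ULL << CC_SHIFT)

/* Clear the two-bit field, then OR in the new condition code. */
static void set_cc(uint64_t *psw_mask, unsigned int cc)
{
        *psw_mask &= ~CC_MASK;
        *psw_mask |= ((uint64_t)cc & 3) << CC_SHIFT;
}

int main(void)
{
        uint64_t mask = 0x0705100180000000ULL;  /* arbitrary example value */

        set_cc(&mask, 3);
        printf("cc=%llu\n", (unsigned long long)((mask & CC_MASK) >> CC_SHIFT));
        set_cc(&mask, 0);
        printf("cc=%llu\n", (unsigned long long)((mask & CC_MASK) >> CC_SHIFT));
        return 0;
}
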
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 4cdc54e63ebc..59200ee275e5 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -164,8 +164,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
164 kfree(inti); 164 kfree(inti);
165no_interrupt: 165no_interrupt:
166 /* Set condition code and we're done. */ 166 /* Set condition code and we're done. */
167 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 167 kvm_s390_set_psw_cc(vcpu, cc);
168 vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
169 return 0; 168 return 0;
170} 169}
171 170
@@ -220,15 +219,13 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
220 * Set condition code 3 to stop the guest from issueing channel 219 * Set condition code 3 to stop the guest from issueing channel
221 * I/O instructions. 220 * I/O instructions.
222 */ 221 */
223 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 222 kvm_s390_set_psw_cc(vcpu, 3);
224 vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
225 return 0; 223 return 0;
226 } 224 }
227} 225}
228 226
229static int handle_stfl(struct kvm_vcpu *vcpu) 227static int handle_stfl(struct kvm_vcpu *vcpu)
230{ 228{
231 unsigned int facility_list;
232 int rc; 229 int rc;
233 230
234 vcpu->stat.instruction_stfl++; 231 vcpu->stat.instruction_stfl++;
@@ -236,15 +233,13 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
236 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 233 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
237 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 234 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
238 235
239 /* only pass the facility bits, which we can handle */
240 facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
241
242 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), 236 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
243 &facility_list, sizeof(facility_list)); 237 vfacilities, 4);
244 if (rc) 238 if (rc)
245 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 239 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
246 VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list); 240 VCPU_EVENT(vcpu, 5, "store facility list value %x",
247 trace_kvm_s390_handle_stfl(vcpu, facility_list); 241 *(unsigned int *) vfacilities);
242 trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
248 return 0; 243 return 0;
249} 244}
250 245
@@ -387,7 +382,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
387 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 382 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
388 383
389 if (fc > 3) { 384 if (fc > 3) {
390 vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; /* cc 3 */ 385 kvm_s390_set_psw_cc(vcpu, 3);
391 return 0; 386 return 0;
392 } 387 }
393 388
@@ -397,7 +392,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
397 392
398 if (fc == 0) { 393 if (fc == 0) {
399 vcpu->run->s.regs.gprs[0] = 3 << 28; 394 vcpu->run->s.regs.gprs[0] = 3 << 28;
400 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); /* cc 0 */ 395 kvm_s390_set_psw_cc(vcpu, 0);
401 return 0; 396 return 0;
402 } 397 }
403 398
@@ -431,12 +426,11 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
431 } 426 }
432 trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); 427 trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
433 free_page(mem); 428 free_page(mem);
434 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 429 kvm_s390_set_psw_cc(vcpu, 0);
435 vcpu->run->s.regs.gprs[0] = 0; 430 vcpu->run->s.regs.gprs[0] = 0;
436 return 0; 431 return 0;
437out_no_data: 432out_no_data:
438 /* condition code 3 */ 433 kvm_s390_set_psw_cc(vcpu, 3);
439 vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
440out_exception: 434out_exception:
441 free_page(mem); 435 free_page(mem);
442 return rc; 436 return rc;
@@ -494,12 +488,12 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
494 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2); 488 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
495 489
496 /* This basically extracts the mask half of the psw. */ 490 /* This basically extracts the mask half of the psw. */
497 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000; 491 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000UL;
498 vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32; 492 vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
499 if (reg2) { 493 if (reg2) {
500 vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000; 494 vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000UL;
501 vcpu->run->s.regs.gprs[reg2] |= 495 vcpu->run->s.regs.gprs[reg2] |=
502 vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff; 496 vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffffUL;
503 } 497 }
504 return 0; 498 return 0;
505} 499}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 6d16132d0850..bf7c0dc64a76 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -335,7 +335,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
335 335
336 if ((from | to | len) & (PMD_SIZE - 1)) 336 if ((from | to | len) & (PMD_SIZE - 1))
337 return -EINVAL; 337 return -EINVAL;
338 if (len == 0 || from + len > PGDIR_SIZE || 338 if (len == 0 || from + len > TASK_MAX_SIZE ||
339 from + len < from || to + len < to) 339 from + len < from || to + len < to)
340 return -EINVAL; 340 return -EINVAL;
341 341
@@ -732,6 +732,11 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
732 spin_unlock(&gmap_notifier_lock); 732 spin_unlock(&gmap_notifier_lock);
733} 733}
734 734
735static inline int page_table_with_pgste(struct page *page)
736{
737 return atomic_read(&page->_mapcount) == 0;
738}
739
735static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, 740static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
736 unsigned long vmaddr) 741 unsigned long vmaddr)
737{ 742{
@@ -751,7 +756,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
751 mp->vmaddr = vmaddr & PMD_MASK; 756 mp->vmaddr = vmaddr & PMD_MASK;
752 INIT_LIST_HEAD(&mp->mapper); 757 INIT_LIST_HEAD(&mp->mapper);
753 page->index = (unsigned long) mp; 758 page->index = (unsigned long) mp;
754 atomic_set(&page->_mapcount, 3); 759 atomic_set(&page->_mapcount, 0);
755 table = (unsigned long *) page_to_phys(page); 760 table = (unsigned long *) page_to_phys(page);
756 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); 761 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
757 clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT, 762 clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
@@ -818,6 +823,11 @@ EXPORT_SYMBOL(set_guest_storage_key);
818 823
819#else /* CONFIG_PGSTE */ 824#else /* CONFIG_PGSTE */
820 825
826static inline int page_table_with_pgste(struct page *page)
827{
828 return 0;
829}
830
821static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, 831static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
822 unsigned long vmaddr) 832 unsigned long vmaddr)
823{ 833{
@@ -894,12 +904,12 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
894 struct page *page; 904 struct page *page;
895 unsigned int bit, mask; 905 unsigned int bit, mask;
896 906
897 if (mm_has_pgste(mm)) { 907 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
908 if (page_table_with_pgste(page)) {
898 gmap_disconnect_pgtable(mm, table); 909 gmap_disconnect_pgtable(mm, table);
899 return page_table_free_pgste(table); 910 return page_table_free_pgste(table);
900 } 911 }
901 /* Free 1K/2K page table fragment of a 4K page */ 912 /* Free 1K/2K page table fragment of a 4K page */
902 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
903 bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); 913 bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
904 spin_lock_bh(&mm->context.list_lock); 914 spin_lock_bh(&mm->context.list_lock);
905 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) 915 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
@@ -937,14 +947,14 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
937 unsigned int bit, mask; 947 unsigned int bit, mask;
938 948
939 mm = tlb->mm; 949 mm = tlb->mm;
940 if (mm_has_pgste(mm)) { 950 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
951 if (page_table_with_pgste(page)) {
941 gmap_disconnect_pgtable(mm, table); 952 gmap_disconnect_pgtable(mm, table);
942 table = (unsigned long *) (__pa(table) | FRAG_MASK); 953 table = (unsigned long *) (__pa(table) | FRAG_MASK);
943 tlb_remove_table(tlb, table); 954 tlb_remove_table(tlb, table);
944 return; 955 return;
945 } 956 }
946 bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t))); 957 bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
947 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
948 spin_lock_bh(&mm->context.list_lock); 958 spin_lock_bh(&mm->context.list_lock);
949 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) 959 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
950 list_del(&page->lru); 960 list_del(&page->lru);
@@ -1030,36 +1040,120 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
1030} 1040}
1031 1041
1032#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1042#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1033void thp_split_vma(struct vm_area_struct *vma) 1043static inline void thp_split_vma(struct vm_area_struct *vma)
1034{ 1044{
1035 unsigned long addr; 1045 unsigned long addr;
1036 struct page *page;
1037 1046
1038 for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { 1047 for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
1039 page = follow_page(vma, addr, FOLL_SPLIT); 1048 follow_page(vma, addr, FOLL_SPLIT);
1040 }
1041} 1049}
1042 1050
1043void thp_split_mm(struct mm_struct *mm) 1051static inline void thp_split_mm(struct mm_struct *mm)
1044{ 1052{
1045 struct vm_area_struct *vma = mm->mmap; 1053 struct vm_area_struct *vma;
1046 1054
1047 while (vma != NULL) { 1055 for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
1048 thp_split_vma(vma); 1056 thp_split_vma(vma);
1049 vma->vm_flags &= ~VM_HUGEPAGE; 1057 vma->vm_flags &= ~VM_HUGEPAGE;
1050 vma->vm_flags |= VM_NOHUGEPAGE; 1058 vma->vm_flags |= VM_NOHUGEPAGE;
1051 vma = vma->vm_next;
1052 } 1059 }
1060 mm->def_flags |= VM_NOHUGEPAGE;
1061}
1062#else
1063static inline void thp_split_mm(struct mm_struct *mm)
1064{
1053} 1065}
1054#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1066#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1055 1067
1068static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
1069 struct mm_struct *mm, pud_t *pud,
1070 unsigned long addr, unsigned long end)
1071{
1072 unsigned long next, *table, *new;
1073 struct page *page;
1074 pmd_t *pmd;
1075
1076 pmd = pmd_offset(pud, addr);
1077 do {
1078 next = pmd_addr_end(addr, end);
1079again:
1080 if (pmd_none_or_clear_bad(pmd))
1081 continue;
1082 table = (unsigned long *) pmd_deref(*pmd);
1083 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1084 if (page_table_with_pgste(page))
1085 continue;
1086 /* Allocate new page table with pgstes */
1087 new = page_table_alloc_pgste(mm, addr);
1088 if (!new) {
1089 mm->context.has_pgste = 0;
1090 continue;
1091 }
1092 spin_lock(&mm->page_table_lock);
1093 if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
1094 /* Nuke pmd entry pointing to the "short" page table */
1095 pmdp_flush_lazy(mm, addr, pmd);
1096 pmd_clear(pmd);
1097 /* Copy ptes from old table to new table */
1098 memcpy(new, table, PAGE_SIZE/2);
1099 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
1100 /* Establish new table */
1101 pmd_populate(mm, pmd, (pte_t *) new);
1102 /* Free old table with rcu, there might be a walker! */
1103 page_table_free_rcu(tlb, table);
1104 new = NULL;
1105 }
1106 spin_unlock(&mm->page_table_lock);
1107 if (new) {
1108 page_table_free_pgste(new);
1109 goto again;
1110 }
1111 } while (pmd++, addr = next, addr != end);
1112
1113 return addr;
1114}
1115
1116static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
1117 struct mm_struct *mm, pgd_t *pgd,
1118 unsigned long addr, unsigned long end)
1119{
1120 unsigned long next;
1121 pud_t *pud;
1122
1123 pud = pud_offset(pgd, addr);
1124 do {
1125 next = pud_addr_end(addr, end);
1126 if (pud_none_or_clear_bad(pud))
1127 continue;
1128 next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
1129 } while (pud++, addr = next, addr != end);
1130
1131 return addr;
1132}
1133
1134static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
1135 unsigned long addr, unsigned long end)
1136{
1137 unsigned long next;
1138 pgd_t *pgd;
1139
1140 pgd = pgd_offset(mm, addr);
1141 do {
1142 next = pgd_addr_end(addr, end);
1143 if (pgd_none_or_clear_bad(pgd))
1144 continue;
1145 next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
1146 } while (pgd++, addr = next, addr != end);
1147}
1148
1056/* 1149/*
1057 * switch on pgstes for its userspace process (for kvm) 1150 * switch on pgstes for its userspace process (for kvm)
1058 */ 1151 */
1059int s390_enable_sie(void) 1152int s390_enable_sie(void)
1060{ 1153{
1061 struct task_struct *tsk = current; 1154 struct task_struct *tsk = current;
1062 struct mm_struct *mm, *old_mm; 1155 struct mm_struct *mm = tsk->mm;
1156 struct mmu_gather tlb;
1063 1157
1064 /* Do we have switched amode? If no, we cannot do sie */ 1158 /* Do we have switched amode? If no, we cannot do sie */
1065 if (s390_user_mode == HOME_SPACE_MODE) 1159 if (s390_user_mode == HOME_SPACE_MODE)
@@ -1069,57 +1163,16 @@ int s390_enable_sie(void)
1069 if (mm_has_pgste(tsk->mm)) 1163 if (mm_has_pgste(tsk->mm))
1070 return 0; 1164 return 0;
1071 1165
1072 /* lets check if we are allowed to replace the mm */ 1166 down_write(&mm->mmap_sem);
1073 task_lock(tsk);
1074 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
1075#ifdef CONFIG_AIO
1076 !hlist_empty(&tsk->mm->ioctx_list) ||
1077#endif
1078 tsk->mm != tsk->active_mm) {
1079 task_unlock(tsk);
1080 return -EINVAL;
1081 }
1082 task_unlock(tsk);
1083
1084 /* we copy the mm and let dup_mm create the page tables with_pgstes */
1085 tsk->mm->context.alloc_pgste = 1;
1086 /* make sure that both mms have a correct rss state */
1087 sync_mm_rss(tsk->mm);
1088 mm = dup_mm(tsk);
1089 tsk->mm->context.alloc_pgste = 0;
1090 if (!mm)
1091 return -ENOMEM;
1092
1093#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1094 /* split thp mappings and disable thp for future mappings */ 1167 /* split thp mappings and disable thp for future mappings */
1095 thp_split_mm(mm); 1168 thp_split_mm(mm);
1096 mm->def_flags |= VM_NOHUGEPAGE; 1169 /* Reallocate the page tables with pgstes */
1097#endif 1170 mm->context.has_pgste = 1;
1098 1171 tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
1099 /* Now lets check again if something happened */ 1172 page_table_realloc(&tlb, mm, 0, TASK_SIZE);
1100 task_lock(tsk); 1173 tlb_finish_mmu(&tlb, 0, TASK_SIZE);
1101 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || 1174 up_write(&mm->mmap_sem);
1102#ifdef CONFIG_AIO 1175 return mm->context.has_pgste ? 0 : -ENOMEM;
1103 !hlist_empty(&tsk->mm->ioctx_list) ||
1104#endif
1105 tsk->mm != tsk->active_mm) {
1106 mmput(mm);
1107 task_unlock(tsk);
1108 return -EINVAL;
1109 }
1110
1111 /* ok, we are alone. No ptrace, no threads, etc. */
1112 old_mm = tsk->mm;
1113 tsk->mm = tsk->active_mm = mm;
1114 preempt_disable();
1115 update_mm(mm, tsk);
1116 atomic_inc(&mm->context.attach_count);
1117 atomic_dec(&old_mm->context.attach_count);
1118 cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
1119 preempt_enable();
1120 task_unlock(tsk);
1121 mmput(old_mm);
1122 return 0;
1123} 1176}
1124EXPORT_SYMBOL_GPL(s390_enable_sie); 1177EXPORT_SYMBOL_GPL(s390_enable_sie);
1125 1178
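
page_table_realloc_pmd allocates the pgste-capable table outside the lock, revalidates the pmd under page_table_lock, and either publishes the new table or frees it and retries. A minimal pthreads sketch of that optimistic publish-or-retry shape (toy data, not the kernel walker):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *slot;               /* stands in for the pmd entry */

/* Build the replacement unlocked, publish it only if the slot still holds
 * what it was based on, otherwise discard the work and try again. */
static void upgrade_slot(void)
{
        int *expected, *fresh;

again:
        expected = slot;                        /* snapshot */
        fresh = malloc(sizeof(*fresh));         /* may sleep, so done unlocked */
        *fresh = expected ? *expected + 100 : 100;

        pthread_mutex_lock(&lock);
        if (slot == expected) {                 /* nothing changed underneath us */
                slot = fresh;
                fresh = NULL;
        }
        pthread_mutex_unlock(&lock);

        if (fresh) {                            /* lost the race: clean up, retry */
                free(fresh);
                goto again;
        }
}

int main(void)
{
        upgrade_slot();
        printf("slot value: %d\n", *slot);
        free(slot);
        return 0;
}
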
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f87f7fcefa0a..c76ff74a98f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
286 u64 *pae_root; 286 u64 *pae_root;
287 u64 *lm_root; 287 u64 *lm_root;
288 u64 rsvd_bits_mask[2][4]; 288 u64 rsvd_bits_mask[2][4];
289 u64 bad_mt_xwr;
289 290
290 /* 291 /*
291 * Bitmap: bit set = last pte in walk 292 * Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
323 u64 global_ovf_ctrl; 324 u64 global_ovf_ctrl;
324 u64 counter_bitmask[2]; 325 u64 counter_bitmask[2];
325 u64 global_ctrl_mask; 326 u64 global_ctrl_mask;
327 u64 reserved_bits;
326 u8 version; 328 u8 version;
327 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; 329 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
328 struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; 330 struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
511 * instruction. 513 * instruction.
512 */ 514 */
513 bool write_fault_to_shadow_pgtable; 515 bool write_fault_to_shadow_pgtable;
516
517 /* set at EPT violation at this point */
518 unsigned long exit_qualification;
519
520 /* pv related host specific info */
521 struct {
522 bool pv_unhalted;
523 } pv;
514}; 524};
515 525
516struct kvm_lpage_info { 526struct kvm_lpage_info {
@@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz;
802extern u32 kvm_max_guest_tsc_khz; 812extern u32 kvm_max_guest_tsc_khz;
803 813
804enum emulation_result { 814enum emulation_result {
805 EMULATE_DONE, /* no further processing */ 815 EMULATE_DONE, /* no further processing */
806 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ 816 EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */
807 EMULATE_FAIL, /* can't emulate this instruction */ 817 EMULATE_FAIL, /* can't emulate this instruction */
808}; 818};
809 819
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d454..be8269b00e2a 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
93 93
94struct pvclock_vsyscall_time_info { 94struct pvclock_vsyscall_time_info {
95 struct pvclock_vcpu_time_info pvti; 95 struct pvclock_vcpu_time_info pvti;
96 u32 migrate_count;
97} __attribute__((__aligned__(SMP_CACHE_BYTES))); 96} __attribute__((__aligned__(SMP_CACHE_BYTES)));
98 97
99#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) 98#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa1..966502d4682e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
387#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 387#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
388#define VMX_EPT_EXTENT_CONTEXT 1 388#define VMX_EPT_EXTENT_CONTEXT 1
389#define VMX_EPT_EXTENT_GLOBAL 2 389#define VMX_EPT_EXTENT_GLOBAL 2
390#define VMX_EPT_EXTENT_SHIFT 24
390 391
391#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) 392#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
392#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) 393#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
394#define VMX_EPTP_WB_BIT (1ull << 14) 395#define VMX_EPTP_WB_BIT (1ull << 14)
395#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 396#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
396#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 397#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
398#define VMX_EPT_INVEPT_BIT (1ull << 20)
397#define VMX_EPT_AD_BIT (1ull << 21) 399#define VMX_EPT_AD_BIT (1ull << 21)
398#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 400#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
399#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 401#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf7..0e79420376eb 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
65#define EXIT_REASON_EOI_INDUCED 45 65#define EXIT_REASON_EOI_INDUCED 45
66#define EXIT_REASON_EPT_VIOLATION 48 66#define EXIT_REASON_EPT_VIOLATION 48
67#define EXIT_REASON_EPT_MISCONFIG 49 67#define EXIT_REASON_EPT_MISCONFIG 49
68#define EXIT_REASON_INVEPT 50
68#define EXIT_REASON_PREEMPTION_TIMER 52 69#define EXIT_REASON_PREEMPTION_TIMER 52
69#define EXIT_REASON_WBINVD 54 70#define EXIT_REASON_WBINVD 54
70#define EXIT_REASON_XSETBV 55 71#define EXIT_REASON_XSETBV 55
@@ -106,12 +107,13 @@
106 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ 107 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
107 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ 108 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
108 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ 109 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
110 { EXIT_REASON_INVEPT, "INVEPT" }, \
111 { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
109 { EXIT_REASON_WBINVD, "WBINVD" }, \ 112 { EXIT_REASON_WBINVD, "WBINVD" }, \
110 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ 113 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
111 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ 114 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
112 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ 115 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
113 { EXIT_REASON_INVD, "INVD" }, \ 116 { EXIT_REASON_INVD, "INVD" }, \
114 { EXIT_REASON_INVPCID, "INVPCID" }, \ 117 { EXIT_REASON_INVPCID, "INVPCID" }
115 { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }
116 118
117#endif /* _UAPIVMX_H */ 119#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2cb9470ea85b..a16bae3f83b3 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
129} 129}
130 130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64 131#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/* 132/*
172 * Initialize the generic pvclock vsyscall state. This will allocate 133 * Initialize the generic pvclock vsyscall state. This will allocate
173 * a/some page(s) for the per-vcpu pvclock information, set up a 134 * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
181 142
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 143 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183 144
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 145 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 146 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa(i) + (idx*PAGE_SIZE), 147 __pa(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR); 148 PAGE_KERNEL_VVAR);
190 } 149 }
191 150
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0; 151 return 0;
196} 152}
197#endif 153#endif
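
With the migration notifier gone, vsyscall readers depend on the usual pvclock version protocol: the producer makes the version odd while it rewrites the record and even again afterwards, and readers retry on any change. A simplified reader loop under that assumption (real readers also need memory barriers):

#include <stdint.h>
#include <stdio.h>

/* Simplified shape of a pvclock-style time record. */
struct toy_time_info {
        volatile uint32_t version;
        volatile uint64_t system_time;
};

static uint64_t read_time(const struct toy_time_info *ti)
{
        uint32_t v;
        uint64_t t;

        for (;;) {
                v = ti->version;
                if (v & 1)
                        continue;               /* writer is mid-update */
                t = ti->system_time;            /* snapshot the payload */
                if (ti->version == v)
                        break;                  /* nothing changed: snapshot is good */
        }
        return t;
}

int main(void)
{
        struct toy_time_info ti = { .version = 2, .system_time = 123456789 };

        printf("time: %llu\n", (unsigned long long)read_time(&ti));
        return 0;
}
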
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a20ecb5b6cbf..b110fe6c03d4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
413 (1 << KVM_FEATURE_CLOCKSOURCE2) | 413 (1 << KVM_FEATURE_CLOCKSOURCE2) |
414 (1 << KVM_FEATURE_ASYNC_PF) | 414 (1 << KVM_FEATURE_ASYNC_PF) |
415 (1 << KVM_FEATURE_PV_EOI) | 415 (1 << KVM_FEATURE_PV_EOI) |
416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
417 (1 << KVM_FEATURE_PV_UNHALT);
417 418
418 if (sched_info_on()) 419 if (sched_info_on())
419 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); 420 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
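
A guest discovers KVM_FEATURE_PV_UNHALT through the KVM feature CPUID leaf before enabling paravirtual spinlock support. A rough probe, assuming the leaf is 0x40000001 and the bit number is 7; a real guest would first verify the hypervisor signature at leaf 0x40000000:

#include <stdio.h>

/* Assumed constants for illustration; check the kernel headers. */
#define KVM_CPUID_FEATURES    0x40000001u
#define KVM_FEATURE_PV_UNHALT 7

static unsigned int cpuid_eax(unsigned int leaf)
{
        unsigned int a, b, c, d;

        __asm__ volatile("cpuid"
                         : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                         : "a"(leaf), "c"(0));
        (void)b; (void)c; (void)d;
        return a;
}

int main(void)
{
        unsigned int eax = cpuid_eax(KVM_CPUID_FEATURES);

        printf("PV_UNHALT %savailable\n",
               (eax & (1u << KVM_FEATURE_PV_UNHALT)) ? "" : "not ");
        return 0;
}
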
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc11245827c..5439117d5c4c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
79 *((u32 *) (apic->regs + reg_off)) = val; 79 *((u32 *) (apic->regs + reg_off)) = val;
80} 80}
81 81
82static inline int apic_test_and_set_vector(int vec, void *bitmap)
83{
84 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
85}
86
87static inline int apic_test_and_clear_vector(int vec, void *bitmap)
88{
89 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
90}
91
92static inline int apic_test_vector(int vec, void *bitmap) 82static inline int apic_test_vector(int vec, void *bitmap)
93{ 83{
94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 84 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
331} 321}
332EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 322EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
333 323
334static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 324static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
335{ 325{
336 apic->irr_pending = true; 326 apic->irr_pending = true;
337 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); 327 apic_set_vector(vec, apic->regs + APIC_IRR);
338} 328}
339 329
340static inline int apic_search_irr(struct kvm_lapic *apic) 330static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
681 if (unlikely(!apic_enabled(apic))) 671 if (unlikely(!apic_enabled(apic)))
682 break; 672 break;
683 673
674 result = 1;
675
684 if (dest_map) 676 if (dest_map)
685 __set_bit(vcpu->vcpu_id, dest_map); 677 __set_bit(vcpu->vcpu_id, dest_map);
686 678
687 if (kvm_x86_ops->deliver_posted_interrupt) { 679 if (kvm_x86_ops->deliver_posted_interrupt)
688 result = 1;
689 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); 680 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
690 } else { 681 else {
691 result = !apic_test_and_set_irr(vector, apic); 682 apic_set_irr(vector, apic);
692
693 if (!result) {
694 if (trig_mode)
695 apic_debug("level trig mode repeatedly "
696 "for vector %d", vector);
697 goto out;
698 }
699 683
700 kvm_make_request(KVM_REQ_EVENT, vcpu); 684 kvm_make_request(KVM_REQ_EVENT, vcpu);
701 kvm_vcpu_kick(vcpu); 685 kvm_vcpu_kick(vcpu);
702 } 686 }
703out:
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 687 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
705 trig_mode, vector, !result); 688 trig_mode, vector, false);
706 break; 689 break;
707 690
708 case APIC_DM_REMRD: 691 case APIC_DM_REMRD:
709 apic_debug("Ignoring delivery mode 3\n"); 692 result = 1;
693 vcpu->arch.pv.pv_unhalted = 1;
694 kvm_make_request(KVM_REQ_EVENT, vcpu);
695 kvm_vcpu_kick(vcpu);
710 break; 696 break;
711 697
712 case APIC_DM_SMI: 698 case APIC_DM_SMI:
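
Treating a REMRD-mode IPI as a PV unhalt means latching a flag for the target vcpu and waking it, so a kick is not lost even if it arrives before the vcpu actually halts. A generic flag-plus-wakeup sketch of that idea in pthreads, not KVM's request machinery:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool pv_unhalted;        /* latched kick, consumed by the sleeper */

static void *halted_vcpu(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!pv_unhalted)                    /* "HLT": sleep until kicked */
                pthread_cond_wait(&wake, &lock);
        pv_unhalted = false;                    /* consume the kick */
        pthread_mutex_unlock(&lock);
        puts("vcpu resumed");
        return NULL;
}

static void kick_vcpu(void)
{
        pthread_mutex_lock(&lock);
        pv_unhalted = true;                     /* record the event... */
        pthread_cond_signal(&wake);             /* ...and wake the waiter */
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, halted_vcpu, NULL);
        kick_vcpu();
        pthread_join(t, NULL);
        return 0;
}
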
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e9285ae9b94..6e2d2c8f230b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
133 * PT32_LEVEL_BITS))) - 1)) 133 * PT32_LEVEL_BITS))) - 1))
134 134
135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
136 | PT64_NX_MASK) 136 | shadow_x_mask | shadow_nx_mask)
137 137
138#define ACC_EXEC_MASK 1 138#define ACC_EXEC_MASK 1
139#define ACC_WRITE_MASK PT_WRITABLE_MASK 139#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
331 return pte & PT_PAGE_SIZE_MASK; 331 return pte & PT_PAGE_SIZE_MASK;
332} 332}
333 333
334static int is_dirty_gpte(unsigned long pte)
335{
336 return pte & PT_DIRTY_MASK;
337}
338
339static int is_rmap_spte(u64 pte) 334static int is_rmap_spte(u64 pte)
340{ 335{
341 return is_shadow_present_pte(pte); 336 return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2052 return __shadow_walk_next(iterator, *iterator->sptep); 2047 return __shadow_walk_next(iterator, *iterator->sptep);
2053} 2048}
2054 2049
2055static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 2050static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
2056{ 2051{
2057 u64 spte; 2052 u64 spte;
2058 2053
2054 BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
2055 VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2056
2059 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2057 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2060 shadow_user_mask | shadow_x_mask | shadow_accessed_mask; 2058 shadow_user_mask | shadow_x_mask;
2059
2060 if (accessed)
2061 spte |= shadow_accessed_mask;
2061 2062
2062 mmu_spte_set(sptep, spte); 2063 mmu_spte_set(sptep, spte);
2063} 2064}
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2574 mmu_free_roots(vcpu); 2575 mmu_free_roots(vcpu);
2575} 2576}
2576 2577
2577static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2578{
2579 int bit7;
2580
2581 bit7 = (gpte >> 7) & 1;
2582 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2583}
2584
2585static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2578static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2586 bool no_dirty_log) 2579 bool no_dirty_log)
2587{ 2580{
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2594 return gfn_to_pfn_memslot_atomic(slot, gfn); 2587 return gfn_to_pfn_memslot_atomic(slot, gfn);
2595} 2588}
2596 2589
2597static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
2598 struct kvm_mmu_page *sp, u64 *spte,
2599 u64 gpte)
2600{
2601 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
2602 goto no_present;
2603
2604 if (!is_present_gpte(gpte))
2605 goto no_present;
2606
2607 if (!(gpte & PT_ACCESSED_MASK))
2608 goto no_present;
2609
2610 return false;
2611
2612no_present:
2613 drop_spte(vcpu->kvm, spte);
2614 return true;
2615}
2616
2617static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2590static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2618 struct kvm_mmu_page *sp, 2591 struct kvm_mmu_page *sp,
2619 u64 *start, u64 *end) 2592 u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2710 iterator.level - 1, 2683 iterator.level - 1,
2711 1, ACC_ALL, iterator.sptep); 2684 1, ACC_ALL, iterator.sptep);
2712 2685
2713 link_shadow_page(iterator.sptep, sp); 2686 link_shadow_page(iterator.sptep, sp, true);
2714 } 2687 }
2715 } 2688 }
2716 return emulate; 2689 return emulate;
@@ -2808,7 +2781,7 @@ exit:
2808 return ret; 2781 return ret;
2809} 2782}
2810 2783
2811static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) 2784static bool page_fault_can_be_fast(u32 error_code)
2812{ 2785{
2813 /* 2786 /*
2814 * Do not fix the mmio spte with invalid generation number which 2787 * Do not fix the mmio spte with invalid generation number which
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2861 bool ret = false; 2834 bool ret = false;
2862 u64 spte = 0ull; 2835 u64 spte = 0ull;
2863 2836
2864 if (!page_fault_can_be_fast(vcpu, error_code)) 2837 if (!page_fault_can_be_fast(error_code))
2865 return false; 2838 return false;
2866 2839
2867 walk_shadow_page_lockless_begin(vcpu); 2840 walk_shadow_page_lockless_begin(vcpu);
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3209 mmu_sync_roots(vcpu); 3182 mmu_sync_roots(vcpu);
3210 spin_unlock(&vcpu->kvm->mmu_lock); 3183 spin_unlock(&vcpu->kvm->mmu_lock);
3211} 3184}
3185EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3212 3186
3213static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 3187static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3214 u32 access, struct x86_exception *exception) 3188 u32 access, struct x86_exception *exception)
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3478 ++vcpu->stat.tlb_flush; 3452 ++vcpu->stat.tlb_flush;
3479 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3453 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3480} 3454}
3455EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
3481 3456
3482static void paging_new_cr3(struct kvm_vcpu *vcpu) 3457static void paging_new_cr3(struct kvm_vcpu *vcpu)
3483{ 3458{
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
3501 nonpaging_free(vcpu); 3476 nonpaging_free(vcpu);
3502} 3477}
3503 3478
3504static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3505{
3506 unsigned mask;
3507
3508 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3509
3510 mask = (unsigned)~ACC_WRITE_MASK;
3511 /* Allow write access to dirty gptes */
3512 mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3513 *access &= mask;
3514}
3515
3516static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, 3479static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3517 unsigned access, int *nr_present) 3480 unsigned access, int *nr_present)
3518{ 3481{
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3530 return false; 3493 return false;
3531} 3494}
3532 3495
3533static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3534{
3535 unsigned access;
3536
3537 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3538 access &= ~(gpte >> PT64_NX_SHIFT);
3539
3540 return access;
3541}
3542
3543static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) 3496static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3544{ 3497{
3545 unsigned index; 3498 unsigned index;
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
3549 return mmu->last_pte_bitmap & (1 << index); 3502 return mmu->last_pte_bitmap & (1 << index);
3550} 3503}
3551 3504
3505#define PTTYPE_EPT 18 /* arbitrary */
3506#define PTTYPE PTTYPE_EPT
3507#include "paging_tmpl.h"
3508#undef PTTYPE
3509
3552#define PTTYPE 64 3510#define PTTYPE 64
3553#include "paging_tmpl.h" 3511#include "paging_tmpl.h"
3554#undef PTTYPE 3512#undef PTTYPE
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3563 int maxphyaddr = cpuid_maxphyaddr(vcpu); 3521 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3564 u64 exb_bit_rsvd = 0; 3522 u64 exb_bit_rsvd = 0;
3565 3523
3524 context->bad_mt_xwr = 0;
3525
3566 if (!context->nx) 3526 if (!context->nx)
3567 exb_bit_rsvd = rsvd_bits(63, 63); 3527 exb_bit_rsvd = rsvd_bits(63, 63);
3568 switch (context->root_level) { 3528 switch (context->root_level) {
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3618 } 3578 }
3619} 3579}
3620 3580
3621static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3581static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
3582 struct kvm_mmu *context, bool execonly)
3583{
3584 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3585 int pte;
3586
3587 context->rsvd_bits_mask[0][3] =
3588 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
3589 context->rsvd_bits_mask[0][2] =
3590 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3591 context->rsvd_bits_mask[0][1] =
3592 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3593 context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
3594
3595 /* large page */
3596 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
3597 context->rsvd_bits_mask[1][2] =
3598 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
3599 context->rsvd_bits_mask[1][1] =
3600 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
3601 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
3602
3603 for (pte = 0; pte < 64; pte++) {
3604 int rwx_bits = pte & 7;
3605 int mt = pte >> 3;
3606 if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
3607 rwx_bits == 0x2 || rwx_bits == 0x6 ||
3608 (rwx_bits == 0x4 && !execonly))
3609 context->bad_mt_xwr |= (1ull << pte);
3610 }
3611}
3612
3613static void update_permission_bitmask(struct kvm_vcpu *vcpu,
3614 struct kvm_mmu *mmu, bool ept)
3622{ 3615{
3623 unsigned bit, byte, pfec; 3616 unsigned bit, byte, pfec;
3624 u8 map; 3617 u8 map;
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
3636 w = bit & ACC_WRITE_MASK; 3629 w = bit & ACC_WRITE_MASK;
3637 u = bit & ACC_USER_MASK; 3630 u = bit & ACC_USER_MASK;
3638 3631
3639 /* Not really needed: !nx will cause pte.nx to fault */ 3632 if (!ept) {
3640 x |= !mmu->nx; 3633 /* Not really needed: !nx will cause pte.nx to fault */
3641 /* Allow supervisor writes if !cr0.wp */ 3634 x |= !mmu->nx;
3642 w |= !is_write_protection(vcpu) && !uf; 3635 /* Allow supervisor writes if !cr0.wp */
3643 /* Disallow supervisor fetches of user code if cr4.smep */ 3636 w |= !is_write_protection(vcpu) && !uf;
3644 x &= !(smep && u && !uf); 3637 /* Disallow supervisor fetches of user code if cr4.smep */
3638 x &= !(smep && u && !uf);
3639 } else
3640 /* Not really needed: no U/S accesses on ept */
3641 u = 1;
3645 3642
3646 fault = (ff && !x) || (uf && !u) || (wf && !w); 3643 fault = (ff && !x) || (uf && !u) || (wf && !w);
3647 map |= fault << bit; 3644 map |= fault << bit;
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3676 context->root_level = level; 3673 context->root_level = level;
3677 3674
3678 reset_rsvds_bits_mask(vcpu, context); 3675 reset_rsvds_bits_mask(vcpu, context);
3679 update_permission_bitmask(vcpu, context); 3676 update_permission_bitmask(vcpu, context, false);
3680 update_last_pte_bitmap(vcpu, context); 3677 update_last_pte_bitmap(vcpu, context);
3681 3678
3682 ASSERT(is_pae(vcpu)); 3679 ASSERT(is_pae(vcpu));
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3706 context->root_level = PT32_ROOT_LEVEL; 3703 context->root_level = PT32_ROOT_LEVEL;
3707 3704
3708 reset_rsvds_bits_mask(vcpu, context); 3705 reset_rsvds_bits_mask(vcpu, context);
3709 update_permission_bitmask(vcpu, context); 3706 update_permission_bitmask(vcpu, context, false);
3710 update_last_pte_bitmap(vcpu, context); 3707 update_last_pte_bitmap(vcpu, context);
3711 3708
3712 context->new_cr3 = paging_new_cr3; 3709 context->new_cr3 = paging_new_cr3;
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3768 context->gva_to_gpa = paging32_gva_to_gpa; 3765 context->gva_to_gpa = paging32_gva_to_gpa;
3769 } 3766 }
3770 3767
3771 update_permission_bitmask(vcpu, context); 3768 update_permission_bitmask(vcpu, context, false);
3772 update_last_pte_bitmap(vcpu, context); 3769 update_last_pte_bitmap(vcpu, context);
3773 3770
3774 return 0; 3771 return 0;
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3800} 3797}
3801EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3798EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3802 3799
3800int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
3801 bool execonly)
3802{
3803 ASSERT(vcpu);
3804 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3805
3806 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3807
3808 context->nx = true;
3809 context->new_cr3 = paging_new_cr3;
3810 context->page_fault = ept_page_fault;
3811 context->gva_to_gpa = ept_gva_to_gpa;
3812 context->sync_page = ept_sync_page;
3813 context->invlpg = ept_invlpg;
3814 context->update_pte = ept_update_pte;
3815 context->free = paging_free;
3816 context->root_level = context->shadow_root_level;
3817 context->root_hpa = INVALID_PAGE;
3818 context->direct_map = false;
3819
3820 update_permission_bitmask(vcpu, context, true);
3821 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
3822
3823 return 0;
3824}
3825EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3826
3803static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3827static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3804{ 3828{
3805 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3829 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3847 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3871 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3848 } 3872 }
3849 3873
3850 update_permission_bitmask(vcpu, g_context); 3874 update_permission_bitmask(vcpu, g_context, false);
3851 update_last_pte_bitmap(vcpu, g_context); 3875 update_last_pte_bitmap(vcpu, g_context);
3852 3876
3853 return 0; 3877 return 0;
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
3923 return true; 3947 return true;
3924 if ((old ^ new) & PT64_BASE_ADDR_MASK) 3948 if ((old ^ new) & PT64_BASE_ADDR_MASK)
3925 return true; 3949 return true;
3926 old ^= PT64_NX_MASK; 3950 old ^= shadow_nx_mask;
3927 new ^= PT64_NX_MASK; 3951 new ^= shadow_nx_mask;
3928 return (old & ~new & PT64_PERM_MASK) != 0; 3952 return (old & ~new & PT64_PERM_MASK) != 0;
3929} 3953}
3930 3954
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
4182 switch (er) { 4206 switch (er) {
4183 case EMULATE_DONE: 4207 case EMULATE_DONE:
4184 return 1; 4208 return 1;
4185 case EMULATE_DO_MMIO: 4209 case EMULATE_USER_EXIT:
4186 ++vcpu->stat.mmio_exits; 4210 ++vcpu->stat.mmio_exits;
4187 /* fall through */ 4211 /* fall through */
4188 case EMULATE_FAIL: 4212 case EMULATE_FAIL:
@@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
4390 /* 4414 /*
4391 * The very rare case: if the generation-number is round, 4415 * The very rare case: if the generation-number is round,
4392 * zap all shadow pages. 4416 * zap all shadow pages.
4393 *
4394 * The max value is MMIO_MAX_GEN - 1 since it is not called
4395 * when mark memslot invalid.
4396 */ 4417 */
4397 if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { 4418 if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
4398 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); 4419 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
4399 kvm_mmu_invalidate_zap_all_pages(kvm); 4420 kvm_mmu_invalidate_zap_all_pages(kvm);
4400 } 4421 }
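
reset_rsvds_bits_mask_ept precomputes bad_mt_xwr, one bit per possible (memory type, XWR) combination in the low six PTE bits, so the walker's reserved-bit check becomes a single bit test. A stand-alone program mirroring that construction:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Bits 0-2 of an EPT PTE are the XWR permissions, bits 3-5 the memory type. */
static uint64_t build_bad_mt_xwr(bool execonly)
{
        uint64_t map = 0;
        int pte;

        for (pte = 0; pte < 64; pte++) {
                int rwx = pte & 7;
                int mt  = pte >> 3;

                if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||      /* reserved types */
                    rwx == 0x2 || rwx == 0x6 ||                 /* W or WX without R */
                    (rwx == 0x4 && !execonly))                  /* X-only unsupported */
                        map |= 1ull << pte;
        }
        return map;
}

static bool pte_low_bits_bad(uint64_t map, uint64_t pte)
{
        return (map >> (pte & 0x3f)) & 1;
}

int main(void)
{
        uint64_t map = build_bad_mt_xwr(false);

        printf("WB read-write (mt=6, rwx=3): %d\n",
               pte_low_bits_bad(map, (6 << 3) | 3));    /* 0: legal */
        printf("write-only (rwx=2):          %d\n",
               pte_low_bits_bad(map, (6 << 3) | 2));    /* 1: bad */
        return 0;
}
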
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c573aba7..77e044a0f5f7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
71 71
72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
74int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
75 bool execonly);
74 76
75static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 77static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
76{ 78{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699d48a8..043330159179 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
23 * so the code in this file is compiled twice, once per pte size. 23 * so the code in this file is compiled twice, once per pte size.
24 */ 24 */
25 25
26/*
27 * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
28 * uses for EPT without A/D paging type.
29 */
30extern u64 __pure __using_nonexistent_pte_bit(void)
31 __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
32
26#if PTTYPE == 64 33#if PTTYPE == 64
27 #define pt_element_t u64 34 #define pt_element_t u64
28 #define guest_walker guest_walker64 35 #define guest_walker guest_walker64
@@ -32,6 +39,10 @@
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 39 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 40 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 41 #define PT_LEVEL_BITS PT64_LEVEL_BITS
42 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
43 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
44 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
45 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
35 #ifdef CONFIG_X86_64 46 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4 47 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg 48 #define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 60 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
50 #define PT_LEVEL_BITS PT32_LEVEL_BITS 61 #define PT_LEVEL_BITS PT32_LEVEL_BITS
51 #define PT_MAX_FULL_LEVELS 2 62 #define PT_MAX_FULL_LEVELS 2
63 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
64 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
65 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
66 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
52 #define CMPXCHG cmpxchg 67 #define CMPXCHG cmpxchg
68#elif PTTYPE == PTTYPE_EPT
69 #define pt_element_t u64
70 #define guest_walker guest_walkerEPT
71 #define FNAME(name) ept_##name
72 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
73 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
74 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
75 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
76 #define PT_LEVEL_BITS PT64_LEVEL_BITS
77 #define PT_GUEST_ACCESSED_MASK 0
78 #define PT_GUEST_DIRTY_MASK 0
79 #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
80 #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
81 #define CMPXCHG cmpxchg64
82 #define PT_MAX_FULL_LEVELS 4
53#else 83#else
54 #error Invalid PTTYPE value 84 #error Invalid PTTYPE value
55#endif 85#endif
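
The PT_GUEST_*_SHIFT definitions for EPT expand to a call marked with __compiletime_error, so the build breaks only if such a shift survives constant folding. A stand-alone illustration of the underlying GCC error attribute (compile with -O2, since the trick depends on dead-code elimination):

/* The build fails only if a call to this function remains after optimization. */
extern int __attribute__((error("shift used although A/D bits are absent")))
no_ad_bits_here(void);

#define TOY_DIRTY_MASK  0               /* pretend A/D tracking is unsupported */
#define TOY_DIRTY_SHIFT no_ad_bits_here()

static inline unsigned long fold_dirty(unsigned long pte)
{
        /* Constant-folded away because TOY_DIRTY_MASK is 0, so the call to
         * no_ad_bits_here() never reaches code generation. */
        if (!TOY_DIRTY_MASK)
                return 0;
        return pte >> TOY_DIRTY_SHIFT;
}

int main(void)
{
        return (int)fold_dirty(0x60);
}
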
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
80 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 110 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
81} 111}
82 112
113static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
114{
115 unsigned mask;
116
117 /* dirty bit is not supported, so no need to track it */
118 if (!PT_GUEST_DIRTY_MASK)
119 return;
120
121 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
122
123 mask = (unsigned)~ACC_WRITE_MASK;
124 /* Allow write access to dirty gptes */
125 mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
126 PT_WRITABLE_MASK;
127 *access &= mask;
128}
129
130static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
131{
132 int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
133
134 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
135 ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
136}
137
138static inline int FNAME(is_present_gpte)(unsigned long pte)
139{
140#if PTTYPE != PTTYPE_EPT
141 return is_present_gpte(pte);
142#else
143 return pte & 7;
144#endif
145}
146
83static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 147static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
84 pt_element_t __user *ptep_user, unsigned index, 148 pt_element_t __user *ptep_user, unsigned index,
85 pt_element_t orig_pte, pt_element_t new_pte) 149 pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
103 return (ret != orig_pte); 167 return (ret != orig_pte);
104} 168}
105 169
170static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
171 struct kvm_mmu_page *sp, u64 *spte,
172 u64 gpte)
173{
174 if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
175 goto no_present;
176
177 if (!FNAME(is_present_gpte)(gpte))
178 goto no_present;
179
180 /* if accessed bit is not supported prefetch non accessed gpte */
181 if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
182 goto no_present;
183
184 return false;
185
186no_present:
187 drop_spte(vcpu->kvm, spte);
188 return true;
189}
190
191static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
192{
193 unsigned access;
194#if PTTYPE == PTTYPE_EPT
195 access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
196 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
197 ACC_USER_MASK;
198#else
199 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
200 access &= ~(gpte >> PT64_NX_SHIFT);
201#endif
202
203 return access;
204}
205
106static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, 206static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
107 struct kvm_mmu *mmu, 207 struct kvm_mmu *mmu,
108 struct guest_walker *walker, 208 struct guest_walker *walker,
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
114 gfn_t table_gfn; 214 gfn_t table_gfn;
115 int ret; 215 int ret;
116 216
217 /* dirty/accessed bits are not supported, so no need to update them */
218 if (!PT_GUEST_DIRTY_MASK)
219 return 0;
220
117 for (level = walker->max_level; level >= walker->level; --level) { 221 for (level = walker->max_level; level >= walker->level; --level) {
118 pte = orig_pte = walker->ptes[level - 1]; 222 pte = orig_pte = walker->ptes[level - 1];
119 table_gfn = walker->table_gfn[level - 1]; 223 table_gfn = walker->table_gfn[level - 1];
120 ptep_user = walker->ptep_user[level - 1]; 224 ptep_user = walker->ptep_user[level - 1];
121 index = offset_in_page(ptep_user) / sizeof(pt_element_t); 225 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
122 if (!(pte & PT_ACCESSED_MASK)) { 226 if (!(pte & PT_GUEST_ACCESSED_MASK)) {
123 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); 227 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
124 pte |= PT_ACCESSED_MASK; 228 pte |= PT_GUEST_ACCESSED_MASK;
125 } 229 }
126 if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { 230 if (level == walker->level && write_fault &&
231 !(pte & PT_GUEST_DIRTY_MASK)) {
127 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
128 pte |= PT_DIRTY_MASK; 233 pte |= PT_GUEST_DIRTY_MASK;
129 } 234 }
130 if (pte == orig_pte) 235 if (pte == orig_pte)
131 continue; 236 continue;
@@ -170,7 +275,7 @@ retry_walk:
170 if (walker->level == PT32E_ROOT_LEVEL) { 275 if (walker->level == PT32E_ROOT_LEVEL) {
171 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); 276 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
172 trace_kvm_mmu_paging_element(pte, walker->level); 277 trace_kvm_mmu_paging_element(pte, walker->level);
173 if (!is_present_gpte(pte)) 278 if (!FNAME(is_present_gpte)(pte))
174 goto error; 279 goto error;
175 --walker->level; 280 --walker->level;
176 } 281 }
@@ -179,7 +284,7 @@ retry_walk:
179 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 284 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
180 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); 285 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
181 286
182 accessed_dirty = PT_ACCESSED_MASK; 287 accessed_dirty = PT_GUEST_ACCESSED_MASK;
183 pt_access = pte_access = ACC_ALL; 288 pt_access = pte_access = ACC_ALL;
184 ++walker->level; 289 ++walker->level;
185 290
@@ -215,17 +320,17 @@ retry_walk:
215 320
216 trace_kvm_mmu_paging_element(pte, walker->level); 321 trace_kvm_mmu_paging_element(pte, walker->level);
217 322
218 if (unlikely(!is_present_gpte(pte))) 323 if (unlikely(!FNAME(is_present_gpte)(pte)))
219 goto error; 324 goto error;
220 325
221 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, 326 if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
222 walker->level))) { 327 walker->level))) {
223 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; 328 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
224 goto error; 329 goto error;
225 } 330 }
226 331
227 accessed_dirty &= pte; 332 accessed_dirty &= pte;
228 pte_access = pt_access & gpte_access(vcpu, pte); 333 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
229 334
230 walker->ptes[walker->level - 1] = pte; 335 walker->ptes[walker->level - 1] = pte;
231 } while (!is_last_gpte(mmu, walker->level, pte)); 336 } while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +353,15 @@ retry_walk:
248 walker->gfn = real_gpa >> PAGE_SHIFT; 353 walker->gfn = real_gpa >> PAGE_SHIFT;
249 354
250 if (!write_fault) 355 if (!write_fault)
251 protect_clean_gpte(&pte_access, pte); 356 FNAME(protect_clean_gpte)(&pte_access, pte);
252 else 357 else
253 /* 358 /*
254 * On a write fault, fold the dirty bit into accessed_dirty by 359 * On a write fault, fold the dirty bit into accessed_dirty.
 255 * shifting it one place right. 360 * For modes without A/D bit support, accessed_dirty will
 361 * always be clear.
256 */ 362 */
257 accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); 363 accessed_dirty &= pte >>
364 (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
258 365
259 if (unlikely(!accessed_dirty)) { 366 if (unlikely(!accessed_dirty)) {
260 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); 367 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
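The fold above is easier to see with concrete bit positions. A minimal, self-contained sketch (not kernel code; it assumes the classic x86 layout with the accessed bit at position 5 and the dirty bit at position 6, matching the non-EPT PT_GUEST_*_SHIFT values):

#include <assert.h>
#include <stdint.h>

#define ACCESSED_SHIFT 5	/* assumed: PT_GUEST_ACCESSED_SHIFT for legacy PTEs */
#define DIRTY_SHIFT    6	/* assumed: PT_GUEST_DIRTY_SHIFT for legacy PTEs */

int main(void)
{
	uint64_t accessed_dirty = 1ull << ACCESSED_SHIFT;	/* walker starts with the A mask */
	uint64_t dirty_pte = (1ull << DIRTY_SHIFT) | (1ull << ACCESSED_SHIFT);
	uint64_t clean_pte = 1ull << ACCESSED_SHIFT;

	/* Shifting right by DIRTY_SHIFT - ACCESSED_SHIFT moves D into the A position,
	 * so the AND keeps accessed_dirty non-zero only when the leaf was already dirty. */
	assert(accessed_dirty & (dirty_pte >> (DIRTY_SHIFT - ACCESSED_SHIFT)));
	assert(!(accessed_dirty & (clean_pte >> (DIRTY_SHIFT - ACCESSED_SHIFT))));
	return 0;
}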
@@ -279,6 +386,25 @@ error:
279 walker->fault.vector = PF_VECTOR; 386 walker->fault.vector = PF_VECTOR;
280 walker->fault.error_code_valid = true; 387 walker->fault.error_code_valid = true;
281 walker->fault.error_code = errcode; 388 walker->fault.error_code = errcode;
389
390#if PTTYPE == PTTYPE_EPT
391 /*
 392 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
 393 * misconfiguration needs to be injected. The detection is
 394 * done by is_rsvd_bits_set() above.
395 *
396 * We set up the value of exit_qualification to inject:
397 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
398 * [5:3] - Calculated by the page walk of the guest EPT page tables
 399 * [8:7] - Derived from [8:7] of real exit_qualification
400 *
401 * The other bits are set to 0.
402 */
403 if (!(errcode & PFERR_RSVD_MASK)) {
404 vcpu->arch.exit_qualification &= 0x187;
405 vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
406 }
407#endif
282 walker->fault.address = addr; 408 walker->fault.address = addr;
283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 409 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
284 410
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
293 access); 419 access);
294} 420}
295 421
422#if PTTYPE != PTTYPE_EPT
296static int FNAME(walk_addr_nested)(struct guest_walker *walker, 423static int FNAME(walk_addr_nested)(struct guest_walker *walker,
297 struct kvm_vcpu *vcpu, gva_t addr, 424 struct kvm_vcpu *vcpu, gva_t addr,
298 u32 access) 425 u32 access)
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
300 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, 427 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
301 addr, access); 428 addr, access);
302} 429}
430#endif
303 431
304static bool 432static bool
305FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 433FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
309 gfn_t gfn; 437 gfn_t gfn;
310 pfn_t pfn; 438 pfn_t pfn;
311 439
312 if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) 440 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
313 return false; 441 return false;
314 442
315 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 443 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
316 444
317 gfn = gpte_to_gfn(gpte); 445 gfn = gpte_to_gfn(gpte);
318 pte_access = sp->role.access & gpte_access(vcpu, gpte); 446 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
319 protect_clean_gpte(&pte_access, gpte); 447 FNAME(protect_clean_gpte)(&pte_access, gpte);
320 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 448 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
321 no_dirty_log && (pte_access & ACC_WRITE_MASK)); 449 no_dirty_log && (pte_access & ACC_WRITE_MASK));
322 if (is_error_pfn(pfn)) 450 if (is_error_pfn(pfn))
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
446 goto out_gpte_changed; 574 goto out_gpte_changed;
447 575
448 if (sp) 576 if (sp)
449 link_shadow_page(it.sptep, sp); 577 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
450 } 578 }
451 579
452 for (; 580 for (;
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
466 594
467 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, 595 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
468 true, direct_access, it.sptep); 596 true, direct_access, it.sptep);
469 link_shadow_page(it.sptep, sp); 597 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
470 } 598 }
471 599
472 clear_sp_write_flooding_count(it.sptep); 600 clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
727 return gpa; 855 return gpa;
728} 856}
729 857
858#if PTTYPE != PTTYPE_EPT
730static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 859static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
731 u32 access, 860 u32 access,
732 struct x86_exception *exception) 861 struct x86_exception *exception)
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
745 874
746 return gpa; 875 return gpa;
747} 876}
877#endif
748 878
749/* 879/*
750 * Using the cached information from sp->gfns is safe because: 880 * Using the cached information from sp->gfns is safe because:
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
785 sizeof(pt_element_t))) 915 sizeof(pt_element_t)))
786 return -EINVAL; 916 return -EINVAL;
787 917
788 if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { 918 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
789 vcpu->kvm->tlbs_dirty++; 919 vcpu->kvm->tlbs_dirty++;
790 continue; 920 continue;
791 } 921 }
792 922
793 gfn = gpte_to_gfn(gpte); 923 gfn = gpte_to_gfn(gpte);
794 pte_access = sp->role.access; 924 pte_access = sp->role.access;
795 pte_access &= gpte_access(vcpu, gpte); 925 pte_access &= FNAME(gpte_access)(vcpu, gpte);
796 protect_clean_gpte(&pte_access, gpte); 926 FNAME(protect_clean_gpte)(&pte_access, gpte);
797 927
798 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, 928 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
799 &nr_present)) 929 &nr_present))
@@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
830#undef gpte_to_gfn 960#undef gpte_to_gfn
831#undef gpte_to_gfn_lvl 961#undef gpte_to_gfn_lvl
832#undef CMPXCHG 962#undef CMPXCHG
963#undef PT_GUEST_ACCESSED_MASK
964#undef PT_GUEST_DIRTY_MASK
965#undef PT_GUEST_DIRTY_SHIFT
966#undef PT_GUEST_ACCESSED_SHIFT
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index c53e797e7369..5c4f63151b4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)
160 160
 161static void reprogram_counter(struct kvm_pmc *pmc, u32 type, 161
 162 unsigned config, bool exclude_user, bool exclude_kernel, 162
163 bool intr) 163 bool intr, bool in_tx, bool in_tx_cp)
164{ 164{
165 struct perf_event *event; 165 struct perf_event *event;
166 struct perf_event_attr attr = { 166 struct perf_event_attr attr = {
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
173 .exclude_kernel = exclude_kernel, 173 .exclude_kernel = exclude_kernel,
174 .config = config, 174 .config = config,
175 }; 175 };
176 if (in_tx)
177 attr.config |= HSW_IN_TX;
178 if (in_tx_cp)
179 attr.config |= HSW_IN_TX_CHECKPOINTED;
176 180
177 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); 181 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
178 182
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
226 230
227 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | 231 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
228 ARCH_PERFMON_EVENTSEL_INV | 232 ARCH_PERFMON_EVENTSEL_INV |
229 ARCH_PERFMON_EVENTSEL_CMASK))) { 233 ARCH_PERFMON_EVENTSEL_CMASK |
234 HSW_IN_TX |
235 HSW_IN_TX_CHECKPOINTED))) {
230 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, 236 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
231 unit_mask); 237 unit_mask);
232 if (config != PERF_COUNT_HW_MAX) 238 if (config != PERF_COUNT_HW_MAX)
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
239 reprogram_counter(pmc, type, config, 245 reprogram_counter(pmc, type, config,
240 !(eventsel & ARCH_PERFMON_EVENTSEL_USR), 246 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
241 !(eventsel & ARCH_PERFMON_EVENTSEL_OS), 247 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
242 eventsel & ARCH_PERFMON_EVENTSEL_INT); 248 eventsel & ARCH_PERFMON_EVENTSEL_INT,
249 (eventsel & HSW_IN_TX),
250 (eventsel & HSW_IN_TX_CHECKPOINTED));
243} 251}
244 252
245static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) 253static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
256 arch_events[fixed_pmc_events[idx]].event_type, 264 arch_events[fixed_pmc_events[idx]].event_type,
257 !(en & 0x2), /* exclude user */ 265 !(en & 0x2), /* exclude user */
258 !(en & 0x1), /* exclude kernel */ 266 !(en & 0x1), /* exclude kernel */
259 pmi); 267 pmi, false, false);
260} 268}
261 269
262static inline u8 fixed_en_pmi(u64 ctrl, int idx) 270static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
408 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { 416 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
409 if (data == pmc->eventsel) 417 if (data == pmc->eventsel)
410 return 0; 418 return 0;
411 if (!(data & 0xffffffff00200000ull)) { 419 if (!(data & pmu->reserved_bits)) {
412 reprogram_gp_counter(pmc, data); 420 reprogram_gp_counter(pmc, data);
413 return 0; 421 return 0;
414 } 422 }
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
450 pmu->counter_bitmask[KVM_PMC_GP] = 0; 458 pmu->counter_bitmask[KVM_PMC_GP] = 0;
451 pmu->counter_bitmask[KVM_PMC_FIXED] = 0; 459 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
452 pmu->version = 0; 460 pmu->version = 0;
461 pmu->reserved_bits = 0xffffffff00200000ull;
453 462
454 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); 463 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
455 if (!entry) 464 if (!entry)
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
478 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | 487 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
479 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); 488 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
480 pmu->global_ctrl_mask = ~pmu->global_ctrl; 489 pmu->global_ctrl_mask = ~pmu->global_ctrl;
490
491 entry = kvm_find_cpuid_entry(vcpu, 7, 0);
492 if (entry &&
493 (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
494 (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
495 pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
481} 496}
482 497
483void kvm_pmu_init(struct kvm_vcpu *vcpu) 498void kvm_pmu_init(struct kvm_vcpu *vcpu)
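The `^=` in kvm_pmu_cpuid_update() above acts as a clear operation: HSW_IN_TX (bit 32) and HSW_IN_TX_CHECKPOINTED (bit 33) both start out set in the 0xffffffff00200000ull mask, so XOR-ing them out lets a TSX-capable guest program them in an event selector. A standalone illustration (bit positions assumed to match perf_event.h):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t hsw_in_tx    = 1ull << 32;	/* assumed: HSW_IN_TX */
	const uint64_t hsw_in_tx_cp = 1ull << 33;	/* assumed: HSW_IN_TX_CHECKPOINTED */
	uint64_t reserved_bits = 0xffffffff00200000ull;

	reserved_bits ^= hsw_in_tx | hsw_in_tx_cp;	/* both bits were set, so this clears them */

	/* An eventsel carrying HSW_IN_TX no longer trips the reserved-bits check. */
	assert(!(0x100000051ull & reserved_bits));
	return 0;
}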
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 064d0be67ecc..1f1da43ff2a2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
373 * we must keep them pinned while L2 runs. 373 * we must keep them pinned while L2 runs.
374 */ 374 */
375 struct page *apic_access_page; 375 struct page *apic_access_page;
376 u64 msr_ia32_feature_control;
376}; 377};
377 378
378#define POSTED_INTR_ON 0 379#define POSTED_INTR_ON 0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
711 kvm_release_page_clean(page); 712 kvm_release_page_clean(page);
712} 713}
713 714
715static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
714static u64 construct_eptp(unsigned long root_hpa); 716static u64 construct_eptp(unsigned long root_hpa);
715static void kvm_cpu_vmxon(u64 addr); 717static void kvm_cpu_vmxon(u64 addr);
716static void kvm_cpu_vmxoff(void); 718static void kvm_cpu_vmxoff(void);
717static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
718static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 719static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
719static void vmx_set_segment(struct kvm_vcpu *vcpu, 720static void vmx_set_segment(struct kvm_vcpu *vcpu,
720 struct kvm_segment *var, int seg); 721 struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1039 (vmcs12->secondary_vm_exec_control & bit); 1040 (vmcs12->secondary_vm_exec_control & bit);
1040} 1041}
1041 1042
1042static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, 1043static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1043 struct kvm_vcpu *vcpu)
1044{ 1044{
1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; 1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1046} 1046}
1047 1047
1048static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1049{
1050 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1051}
1052
1048static inline bool is_exception(u32 intr_info) 1053static inline bool is_exception(u32 intr_info)
1049{ 1054{
1050 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1055 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2155static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2160static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2156static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2161static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2157static u32 nested_vmx_misc_low, nested_vmx_misc_high; 2162static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2163static u32 nested_vmx_ept_caps;
2158static __init void nested_vmx_setup_ctls_msrs(void) 2164static __init void nested_vmx_setup_ctls_msrs(void)
2159{ 2165{
2160 /* 2166 /*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2190 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and 2196 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
2191 * 17 must be 1. 2197 * 17 must be 1.
2192 */ 2198 */
2199 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2200 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
2193 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2201 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2194 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2202 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2203 nested_vmx_exit_ctls_high &=
2195#ifdef CONFIG_X86_64 2204#ifdef CONFIG_X86_64
2196 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; 2205 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2197#else
2198 nested_vmx_exit_ctls_high = 0;
2199#endif 2206#endif
2200 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2207 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2208 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2209 VM_EXIT_LOAD_IA32_EFER);
2201 2210
2202 /* entry controls */ 2211 /* entry controls */
2203 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2212 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2205 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ 2214 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
2206 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2215 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2207 nested_vmx_entry_ctls_high &= 2216 nested_vmx_entry_ctls_high &=
2208 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; 2217#ifdef CONFIG_X86_64
2209 nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2218 VM_ENTRY_IA32E_MODE |
2219#endif
2220 VM_ENTRY_LOAD_IA32_PAT;
2221 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
2222 VM_ENTRY_LOAD_IA32_EFER);
2210 2223
2211 /* cpu-based controls */ 2224 /* cpu-based controls */
2212 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2225 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2254 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2242 SECONDARY_EXEC_WBINVD_EXITING; 2255 SECONDARY_EXEC_WBINVD_EXITING;
2243 2256
2257 if (enable_ept) {
2258 /* nested EPT: emulate EPT also to L1 */
2259 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
2260 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2261 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2262 nested_vmx_ept_caps &= vmx_capability.ept;
2263 /*
2264 * Since invept is completely emulated we support both global
 2265 * and context invalidation independent of what the host cpu
 2266 * supports.
2267 */
2268 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2269 VMX_EPT_EXTENT_CONTEXT_BIT;
2270 } else
2271 nested_vmx_ept_caps = 0;
2272
2244 /* miscellaneous data */ 2273 /* miscellaneous data */
2245 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2274 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2246 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | 2275 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2282 2311
2283 switch (msr_index) { 2312 switch (msr_index) {
2284 case MSR_IA32_FEATURE_CONTROL: 2313 case MSR_IA32_FEATURE_CONTROL:
2285 *pdata = 0; 2314 if (nested_vmx_allowed(vcpu)) {
2286 break; 2315 *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2316 break;
2317 }
2318 return 0;
2287 case MSR_IA32_VMX_BASIC: 2319 case MSR_IA32_VMX_BASIC:
2288 /* 2320 /*
2289 * This MSR reports some information about VMX support. We 2321 * This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2346 nested_vmx_secondary_ctls_high); 2378 nested_vmx_secondary_ctls_high);
2347 break; 2379 break;
2348 case MSR_IA32_VMX_EPT_VPID_CAP: 2380 case MSR_IA32_VMX_EPT_VPID_CAP:
2349 /* Currently, no nested ept or nested vpid */ 2381 /* Currently, no nested vpid support */
2350 *pdata = 0; 2382 *pdata = nested_vmx_ept_caps;
2351 break; 2383 break;
2352 default: 2384 default:
2353 return 0; 2385 return 0;
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2356 return 1; 2388 return 1;
2357} 2389}
2358 2390
2359static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2391static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2360{ 2392{
2393 u32 msr_index = msr_info->index;
2394 u64 data = msr_info->data;
2395 bool host_initialized = msr_info->host_initiated;
2396
2361 if (!nested_vmx_allowed(vcpu)) 2397 if (!nested_vmx_allowed(vcpu))
2362 return 0; 2398 return 0;
2363 2399
2364 if (msr_index == MSR_IA32_FEATURE_CONTROL) 2400 if (msr_index == MSR_IA32_FEATURE_CONTROL) {
2365 /* TODO: the right thing. */ 2401 if (!host_initialized &&
2402 to_vmx(vcpu)->nested.msr_ia32_feature_control
2403 & FEATURE_CONTROL_LOCKED)
2404 return 0;
2405 to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
2366 return 1; 2406 return 1;
2407 }
2408
2367 /* 2409 /*
2368 * No need to treat VMX capability MSRs specially: If we don't handle 2410 * No need to treat VMX capability MSRs specially: If we don't handle
2369 * them, handle_wrmsr will #GP(0), which is correct (they are readonly) 2411 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2494 return 1; 2536 return 1;
2495 /* Otherwise falls through */ 2537 /* Otherwise falls through */
2496 default: 2538 default:
2497 if (vmx_set_vmx_msr(vcpu, msr_index, data)) 2539 if (vmx_set_vmx_msr(vcpu, msr_info))
2498 break; 2540 break;
2499 msr = find_msr_entry(vmx, msr_index); 2541 msr = find_msr_entry(vmx, msr_index);
2500 if (msr) { 2542 if (msr) {
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
5302 5344
5303 /* It is a write fault? */ 5345 /* It is a write fault? */
5304 error_code = exit_qualification & (1U << 1); 5346 error_code = exit_qualification & (1U << 1);
5347 /* It is a fetch fault? */
5348 error_code |= (exit_qualification & (1U << 2)) << 2;
5305 /* ept page table is present? */ 5349 /* ept page table is present? */
5306 error_code |= (exit_qualification >> 3) & 0x1; 5350 error_code |= (exit_qualification >> 3) & 0x1;
5307 5351
5352 vcpu->arch.exit_qualification = exit_qualification;
5353
5308 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5354 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5309} 5355}
5310 5356
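For reference, the bit shuffling in handle_ept_violation() maps EPT exit-qualification bits onto page-fault error-code bits: bit 1 (write) lands on error-code bit 1, bit 2 (fetch) is shifted up to bit 4, and bit 3 (entry readable) becomes the present bit. A small check with made-up values:

#include <assert.h>

int main(void)
{
	unsigned long exit_qualification = 0x184;	/* fetch access, bits 7-8 set, entry not present */
	unsigned long error_code;

	error_code  = exit_qualification & (1U << 1);		/* write   -> bit 1 */
	error_code |= (exit_qualification & (1U << 2)) << 2;	/* fetch   -> bit 4 */
	error_code |= (exit_qualification >> 3) & 0x1;		/* present -> bit 0 */

	assert(error_code == (1U << 4));	/* an instruction fetch from a non-present entry */
	return 0;
}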
@@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5438 5484
5439 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 5485 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5440 5486
5441 if (err == EMULATE_DO_MMIO) { 5487 if (err == EMULATE_USER_EXIT) {
5488 ++vcpu->stat.mmio_exits;
5442 ret = 0; 5489 ret = 0;
5443 goto out; 5490 goto out;
5444 } 5491 }
@@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5567 free_loaded_vmcs(&vmx->vmcs01); 5614 free_loaded_vmcs(&vmx->vmcs01);
5568} 5615}
5569 5616
5617/*
5618 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5619 * set the success or error code of an emulated VMX instruction, as specified
5620 * by Vol 2B, VMX Instruction Reference, "Conventions".
5621 */
5622static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5623{
5624 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5625 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5626 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5627}
5628
5629static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5630{
5631 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5632 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5633 X86_EFLAGS_SF | X86_EFLAGS_OF))
5634 | X86_EFLAGS_CF);
5635}
5636
5570static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 5637static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5571 u32 vm_instruction_error); 5638 u32 vm_instruction_error)
5639{
5640 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5641 /*
5642 * failValid writes the error number to the current VMCS, which
 5643 * can't be done if there isn't a current VMCS.
5644 */
5645 nested_vmx_failInvalid(vcpu);
5646 return;
5647 }
5648 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5649 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5650 X86_EFLAGS_SF | X86_EFLAGS_OF))
5651 | X86_EFLAGS_ZF);
5652 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5653 /*
5654 * We don't need to force a shadow sync because
5655 * VM_INSTRUCTION_ERROR is not shadowed
5656 */
5657}
5572 5658
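The helpers moved above implement the architectural convention an L1 guest observes in RFLAGS: CF=1 means VMfailInvalid, ZF=1 means VMfailValid (error number in the current VMCS), both clear means VMsucceed. A hypothetical guest-side check, shown only to illustrate that convention (the wrapper name is an assumption, and it is only meaningful at CPL0 with CR4.VMXE set):

#include <stdint.h>

/* Returns 0 on VMsucceed, -1 on VMfailInvalid, -2 on VMfailValid. */
static inline int vmxon_checked(uint64_t vmxon_region_pa)
{
	uint8_t cf, zf;

	asm volatile("vmxon %[pa]\n\t"
		     "setc %[cf]\n\t"
		     "setz %[zf]"
		     : [cf] "=q" (cf), [zf] "=q" (zf)
		     : [pa] "m" (vmxon_region_pa)
		     : "cc", "memory");
	if (cf)
		return -1;	/* VMfailInvalid */
	if (zf)
		return -2;	/* VMfailValid: read VM_INSTRUCTION_ERROR from the VMCS */
	return 0;		/* VMsucceed */
}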
5573/* 5659/*
5574 * Emulate the VMXON instruction. 5660 * Emulate the VMXON instruction.
@@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5583 struct kvm_segment cs; 5669 struct kvm_segment cs;
5584 struct vcpu_vmx *vmx = to_vmx(vcpu); 5670 struct vcpu_vmx *vmx = to_vmx(vcpu);
5585 struct vmcs *shadow_vmcs; 5671 struct vmcs *shadow_vmcs;
5672 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
5673 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
5586 5674
5587 /* The Intel VMX Instruction Reference lists a bunch of bits that 5675 /* The Intel VMX Instruction Reference lists a bunch of bits that
5588 * are prerequisite to running VMXON, most notably cr4.VMXE must be 5676 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5611 skip_emulated_instruction(vcpu); 5699 skip_emulated_instruction(vcpu);
5612 return 1; 5700 return 1;
5613 } 5701 }
5702
5703 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5704 != VMXON_NEEDED_FEATURES) {
5705 kvm_inject_gp(vcpu, 0);
5706 return 1;
5707 }
5708
5614 if (enable_shadow_vmcs) { 5709 if (enable_shadow_vmcs) {
5615 shadow_vmcs = alloc_vmcs(); 5710 shadow_vmcs = alloc_vmcs();
5616 if (!shadow_vmcs) 5711 if (!shadow_vmcs)
@@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5628 vmx->nested.vmxon = true; 5723 vmx->nested.vmxon = true;
5629 5724
5630 skip_emulated_instruction(vcpu); 5725 skip_emulated_instruction(vcpu);
5726 nested_vmx_succeed(vcpu);
5631 return 1; 5727 return 1;
5632} 5728}
5633 5729
@@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
5712 return 1; 5808 return 1;
5713 free_nested(to_vmx(vcpu)); 5809 free_nested(to_vmx(vcpu));
5714 skip_emulated_instruction(vcpu); 5810 skip_emulated_instruction(vcpu);
5811 nested_vmx_succeed(vcpu);
5715 return 1; 5812 return 1;
5716} 5813}
5717 5814
@@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5768 return 0; 5865 return 0;
5769} 5866}
5770 5867
5771/*
5772 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5773 * set the success or error code of an emulated VMX instruction, as specified
5774 * by Vol 2B, VMX Instruction Reference, "Conventions".
5775 */
5776static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5777{
5778 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5779 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5780 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5781}
5782
5783static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5784{
5785 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5786 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5787 X86_EFLAGS_SF | X86_EFLAGS_OF))
5788 | X86_EFLAGS_CF);
5789}
5790
5791static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5792 u32 vm_instruction_error)
5793{
5794 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5795 /*
5796 * failValid writes the error number to the current VMCS, which
5797 * can't be done there isn't a current VMCS.
5798 */
5799 nested_vmx_failInvalid(vcpu);
5800 return;
5801 }
5802 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5803 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5804 X86_EFLAGS_SF | X86_EFLAGS_OF))
5805 | X86_EFLAGS_ZF);
5806 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5807 /*
5808 * We don't need to force a shadow sync because
5809 * VM_INSTRUCTION_ERROR is not shadowed
5810 */
5811}
5812
5813/* Emulate the VMCLEAR instruction */ 5868/* Emulate the VMCLEAR instruction */
5814static int handle_vmclear(struct kvm_vcpu *vcpu) 5869static int handle_vmclear(struct kvm_vcpu *vcpu)
5815{ 5870{
@@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
5972 unsigned long field; 6027 unsigned long field;
5973 u64 field_value; 6028 u64 field_value;
5974 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6029 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5975 unsigned long *fields = (unsigned long *)shadow_read_write_fields; 6030 const unsigned long *fields = shadow_read_write_fields;
5976 int num_fields = max_shadow_read_write_fields; 6031 const int num_fields = max_shadow_read_write_fields;
5977 6032
5978 vmcs_load(shadow_vmcs); 6033 vmcs_load(shadow_vmcs);
5979 6034
@@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
6002 6057
6003static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 6058static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6004{ 6059{
6005 unsigned long *fields[] = { 6060 const unsigned long *fields[] = {
6006 (unsigned long *)shadow_read_write_fields, 6061 shadow_read_write_fields,
6007 (unsigned long *)shadow_read_only_fields 6062 shadow_read_only_fields
6008 }; 6063 };
6009 int num_lists = ARRAY_SIZE(fields); 6064 const int max_fields[] = {
6010 int max_fields[] = {
6011 max_shadow_read_write_fields, 6065 max_shadow_read_write_fields,
6012 max_shadow_read_only_fields 6066 max_shadow_read_only_fields
6013 }; 6067 };
@@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6018 6072
6019 vmcs_load(shadow_vmcs); 6073 vmcs_load(shadow_vmcs);
6020 6074
6021 for (q = 0; q < num_lists; q++) { 6075 for (q = 0; q < ARRAY_SIZE(fields); q++) {
6022 for (i = 0; i < max_fields[q]; i++) { 6076 for (i = 0; i < max_fields[q]; i++) {
6023 field = fields[q][i]; 6077 field = fields[q][i];
6024 vmcs12_read_any(&vmx->vcpu, field, &field_value); 6078 vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
6248 return 1; 6302 return 1;
6249} 6303}
6250 6304
6305/* Emulate the INVEPT instruction */
6306static int handle_invept(struct kvm_vcpu *vcpu)
6307{
6308 u32 vmx_instruction_info, types;
6309 unsigned long type;
6310 gva_t gva;
6311 struct x86_exception e;
6312 struct {
6313 u64 eptp, gpa;
6314 } operand;
6315 u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
6316
6317 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
6318 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6319 kvm_queue_exception(vcpu, UD_VECTOR);
6320 return 1;
6321 }
6322
6323 if (!nested_vmx_check_permission(vcpu))
6324 return 1;
6325
6326 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
6327 kvm_queue_exception(vcpu, UD_VECTOR);
6328 return 1;
6329 }
6330
6331 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6332 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
6333
6334 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6335
6336 if (!(types & (1UL << type))) {
6337 nested_vmx_failValid(vcpu,
6338 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6339 return 1;
6340 }
6341
6342 /* According to the Intel VMX instruction reference, the memory
6343 * operand is read even if it isn't needed (e.g., for type==global)
6344 */
6345 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6346 vmx_instruction_info, &gva))
6347 return 1;
6348 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
6349 sizeof(operand), &e)) {
6350 kvm_inject_page_fault(vcpu, &e);
6351 return 1;
6352 }
6353
6354 switch (type) {
6355 case VMX_EPT_EXTENT_CONTEXT:
6356 if ((operand.eptp & eptp_mask) !=
6357 (nested_ept_get_cr3(vcpu) & eptp_mask))
6358 break;
6359 case VMX_EPT_EXTENT_GLOBAL:
6360 kvm_mmu_sync_roots(vcpu);
6361 kvm_mmu_flush_tlb(vcpu);
6362 nested_vmx_succeed(vcpu);
6363 break;
6364 default:
6365 BUG_ON(1);
6366 break;
6367 }
6368
6369 skip_emulated_instruction(vcpu);
6370 return 1;
6371}
6372
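handle_invept() above reads a 128-bit descriptor (the eptp followed by a reserved quadword) from guest memory. A hypothetical L1-side wrapper matching that layout looks roughly like this (illustrative only; it must run at CPL0 inside the guest, and the names are assumptions):

#include <stdint.h>

#define INVEPT_CONTEXT 1	/* single-context invalidation */
#define INVEPT_GLOBAL  2	/* global invalidation */

static inline void invept_sketch(unsigned long type, uint64_t eptp)
{
	struct {
		uint64_t eptp;
		uint64_t reserved;	/* must be zero */
	} operand = { eptp, 0 };

	/* AT&T operand order: memory descriptor first, invalidation type second. */
	asm volatile("invept %0, %1" : : "m" (operand), "r" (type) : "cc", "memory");
}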
6251/* 6373/*
6252 * The exit handlers return 1 if the exit was handled fully and guest execution 6374 * The exit handlers return 1 if the exit was handled fully and guest execution
6253 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6375 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6292 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6414 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6293 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, 6415 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
6294 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, 6416 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
6417 [EXIT_REASON_INVEPT] = handle_invept,
6295}; 6418};
6296 6419
6297static const int kvm_vmx_max_exit_handlers = 6420static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6518 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 6641 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
6519 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 6642 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
6520 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6643 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
6644 case EXIT_REASON_INVEPT:
6521 /* 6645 /*
6522 * VMX instructions trap unconditionally. This allows L1 to 6646 * VMX instructions trap unconditionally. This allows L1 to
6523 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6647 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6550 return nested_cpu_has2(vmcs12, 6674 return nested_cpu_has2(vmcs12,
6551 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 6675 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
6552 case EXIT_REASON_EPT_VIOLATION: 6676 case EXIT_REASON_EPT_VIOLATION:
6677 /*
6678 * L0 always deals with the EPT violation. If nested EPT is
6679 * used, and the nested mmu code discovers that the address is
6680 * missing in the guest EPT table (EPT12), the EPT violation
6681 * will be injected with nested_ept_inject_page_fault()
6682 */
6683 return 0;
6553 case EXIT_REASON_EPT_MISCONFIG: 6684 case EXIT_REASON_EPT_MISCONFIG:
6685 /*
6686 * L2 never uses directly L1's EPT, but rather L0's own EPT
6687 * table (shadow on EPT) or a merged EPT table that L0 built
6688 * (EPT on EPT). So any problems with the structure of the
6689 * table is L0's fault.
6690 */
6554 return 0; 6691 return 0;
6555 case EXIT_REASON_PREEMPTION_TIMER: 6692 case EXIT_REASON_PREEMPTION_TIMER:
6556 return vmcs12->pin_based_vm_exec_control & 6693 return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6638 6775
6639 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6776 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
6640 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6777 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
6641 get_vmcs12(vcpu), vcpu)))) { 6778 get_vmcs12(vcpu))))) {
6642 if (vmx_interrupt_allowed(vcpu)) { 6779 if (vmx_interrupt_allowed(vcpu)) {
6643 vmx->soft_vnmi_blocked = 0; 6780 vmx->soft_vnmi_blocked = 0;
6644 } else if (vmx->vnmi_blocked_time > 1000000000LL && 6781 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7326 entry->ecx |= bit(X86_FEATURE_VMX); 7463 entry->ecx |= bit(X86_FEATURE_VMX);
7327} 7464}
7328 7465
7466static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
7467 struct x86_exception *fault)
7468{
7469 struct vmcs12 *vmcs12;
7470 nested_vmx_vmexit(vcpu);
7471 vmcs12 = get_vmcs12(vcpu);
7472
7473 if (fault->error_code & PFERR_RSVD_MASK)
7474 vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
7475 else
7476 vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
7477 vmcs12->exit_qualification = vcpu->arch.exit_qualification;
7478 vmcs12->guest_physical_address = fault->address;
7479}
7480
7481/* Callbacks for nested_ept_init_mmu_context: */
7482
7483static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
7484{
7485 /* return the page table to be shadowed - in our case, EPT12 */
7486 return get_vmcs12(vcpu)->ept_pointer;
7487}
7488
7489static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
7490{
7491 int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
7492 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
7493
7494 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
7495 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
7496 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
7497
7498 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
7499
7500 return r;
7501}
7502
7503static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
7504{
7505 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
7506}
7507
7329/* 7508/*
7330 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 7509 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
7331 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 7510 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7388 vmcs12->guest_interruptibility_info); 7567 vmcs12->guest_interruptibility_info);
7389 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 7568 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
7390 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 7569 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
7391 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); 7570 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
7392 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 7571 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
7393 vmcs12->guest_pending_dbg_exceptions); 7572 vmcs12->guest_pending_dbg_exceptions);
7394 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 7573 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7508 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 7687 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
7509 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 7688 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
7510 7689
7511 /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ 7690 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
7512 vmcs_write32(VM_EXIT_CONTROLS, 7691 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
7513 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); 7692 * bits are further modified by vmx_set_efer() below.
7514 vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | 7693 */
7694 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
7695
7696 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
7697 * emulated by vmx_set_efer(), below.
7698 */
7699 vmcs_write32(VM_ENTRY_CONTROLS,
7700 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
7701 ~VM_ENTRY_IA32E_MODE) |
7515 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 7702 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
7516 7703
7517 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) 7704 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
7518 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 7705 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
7519 else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 7706 vcpu->arch.pat = vmcs12->guest_ia32_pat;
7707 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
7520 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 7708 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
7521 7709
7522 7710
@@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7538 vmx_flush_tlb(vcpu); 7726 vmx_flush_tlb(vcpu);
7539 } 7727 }
7540 7728
7729 if (nested_cpu_has_ept(vmcs12)) {
7730 kvm_mmu_unload(vcpu);
7731 nested_ept_init_mmu_context(vcpu);
7732 }
7733
7541 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 7734 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
7542 vcpu->arch.efer = vmcs12->guest_ia32_efer; 7735 vcpu->arch.efer = vmcs12->guest_ia32_efer;
7543 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7736 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7565 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 7758 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
7566 kvm_mmu_reset_context(vcpu); 7759 kvm_mmu_reset_context(vcpu);
7567 7760
7761 /*
 7762 * L1 may access L2's PDPTRs, so save them to construct vmcs12
7763 */
7764 if (enable_ept) {
7765 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
7766 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
7767 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
7768 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
7769 }
7770
7568 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 7771 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
7569 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 7772 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
7570} 7773}
@@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7887 vmcs12->guest_pending_dbg_exceptions = 8090 vmcs12->guest_pending_dbg_exceptions =
7888 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 8091 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
7889 8092
8093 /*
8094 * In some cases (usually, nested EPT), L2 is allowed to change its
8095 * own CR3 without exiting. If it has changed it, we must keep it.
8096 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
8097 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
8098 *
8099 * Additionally, restore L2's PDPTR to vmcs12.
8100 */
8101 if (enable_ept) {
8102 vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
8103 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
8104 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
8105 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
8106 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
8107 }
8108
7890 vmcs12->vm_entry_controls = 8109 vmcs12->vm_entry_controls =
7891 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 8110 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
7892 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); 8111 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7948static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 8167static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7949 struct vmcs12 *vmcs12) 8168 struct vmcs12 *vmcs12)
7950{ 8169{
8170 struct kvm_segment seg;
8171
7951 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 8172 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
7952 vcpu->arch.efer = vmcs12->host_ia32_efer; 8173 vcpu->arch.efer = vmcs12->host_ia32_efer;
7953 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 8174 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7982 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 8203 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
7983 kvm_set_cr4(vcpu, vmcs12->host_cr4); 8204 kvm_set_cr4(vcpu, vmcs12->host_cr4);
7984 8205
7985 /* shadow page tables on either EPT or shadow page tables */ 8206 if (nested_cpu_has_ept(vmcs12))
8207 nested_ept_uninit_mmu_context(vcpu);
8208
7986 kvm_set_cr3(vcpu, vmcs12->host_cr3); 8209 kvm_set_cr3(vcpu, vmcs12->host_cr3);
7987 kvm_mmu_reset_context(vcpu); 8210 kvm_mmu_reset_context(vcpu);
7988 8211
@@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
8001 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 8224 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
8002 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 8225 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
8003 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 8226 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
8004 vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); 8227
8005 vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); 8228 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
8006 vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
8007 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
8008 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
8009 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
8010 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
8011 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
8012 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
8013 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
8014
8015 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
8016 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 8229 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
8230 vcpu->arch.pat = vmcs12->host_ia32_pat;
8231 }
8017 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 8232 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
8018 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 8233 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
8019 vmcs12->host_ia32_perf_global_ctrl); 8234 vmcs12->host_ia32_perf_global_ctrl);
8020 8235
8236 /* Set L1 segment info according to Intel SDM
8237 27.5.2 Loading Host Segment and Descriptor-Table Registers */
8238 seg = (struct kvm_segment) {
8239 .base = 0,
8240 .limit = 0xFFFFFFFF,
8241 .selector = vmcs12->host_cs_selector,
8242 .type = 11,
8243 .present = 1,
8244 .s = 1,
8245 .g = 1
8246 };
8247 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
8248 seg.l = 1;
8249 else
8250 seg.db = 1;
8251 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
8252 seg = (struct kvm_segment) {
8253 .base = 0,
8254 .limit = 0xFFFFFFFF,
8255 .type = 3,
8256 .present = 1,
8257 .s = 1,
8258 .db = 1,
8259 .g = 1
8260 };
8261 seg.selector = vmcs12->host_ds_selector;
8262 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
8263 seg.selector = vmcs12->host_es_selector;
8264 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
8265 seg.selector = vmcs12->host_ss_selector;
8266 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
8267 seg.selector = vmcs12->host_fs_selector;
8268 seg.base = vmcs12->host_fs_base;
8269 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
8270 seg.selector = vmcs12->host_gs_selector;
8271 seg.base = vmcs12->host_gs_base;
8272 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
8273 seg = (struct kvm_segment) {
8274 .base = vmcs12->host_tr_base,
8275 .limit = 0x67,
8276 .selector = vmcs12->host_tr_selector,
8277 .type = 11,
8278 .present = 1
8279 };
8280 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
8281
8021 kvm_set_dr(vcpu, 7, 0x400); 8282 kvm_set_dr(vcpu, 7, 0x400);
8022 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 8283 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
8023} 8284}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d21bce505315..e5ca72a5cdb6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
682 */ 682 */
683 } 683 }
684 684
685 /*
686 * Does the new cr3 value map to physical memory? (Note, we
687 * catch an invalid cr3 even in real-mode, because it would
688 * cause trouble later on when we turn on paging anyway.)
689 *
690 * A real CPU would silently accept an invalid cr3 and would
691 * attempt to use it - with largely undefined (and often hard
692 * to debug) behavior on the guest side.
693 */
694 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
695 return 1;
696 vcpu->arch.cr3 = cr3; 685 vcpu->arch.cr3 = cr3;
697 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 686 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
698 vcpu->arch.mmu.new_cr3(vcpu); 687 vcpu->arch.mmu.new_cr3(vcpu);
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
850#ifdef CONFIG_X86_64 839#ifdef CONFIG_X86_64
851 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 840 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
852#endif 841#endif
853 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 842 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
843 MSR_IA32_FEATURE_CONTROL
854}; 844};
855 845
856static unsigned num_msrs_to_save; 846static unsigned num_msrs_to_save;
@@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1457#endif 1447#endif
1458} 1448}
1459 1449
1450static void kvm_gen_update_masterclock(struct kvm *kvm)
1451{
1452#ifdef CONFIG_X86_64
1453 int i;
1454 struct kvm_vcpu *vcpu;
1455 struct kvm_arch *ka = &kvm->arch;
1456
1457 spin_lock(&ka->pvclock_gtod_sync_lock);
1458 kvm_make_mclock_inprogress_request(kvm);
1459 /* no guest entries from this point */
1460 pvclock_update_vm_gtod_copy(kvm);
1461
1462 kvm_for_each_vcpu(i, vcpu, kvm)
1463 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1464
1465 /* guest entries allowed */
1466 kvm_for_each_vcpu(i, vcpu, kvm)
1467 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
1468
1469 spin_unlock(&ka->pvclock_gtod_sync_lock);
1470#endif
1471}
1472
1460static int kvm_guest_time_update(struct kvm_vcpu *v) 1473static int kvm_guest_time_update(struct kvm_vcpu *v)
1461{ 1474{
1462 unsigned long flags, this_tsc_khz; 1475 unsigned long flags, this_tsc_khz;
@@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3806 delta = user_ns.clock - now_ns; 3819 delta = user_ns.clock - now_ns;
3807 local_irq_enable(); 3820 local_irq_enable();
3808 kvm->arch.kvmclock_offset = delta; 3821 kvm->arch.kvmclock_offset = delta;
3822 kvm_gen_update_masterclock(kvm);
3809 break; 3823 break;
3810 } 3824 }
3811 case KVM_GET_CLOCK: { 3825 case KVM_GET_CLOCK: {
@@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4955static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 4969static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4956static int complete_emulated_pio(struct kvm_vcpu *vcpu); 4970static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4957 4971
4972static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
4973 unsigned long *db)
4974{
4975 u32 dr6 = 0;
4976 int i;
4977 u32 enable, rwlen;
4978
4979 enable = dr7;
4980 rwlen = dr7 >> 16;
4981 for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
4982 if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
4983 dr6 |= (1 << i);
4984 return dr6;
4985}
4986
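kvm_vcpu_check_hw_bp() above walks DR7 two enable bits and four R/W-LEN bits at a time. A self-contained re-run of that loop for a single, locally enabled instruction breakpoint in DR0 (illustrative values only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t dr7 = 0x00000401;	/* bit 10 is always 1; L0 enables breakpoint 0 */
	unsigned long db[4] = { 0x401000, 0, 0, 0 };
	unsigned long addr = 0x401000;	/* instruction address being checked */
	uint32_t enable = dr7, rwlen = dr7 >> 16, dr6 = 0;

	for (int i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
		if ((enable & 3) && (rwlen & 15) == 0 /* type 0: execute */ && db[i] == addr)
			dr6 |= 1u << i;

	assert(dr6 == 1);	/* DR6.B0 would be reported for this hit */
	return 0;
}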
4987static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
4988{
4989 struct kvm_run *kvm_run = vcpu->run;
4990
4991 /*
4992 * Use the "raw" value to see if TF was passed to the processor.
4993 * Note that the new value of the flags has not been saved yet.
4994 *
4995 * This is correct even for TF set by the guest, because "the
4996 * processor will not generate this exception after the instruction
4997 * that sets the TF flag".
4998 */
4999 unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5000
5001 if (unlikely(rflags & X86_EFLAGS_TF)) {
5002 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5003 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
5004 kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
5005 kvm_run->debug.arch.exception = DB_VECTOR;
5006 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5007 *r = EMULATE_USER_EXIT;
5008 } else {
5009 vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
5010 /*
5011 * "Certain debug exceptions may clear bit 0-3. The
5012 * remaining contents of the DR6 register are never
5013 * cleared by the processor".
5014 */
5015 vcpu->arch.dr6 &= ~15;
5016 vcpu->arch.dr6 |= DR6_BS;
5017 kvm_queue_exception(vcpu, DB_VECTOR);
5018 }
5019 }
5020}
5021
5022static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5023{
5024 struct kvm_run *kvm_run = vcpu->run;
5025 unsigned long eip = vcpu->arch.emulate_ctxt.eip;
5026 u32 dr6 = 0;
5027
5028 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
5029 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
5030 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5031 vcpu->arch.guest_debug_dr7,
5032 vcpu->arch.eff_db);
5033
5034 if (dr6 != 0) {
5035 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5036 kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
5037 get_segment_base(vcpu, VCPU_SREG_CS);
5038
5039 kvm_run->debug.arch.exception = DB_VECTOR;
5040 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5041 *r = EMULATE_USER_EXIT;
5042 return true;
5043 }
5044 }
5045
5046 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
5047 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5048 vcpu->arch.dr7,
5049 vcpu->arch.db);
5050
5051 if (dr6 != 0) {
5052 vcpu->arch.dr6 &= ~15;
5053 vcpu->arch.dr6 |= dr6;
5054 kvm_queue_exception(vcpu, DB_VECTOR);
5055 *r = EMULATE_DONE;
5056 return true;
5057 }
5058 }
5059
5060 return false;
5061}
5062
4958int x86_emulate_instruction(struct kvm_vcpu *vcpu, 5063int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4959 unsigned long cr2, 5064 unsigned long cr2,
4960 int emulation_type, 5065 int emulation_type,
@@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4975 5080
4976 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 5081 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4977 init_emulate_ctxt(vcpu); 5082 init_emulate_ctxt(vcpu);
5083
5084 /*
5085 * We will reenter on the same instruction since
5086 * we do not set complete_userspace_io. This does not
 5087 * handle watchpoints yet; those would be handled in
5088 * the emulate_ops.
5089 */
5090 if (kvm_vcpu_check_breakpoint(vcpu, &r))
5091 return r;
5092
4978 ctxt->interruptibility = 0; 5093 ctxt->interruptibility = 0;
4979 ctxt->have_exception = false; 5094 ctxt->have_exception = false;
4980 ctxt->perm_ok = false; 5095 ctxt->perm_ok = false;
@@ -5031,17 +5146,18 @@ restart:
5031 inject_emulated_exception(vcpu); 5146 inject_emulated_exception(vcpu);
5032 r = EMULATE_DONE; 5147 r = EMULATE_DONE;
5033 } else if (vcpu->arch.pio.count) { 5148 } else if (vcpu->arch.pio.count) {
5034 if (!vcpu->arch.pio.in) 5149 if (!vcpu->arch.pio.in) {
5150 /* FIXME: return into emulator if single-stepping. */
5035 vcpu->arch.pio.count = 0; 5151 vcpu->arch.pio.count = 0;
5036 else { 5152 } else {
5037 writeback = false; 5153 writeback = false;
5038 vcpu->arch.complete_userspace_io = complete_emulated_pio; 5154 vcpu->arch.complete_userspace_io = complete_emulated_pio;
5039 } 5155 }
5040 r = EMULATE_DO_MMIO; 5156 r = EMULATE_USER_EXIT;
5041 } else if (vcpu->mmio_needed) { 5157 } else if (vcpu->mmio_needed) {
5042 if (!vcpu->mmio_is_write) 5158 if (!vcpu->mmio_is_write)
5043 writeback = false; 5159 writeback = false;
5044 r = EMULATE_DO_MMIO; 5160 r = EMULATE_USER_EXIT;
5045 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 5161 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5046 } else if (r == EMULATION_RESTART) 5162 } else if (r == EMULATION_RESTART)
5047 goto restart; 5163 goto restart;
@@ -5050,10 +5166,12 @@ restart:
5050 5166
5051 if (writeback) { 5167 if (writeback) {
5052 toggle_interruptibility(vcpu, ctxt->interruptibility); 5168 toggle_interruptibility(vcpu, ctxt->interruptibility);
5053 kvm_set_rflags(vcpu, ctxt->eflags);
5054 kvm_make_request(KVM_REQ_EVENT, vcpu); 5169 kvm_make_request(KVM_REQ_EVENT, vcpu);
5055 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5170 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5056 kvm_rip_write(vcpu, ctxt->eip); 5171 kvm_rip_write(vcpu, ctxt->eip);
5172 if (r == EMULATE_DONE)
5173 kvm_vcpu_check_singlestep(vcpu, &r);
5174 kvm_set_rflags(vcpu, ctxt->eflags);
5057 } else 5175 } else
5058 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 5176 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
5059 5177
@@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = {
5347int kvm_arch_init(void *opaque) 5465int kvm_arch_init(void *opaque)
5348{ 5466{
5349 int r; 5467 int r;
5350 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 5468 struct kvm_x86_ops *ops = opaque;
5351 5469
5352 if (kvm_x86_ops) { 5470 if (kvm_x86_ops) {
5353 printk(KERN_ERR "kvm: already loaded the other module\n"); 5471 printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5495 return 1; 5613 return 1;
5496} 5614}
5497 5615
5616/*
5617 * kvm_pv_kick_cpu_op: Kick a vcpu.
5618 *
5619 * @apicid - apicid of vcpu to be kicked.
5620 */
5621static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5622{
5623 struct kvm_lapic_irq lapic_irq;
5624
5625 lapic_irq.shorthand = 0;
5626 lapic_irq.dest_mode = 0;
5627 lapic_irq.dest_id = apicid;
5628
5629 lapic_irq.delivery_mode = APIC_DM_REMRD;
5630 kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
5631}
5632
5498int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5633int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5499{ 5634{
5500 unsigned long nr, a0, a1, a2, a3, ret; 5635 unsigned long nr, a0, a1, a2, a3, ret;
@@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5528 case KVM_HC_VAPIC_POLL_IRQ: 5663 case KVM_HC_VAPIC_POLL_IRQ:
5529 ret = 0; 5664 ret = 0;
5530 break; 5665 break;
5666 case KVM_HC_KICK_CPU:
5667 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
5668 ret = 0;
5669 break;
5531 default: 5670 default:
5532 ret = -KVM_ENOSYS; 5671 ret = -KVM_ENOSYS;
5533 break; 5672 break;
@@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
5689 kvm_make_request(KVM_REQ_EVENT, vcpu); 5828 kvm_make_request(KVM_REQ_EVENT, vcpu);
5690} 5829}
5691 5830
5692static void kvm_gen_update_masterclock(struct kvm *kvm)
5693{
5694#ifdef CONFIG_X86_64
5695 int i;
5696 struct kvm_vcpu *vcpu;
5697 struct kvm_arch *ka = &kvm->arch;
5698
5699 spin_lock(&ka->pvclock_gtod_sync_lock);
5700 kvm_make_mclock_inprogress_request(kvm);
5701 /* no guest entries from this point */
5702 pvclock_update_vm_gtod_copy(kvm);
5703
5704 kvm_for_each_vcpu(i, vcpu, kvm)
5705 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5706
5707 /* guest entries allowed */
5708 kvm_for_each_vcpu(i, vcpu, kvm)
5709 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5710
5711 spin_unlock(&ka->pvclock_gtod_sync_lock);
5712#endif
5713}
5714
5715static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 5831static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5716{ 5832{
5717 u64 eoi_exit_bitmap[4]; 5833 u64 eoi_exit_bitmap[4];
@@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5950 kvm_apic_accept_events(vcpu); 6066 kvm_apic_accept_events(vcpu);
5951 switch(vcpu->arch.mp_state) { 6067 switch(vcpu->arch.mp_state) {
5952 case KVM_MP_STATE_HALTED: 6068 case KVM_MP_STATE_HALTED:
6069 vcpu->arch.pv.pv_unhalted = false;
5953 vcpu->arch.mp_state = 6070 vcpu->arch.mp_state =
5954 KVM_MP_STATE_RUNNABLE; 6071 KVM_MP_STATE_RUNNABLE;
5955 case KVM_MP_STATE_RUNNABLE: 6072 case KVM_MP_STATE_RUNNABLE:
@@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6061 6178
6062 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 6179 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
6063 vcpu->mmio_needed = 0; 6180 vcpu->mmio_needed = 0;
6181
6182 /* FIXME: return into emulator if single-stepping. */
6064 if (vcpu->mmio_is_write) 6183 if (vcpu->mmio_is_write)
6065 return 1; 6184 return 1;
6066 vcpu->mmio_read_completed = 1; 6185 vcpu->mmio_read_completed = 1;
@@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6249 struct kvm_mp_state *mp_state) 6368 struct kvm_mp_state *mp_state)
6250{ 6369{
6251 kvm_apic_accept_events(vcpu); 6370 kvm_apic_accept_events(vcpu);
6252 mp_state->mp_state = vcpu->arch.mp_state; 6371 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
6372 vcpu->arch.pv.pv_unhalted)
6373 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
6374 else
6375 mp_state->mp_state = vcpu->arch.mp_state;
6376
6253 return 0; 6377 return 0;
6254} 6378}
6255 6379
@@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6770 BUG_ON(vcpu->kvm == NULL); 6894 BUG_ON(vcpu->kvm == NULL);
6771 kvm = vcpu->kvm; 6895 kvm = vcpu->kvm;
6772 6896
6897 vcpu->arch.pv.pv_unhalted = false;
6773 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 6898 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6774 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6899 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6775 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6900 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -7019,6 +7144,15 @@ out_free:
7019 return -ENOMEM; 7144 return -ENOMEM;
7020} 7145}
7021 7146
7147void kvm_arch_memslots_updated(struct kvm *kvm)
7148{
7149 /*
7150 * memslots->generation has been incremented.
7151 * mmio generation may have reached its maximum value.
7152 */
7153 kvm_mmu_invalidate_mmio_sptes(kvm);
7154}
7155
7022int kvm_arch_prepare_memory_region(struct kvm *kvm, 7156int kvm_arch_prepare_memory_region(struct kvm *kvm,
7023 struct kvm_memory_slot *memslot, 7157 struct kvm_memory_slot *memslot,
7024 struct kvm_userspace_memory_region *mem, 7158 struct kvm_userspace_memory_region *mem,
@@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7079 */ 7213 */
7080 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7214 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7081 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7215 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
7082 /*
7083 * If memory slot is created, or moved, we need to clear all
7084 * mmio sptes.
7085 */
7086 kvm_mmu_invalidate_mmio_sptes(kvm);
7087} 7216}
7088 7217
7089void kvm_arch_flush_shadow_all(struct kvm *kvm) 7218void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7103 !vcpu->arch.apf.halted) 7232 !vcpu->arch.apf.halted)
7104 || !list_empty_careful(&vcpu->async_pf.done) 7233 || !list_empty_careful(&vcpu->async_pf.done)
7105 || kvm_apic_has_events(vcpu) 7234 || kvm_apic_has_events(vcpu)
7235 || vcpu->arch.pv.pv_unhalted
7106 || atomic_read(&vcpu->arch.nmi_queued) || 7236 || atomic_read(&vcpu->arch.nmi_queued) ||
7107 (kvm_arch_interrupt_allowed(vcpu) && 7237 (kvm_arch_interrupt_allowed(vcpu) &&
7108 kvm_cpu_has_interrupt(vcpu)); 7238 kvm_cpu_has_interrupt(vcpu));
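
The KVM_HC_KICK_CPU handler added above is the host half of pv-ticketlocks: a vCPU that halted while waiting for its ticket is woken by a hypercall from the vCPU releasing the lock, and pv_unhalted keeps it runnable until it actually resumes. A minimal guest-side sketch follows; it assumes the documented x86 KVM hypercall convention (number in RAX, a0 in RBX, a1 in RCX, issued via vmcall on Intel or vmmcall on AMD) and the KVM_HC_KICK_CPU number from include/uapi/linux/kvm_para.h. The helper names are illustrative and not part of this series (the real guest code goes in through the tip tree).

/* Illustrative guest-side sketch only.  Assumes the x86 KVM hypercall
 * ABI: nr in RAX, a0 in RBX, a1 in RCX; the kernel's real helper picks
 * vmcall or vmmcall depending on the CPU vendor.
 */
#include <uapi/linux/kvm_para.h>        /* KVM_HC_KICK_CPU */

static inline long kvm_hypercall2_sketch(unsigned int nr, unsigned long p1,
                                         unsigned long p2)
{
        long ret;

        asm volatile("vmcall"
                     : "=a"(ret)
                     : "a"(nr), "b"(p1), "c"(p2)
                     : "memory");
        return ret;
}

/* Wake the vCPU with the given APIC ID; a0 (flags) is currently unused
 * and a1 is the APIC ID, matching kvm_pv_kick_cpu_op(kvm, a0, a1) above. */
static void kick_halted_vcpu(int apicid)
{
        kvm_hypercall2_sketch(KVM_HC_KICK_CPU, 0, apicid);
}
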
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index c74436e687bf..72074d528400 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
-	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;
 
 
 	/*
-	 * When looping to get a consistent (time-info, tsc) pair, we
-	 * also need to deal with the possibility we can switch vcpus,
-	 * so make sure we always re-fetch time-info for the current vcpu.
+	 * Note: hypervisor must guarantee that:
+	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+	 * 2. that per-CPU pvclock time info is updated if the
+	 *    underlying CPU changes.
+	 * 3. that version is increased whenever underlying CPU
+	 *    changes.
+	 *
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 		pvti = get_pvti(cpu);
 
-		migrate_count = pvti->migrate_count;
-
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
 		/*
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
 		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
 	} while (unlikely(cpu != cpu1 ||
 			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version ||
-			  pvti->migrate_count != migrate_count));
+			  pvti->pvti.version != version));
 
 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
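
The simplified vread_pvclock() loop above can drop migrate_count because the hypervisor now bumps the pvclock version whenever the backing CPU changes, so checking the version alone is sufficient. For readers unfamiliar with that protocol, here is a generic seqcount-style sketch of the version check the loop relies on; the structure and names are illustrative, not the real pvclock_vcpu_time_info layout.

/* Sketch of the producer/consumer protocol: the producer makes 'version'
 * odd before updating the payload and even afterwards, so the reader
 * retries whenever it sees an odd or changed version.
 */
struct sample {
        volatile unsigned int version;
        volatile unsigned long long value;
};

static unsigned long long read_sample(const struct sample *s)
{
        unsigned int v;
        unsigned long long val;

        do {
                v = s->version;                 /* odd: update in progress */
                __sync_synchronize();           /* read payload after version */
                val = s->value;
                __sync_synchronize();           /* re-check version after payload */
        } while ((v & 1) || v != s->version);

        return val;
}
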
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 5daa2599ed48..e373671652b0 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -200,11 +200,9 @@ config DMA_SHARED_BUFFER
 	  APIs extension; the file's descriptor can then be passed on to other
 	  driver.
 
-config CMA
-	bool "Contiguous Memory Allocator"
-	depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK
-	select MIGRATION
-	select MEMORY_ISOLATION
+config DMA_CMA
+	bool "DMA Contiguous Memory Allocator"
+	depends on HAVE_DMA_CONTIGUOUS && CMA
 	help
 	  This enables the Contiguous Memory Allocator which allows drivers
 	  to allocate big physically-contiguous blocks of memory for use with
@@ -213,17 +211,7 @@ config CMA
 	  For more information see <include/linux/dma-contiguous.h>.
 	  If unsure, say "n".
 
-if CMA
-
-config CMA_DEBUG
-	bool "CMA debug messages (DEVELOPMENT)"
-	depends on DEBUG_KERNEL
-	help
-	  Turns on debug messages in CMA.  This produces KERN_DEBUG
-	  messages for every CMA call as well as various messages while
-	  processing calls such as dma_alloc_from_contiguous().
-	  This option does not affect warning and error messages.
-
+if DMA_CMA
 comment "Default contiguous memory area size:"
 
 config CMA_SIZE_MBYTES
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 48029aa477d9..94e8a80e87f8 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -6,7 +6,7 @@ obj-y			:= core.o bus.o dd.o syscore.o \
 			   attribute_container.o transport_class.o \
 			   topology.o
 obj-$(CONFIG_DEVTMPFS)	+= devtmpfs.o
-obj-$(CONFIG_CMA) += dma-contiguous.o
+obj-$(CONFIG_DMA_CMA) += dma-contiguous.o
 obj-y			+= power/
 obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 343744e4809c..7e2d15837b02 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -26,7 +26,7 @@
 #include <linux/types.h>
 #include <linux/irqchip/arm-gic.h>
 
-#define VGIC_NR_IRQS		128
+#define VGIC_NR_IRQS		256
 #define VGIC_NR_SGIS		16
 #define VGIC_NR_PPIS		16
 #define VGIC_NR_PRIVATE_IRQS	(VGIC_NR_SGIS + VGIC_NR_PPIS)
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 01b5c84be828..00141d3325fe 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -57,7 +57,7 @@ struct cma;
 struct page;
 struct device;
 
-#ifdef CONFIG_CMA
+#ifdef CONFIG_DMA_CMA
 
 /*
  * There is always at least global CMA area and a few optional device
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a63d83ebd151..ca645a01d37a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -85,6 +85,12 @@ static inline bool is_noslot_pfn(pfn_t pfn)
 	return pfn == KVM_PFN_NOSLOT;
 }
 
+/*
+ * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390)
+ * provide own defines and kvm_is_error_hva
+ */
+#ifndef KVM_HVA_ERR_BAD
+
 #define KVM_HVA_ERR_BAD		(PAGE_OFFSET)
 #define KVM_HVA_ERR_RO_BAD	(PAGE_OFFSET + PAGE_SIZE)
 
@@ -93,6 +99,8 @@ static inline bool kvm_is_error_hva(unsigned long addr)
 	return addr >= PAGE_OFFSET;
 }
 
+#endif
+
 #define KVM_ERR_PTR_BAD_PAGE	(ERR_PTR(-ENOENT))
 
 static inline bool is_error_page(struct page *page)
@@ -160,8 +168,12 @@ enum kvm_bus {
 
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val);
+int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, const void *val, long cookie);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
 		    void *val);
+int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			   int len, void *val, long cookie);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -499,6 +511,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
+void kvm_arch_memslots_updated(struct kvm *kvm);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
 				   struct kvm_userspace_memory_region *mem,
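
kvm_arch_memslots_updated() above is a new hook that every architecture now supplies; x86 uses it (earlier in this diff) to invalidate MMIO sptes once memslots->generation has been bumped. As a hedged sketch, an architecture with no MMIO spte caching would only need the no-op form below, assuming nothing beyond the declaration shown here:

/* Minimal sketch for an architecture that needs no action when the
 * memslot generation changes; only the prototype above is assumed. */
void kvm_arch_memslots_updated(struct kvm *kvm)
{
}
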
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f79ced719435..ce1e1c0aaa33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -107,14 +107,6 @@ extern unsigned long this_cpu_load(void);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
-/* Notifier for when a task gets migrated to a new CPU */
-struct task_migration_notifier {
-	struct task_struct *task;
-	int from_cpu;
-	int to_cpu;
-};
-extern void register_task_migration_notifier(struct notifier_block *n);
-
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index acccd08be6c7..99c25338ede8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_ARM_EL1_32BIT 93
+#define KVM_CAP_SPAPR_MULTITCE 94
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 725aa067ad63..5ac63c9a995a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -978,13 +978,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq->skip_clock_update = 1;
 }
 
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-	atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1015,18 +1008,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
-		struct task_migration_notifier tmn;
-
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
-
-		tmn.task = p;
-		tmn.from_cpu = task_cpu(p);
-		tmn.to_cpu = new_cpu;
-
-		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}
 
 	__set_task_cpu(p, new_cpu);
diff --git a/mm/Kconfig b/mm/Kconfig
index 8028dcc6615c..6cdd27043303 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -478,6 +478,30 @@ config FRONTSWAP
 
 	  If unsure, say Y to enable frontswap.
 
+config CMA
+	bool "Contiguous Memory Allocator"
+	depends on HAVE_MEMBLOCK
+	select MIGRATION
+	select MEMORY_ISOLATION
+	help
+	  This enables the Contiguous Memory Allocator which allows other
+	  subsystems to allocate big physically-contiguous blocks of memory.
+	  CMA reserves a region of memory and allows only movable pages to
+	  be allocated from it. This way, the kernel can use the memory for
+	  pagecache and when a subsystem requests for contiguous area, the
+	  allocated pages are migrated away to serve the contiguous request.
+
+	  If unsure, say "n".
+
+config CMA_DEBUG
+	bool "CMA debug messages (DEVELOPMENT)"
+	depends on DEBUG_KERNEL && CMA
+	help
+	  Turns on debug messages in CMA.  This produces KERN_DEBUG
+	  messages for every CMA call as well as various messages while
+	  processing calls such as dma_alloc_from_contiguous().
+	  This option does not affect warning and error messages.
+
 config ZBUD
 	tristate
 	default n
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 17c5ac7d10ed..685fc72fc751 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -149,7 +149,7 @@ static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
 {
 	offset >>= 2;
 	BUG_ON(offset > (VGIC_NR_IRQS / 4));
-	if (offset < 4)
+	if (offset < 8)
 		return x->percpu[cpuid] + offset;
 	else
 		return x->shared + offset - 8;
@@ -432,19 +432,13 @@ static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
 static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int i, c;
-	unsigned long *bmap;
+	int i;
 	u32 val = 0;
 
 	irq -= VGIC_NR_PRIVATE_IRQS;
 
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
-		for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
-			if (test_bit(irq + i, bmap))
-				val |= 1 << (c + i * 8);
-	}
+	for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
+		val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
 
 	return val;
 }
@@ -547,8 +541,12 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
 			    struct kvm_exit_mmio *mmio, phys_addr_t offset)
 {
 	u32 val;
-	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-				       vcpu->vcpu_id, offset >> 1);
+	u32 *reg;
+
+	offset >>= 1;
+	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
+				  vcpu->vcpu_id, offset);
+
 	if (offset & 2)
 		val = *reg >> 16;
 	else
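
The rewritten vgic_get_target_reg() above reads SPI targets straight from dist->irq_spi_cpu[] instead of scanning per-vCPU bitmaps. A standalone sketch of the GICD_ITARGETSRn packing it produces follows, where each 32-bit register covers four interrupts with one CPU-target bitmask byte per interrupt; the array and helper names here are illustrative, not kernel code.

/* Sketch: pack four per-interrupt CPU targets into one GICD_ITARGETSRn
 * value, one byte per interrupt, bit 'cpu' set within each byte.
 * 'targets' stands in for dist->irq_spi_cpu[irq..irq+3].
 */
#include <stdio.h>

static unsigned int pack_itargetsr(const unsigned char targets[4])
{
        unsigned int val = 0;
        int i;

        for (i = 0; i < 4; i++)
                val |= 1U << (targets[i] + i * 8);

        return val;
}

int main(void)
{
        unsigned char targets[4] = { 0, 1, 0, 3 };      /* CPUs 0, 1, 0, 3 */

        /* prints 0x08010201: byte i holds 1 << targets[i] */
        printf("0x%08x\n", pack_itargetsr(targets));
        return 0;
}
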
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4ace4e..bf040c4e02b3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -102,28 +102,8 @@ static bool largepages_enabled = true;
 
 bool kvm_is_mmio_pfn(pfn_t pfn)
 {
-	if (pfn_valid(pfn)) {
-		int reserved;
-		struct page *tail = pfn_to_page(pfn);
-		struct page *head = compound_trans_head(tail);
-		reserved = PageReserved(head);
-		if (head != tail) {
-			/*
-			 * "head" is not a dangling pointer
-			 * (compound_trans_head takes care of that)
-			 * but the hugepage may have been splitted
-			 * from under us (and we may not hold a
-			 * reference count on the head page so it can
-			 * be reused before we run PageReferenced), so
-			 * we've to check PageTail before returning
-			 * what we just read.
-			 */
-			smp_rmb();
-			if (PageTail(tail))
-				return reserved;
-		}
-		return PageReserved(tail);
-	}
+	if (pfn_valid(pfn))
+		return PageReserved(pfn_to_page(pfn));
 
 	return true;
 }
@@ -731,7 +711,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	update_memslots(slots, new, kvm->memslots->generation);
 	rcu_assign_pointer(kvm->memslots, slots);
 	synchronize_srcu_expedited(&kvm->srcu);
-	return old_memslots;
+
+	kvm_arch_memslots_updated(kvm);
+
+	return old_memslots;
 }
 
 /*
@@ -1893,7 +1876,7 @@ static struct file_operations kvm_vcpu_fops = {
  */
 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 {
-	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
+	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
 /*
@@ -2302,7 +2285,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 		return ret;
 	}
 
-	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
+	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
 		ops->destroy(dev);
 		return ret;
@@ -2586,7 +2569,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 		return r;
 	}
 #endif
-	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
 	if (r < 0)
 		kvm_put_kvm(kvm);
 
@@ -2812,11 +2795,9 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 	kfree(bus);
 }
 
-static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
+				 const struct kvm_io_range *r2)
 {
-	const struct kvm_io_range *r1 = p1;
-	const struct kvm_io_range *r2 = p2;
-
 	if (r1->addr < r2->addr)
 		return -1;
 	if (r1->addr + r1->len > r2->addr + r2->len)
@@ -2824,6 +2805,11 @@ static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
 	return 0;
 }
 
+static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+	return kvm_io_bus_cmp(p1, p2);
+}
+
 static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
 				 gpa_t addr, int len)
 {
@@ -2857,17 +2843,54 @@ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
 
 	off = range - bus->range;
 
-	while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
 		off--;
 
 	return off;
 }
 
+static int __kvm_io_bus_write(struct kvm_io_bus *bus,
+			      struct kvm_io_range *range, const void *val)
+{
+	int idx;
+
+	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
+	if (idx < 0)
+		return -EOPNOTSUPP;
+
+	while (idx < bus->dev_count &&
+		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_write(bus->range[idx].dev, range->addr,
+					range->len, val))
+			return idx;
+		idx++;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int idx;
+	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+	int r;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	r = __kvm_io_bus_write(bus, &range, val);
+	return r < 0 ? r : 0;
+}
+
+/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
+int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, const void *val, long cookie)
+{
 	struct kvm_io_bus *bus;
 	struct kvm_io_range range;
 
@@ -2877,14 +2900,35 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	};
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+
+	/* First try the device referenced by cookie. */
+	if ((cookie >= 0) && (cookie < bus->dev_count) &&
+	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+		if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len,
+					val))
+			return cookie;
+
+	/*
+	 * cookie contained garbage; fall back to search and return the
+	 * correct cookie value.
+	 */
+	return __kvm_io_bus_write(bus, &range, val);
+}
+
+static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
+			     void *val)
+{
+	int idx;
+
+	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
 	if (idx < 0)
 		return -EOPNOTSUPP;
 
 	while (idx < bus->dev_count &&
-		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
-			return 0;
+		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_read(bus->range[idx].dev, range->addr,
+				       range->len, val))
+			return idx;
 		idx++;
 	}
 
@@ -2895,9 +2939,9 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
-	int idx;
 	struct kvm_io_bus *bus;
 	struct kvm_io_range range;
+	int r;
 
 	range = (struct kvm_io_range) {
 		.addr = addr,
@@ -2905,18 +2949,36 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	};
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	idx = kvm_io_bus_get_first_dev(bus, addr, len);
-	if (idx < 0)
-		return -EOPNOTSUPP;
-
-	while (idx < bus->dev_count &&
-		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
-			return 0;
-		idx++;
-	}
-
-	return -EOPNOTSUPP;
+	r = __kvm_io_bus_read(bus, &range, val);
+	return r < 0 ? r : 0;
+}
+
+/* kvm_io_bus_read_cookie - called under kvm->slots_lock */
+int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			   int len, void *val, long cookie)
+{
+	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+
+	/* First try the device referenced by cookie. */
+	if ((cookie >= 0) && (cookie < bus->dev_count) &&
+	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+		if (!kvm_iodevice_read(bus->range[cookie].dev, addr, len,
+				       val))
+			return cookie;
+
+	/*
+	 * cookie contained garbage; fall back to search and return the
+	 * correct cookie value.
+	 */
+	return __kvm_io_bus_read(bus, &range, val);
 }
 
 /* Caller must hold slots_lock. */
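
The cookie variants added above let a hot path skip the binary search over the io bus: a successful call returns the index of the device that handled the access, the caller passes that value back next time, and a stale cookie simply falls through to the normal search. A hedged caller-side sketch follows, with cached_cookie and fast_mmio_write as illustrative names (kernel context assumed, using only the kvm_io_bus_write_cookie() signature and KVM_MMIO_BUS enum shown in this diff).

#include <linux/kvm_host.h>

/* Illustrative only: cache the cookie returned by the last successful
 * write and hand it back on the next access to the same device. */
static long cached_cookie = -1;         /* -1: nothing cached yet */

static int fast_mmio_write(struct kvm *kvm, gpa_t addr,
                           const void *val, int len)
{
        long ret;

        /* Returns the matching device index (the new cookie) on success,
         * or -EOPNOTSUPP if no device on KVM_MMIO_BUS claimed the access. */
        ret = kvm_io_bus_write_cookie(kvm, KVM_MMIO_BUS, addr, len,
                                      val, cached_cookie);
        if (ret < 0)
                return ret;

        cached_cookie = ret;
        return 0;
}
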