author    Linus Torvalds <torvalds@linux-foundation.org>  2013-09-04 21:15:06 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-09-04 21:15:06 -0400
commit    ae7a835cc546fc67df90edaaa0c48ae2b22a29fe
tree      b1235437fde066ab0f272f164d75dc1b98a244cf
parent    cf39c8e5352b4fb9efedfe7e9acb566a85ed847c
parent    6b9e4fa07443f5baf5bbd7ab043abd6976f8d7bc

Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Gleb Natapov:
 "The highlights of the release are nested EPT and pv-ticketlocks
  support (hypervisor part, guest part, which is most of the code, goes
  through tip tree).  Apart of that there are many fixes for all arches"

Fix up semantic conflicts as discussed in the pull request thread.

* 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (88 commits)
  ARM: KVM: Add newlines to panic strings
  ARM: KVM: Work around older compiler bug
  ARM: KVM: Simplify tracepoint text
  ARM: KVM: Fix kvm_set_pte assignment
  ARM: KVM: vgic: Bump VGIC_NR_IRQS to 256
  ARM: KVM: Bugfix: vgic_bytemap_get_reg per cpu regs
  ARM: KVM: vgic: fix GICD_ICFGRn access
  ARM: KVM: vgic: simplify vgic_get_target_reg
  KVM: MMU: remove unused parameter
  KVM: PPC: Book3S PR: Rework kvmppc_mmu_book3s_64_xlate()
  KVM: PPC: Book3S PR: Make instruction fetch fallback work for system calls
  KVM: PPC: Book3S PR: Don't corrupt guest state when kernel uses VMX
  KVM: x86: update masterclock when kvmclock_offset is calculated (v2)
  KVM: PPC: Book3S: Fix compile error in XICS emulation
  KVM: PPC: Book3S PR: return appropriate error when allocation fails
  arch: powerpc: kvm: add signed type cast for comparation
  KVM: x86: add comments where MMIO does not return to the emulator
  KVM: vmx: count exits to userspace during invalid guest emulation
  KVM: rename __kvm_io_bus_sort_cmp to kvm_io_bus_cmp
  kvm: optimize away THP checks in kvm_is_mmio_pfn()
  ...

-rw-r--r--  Documentation/virtual/kvm/cpuid.txt | 4
-rw-r--r--  Documentation/virtual/kvm/hypercalls.txt | 14
-rw-r--r--  arch/arm/configs/keystone_defconfig | 1
-rw-r--r--  arch/arm/configs/omap2plus_defconfig | 1
-rw-r--r--  arch/arm/configs/tegra_defconfig | 1
-rw-r--r--  arch/arm/include/asm/dma-contiguous.h | 2
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 2
-rw-r--r--  arch/arm/kvm/arm.c | 4
-rw-r--r--  arch/arm/kvm/interrupts.S | 8
-rw-r--r--  arch/arm/kvm/reset.c | 2
-rw-r--r--  arch/arm/kvm/trace.h | 7
-rw-r--r--  arch/arm/mm/dma-mapping.c | 6
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 4
-rw-r--r--  arch/mips/kvm/kvm_locore.S | 969
-rw-r--r--  arch/mips/kvm/kvm_mips.c | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 38
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 4
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 14
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 25
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 1
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 4
-rw-r--r--  arch/powerpc/kvm/Kconfig | 1
-rw-r--r--  arch/powerpc/kvm/Makefile | 1
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 150
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 42
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 36
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 246
-rw-r--r--  arch/powerpc/kvm/book3s_hv_cma.c | 240
-rw-r--r--  arch/powerpc/kvm/book3s_hv_cma.h | 27
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 139
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2
-rw-r--r--  arch/powerpc/kvm/book3s_interrupts.S | 14
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 35
-rw-r--r--  arch/powerpc/kvm/book3s_xics.c | 1
-rw-r--r--  arch/powerpc/kvm/booke.c | 6
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 6
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 8
-rw-r--r--  arch/s390/include/asm/mmu.h | 2
-rw-r--r--  arch/s390/include/asm/mmu_context.h | 19
-rw-r--r--  arch/s390/include/asm/pgtable.h | 11
-rw-r--r--  arch/s390/include/asm/processor.h | 2
-rw-r--r--  arch/s390/kvm/diag.c | 17
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 27
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 10
-rw-r--r--  arch/s390/kvm/priv.c | 32
-rw-r--r--  arch/s390/mm/pgtable.c | 183
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 14
-rw-r--r--  arch/x86/include/asm/pvclock.h | 1
-rw-r--r--  arch/x86/include/asm/vmx.h | 2
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 6
-rw-r--r--  arch/x86/kernel/pvclock.c | 44
-rw-r--r--  arch/x86/kvm/cpuid.c | 3
-rw-r--r--  arch/x86/kvm/lapic.c | 38
-rw-r--r--  arch/x86/kvm/mmu.c | 181
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 178
-rw-r--r--  arch/x86/kvm/pmu.c | 25
-rw-r--r--  arch/x86/kvm/vmx.c | 441
-rw-r--r--  arch/x86/kvm/x86.c | 224
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 16
-rw-r--r--  drivers/base/Kconfig | 20
-rw-r--r--  drivers/base/Makefile | 2
-rw-r--r--  include/kvm/arm_vgic.h | 2
-rw-r--r--  include/linux/dma-contiguous.h | 2
-rw-r--r--  include/linux/kvm_host.h | 13
-rw-r--r--  include/linux/sched.h | 8
-rw-r--r--  include/uapi/linux/kvm.h | 1
-rw-r--r--  kernel/sched/core.c | 15
-rw-r--r--  mm/Kconfig | 24
-rw-r--r--  virt/kvm/arm/vgic.c | 22
-rw-r--r--  virt/kvm/kvm_main.c | 156
73 files changed, 2413 insertions, 1403 deletions
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 83afe65d4966..22ff659bc0fb 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -43,6 +43,10 @@ KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
 KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by
 || || writing to msr 0x4b564d02
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit
+ || || before enabling paravirtualized
+ || || spinlock support.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
 || || per-cpu warps are expected in
 || || kvmclock.
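
A guest discovers this bit through the KVM paravirtual CPUID leaf before it
enables pv-spinlocks; inside a Linux guest that check is normally just
kvm_para_has_feature(KVM_FEATURE_PV_UNHALT). A minimal freestanding sketch of
the same probe, assuming the conventional KVM_CPUID_FEATURES leaf 0x40000001
and GCC's <cpuid.h> helpers (the function name here is made up for
illustration):

    #include <stdbool.h>
    #include <cpuid.h>                      /* GCC's __cpuid() macro */

    #define KVM_CPUID_FEATURES    0x40000001
    #define KVM_FEATURE_PV_UNHALT 7         /* bit number from the table above */

    /* Hypothetical helper: true if the hypervisor advertises PV_UNHALT.
     * A real implementation would first verify the "KVMKVMKVM" signature
     * at leaf 0x40000000. */
    static bool have_pv_unhalt(void)
    {
            unsigned int eax, ebx, ecx, edx;

            __cpuid(KVM_CPUID_FEATURES, eax, ebx, ecx, edx);
            return eax & (1u << KVM_FEATURE_PV_UNHALT);
    }
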
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index ea113b5d87a4..022198e389d7 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -64,3 +64,17 @@ Purpose: To enable communication between the hypervisor and guest there is a
 shared page that contains parts of supervisor visible register state.
 The guest can map this shared page to access its supervisor register through
 memory using this hypercall.
+
+5. KVM_HC_KICK_CPU
+------------------------
+Architecture: x86
+Status: active
+Purpose: Hypercall used to wakeup a vcpu from HLT state
+Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest
+kernel mode for an event to occur (ex: a spinlock to become available) can
+execute HLT instruction once it has busy-waited for more than a threshold
+time-interval. Execution of HLT instruction would cause the hypervisor to put
+the vcpu to sleep until occurence of an appropriate event. Another vcpu of the
+same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall,
+specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0)
+is used in the hypercall for future use.
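
For illustration, the calling convention this document describes (hypercall
number in %rax, arguments a0/a1 in %rbx/%rcx, vmcall as the trap instruction
on Intel, vmmcall on AMD) makes the guest-side kick roughly the sketch below.
KVM_HC_KICK_CPU is 5 in include/uapi/linux/kvm_para.h at this point; verify
against your headers, and note the a0 value is a placeholder since that
argument is reserved for future use:

    #define KVM_HC_KICK_CPU 5

    static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
                                      unsigned long p2)
    {
            long ret;

            asm volatile("vmcall"           /* vmmcall on AMD hosts */
                         : "=a"(ret)
                         : "a"(nr), "b"(p1), "c"(p2)
                         : "memory");
            return ret;
    }

    static void kvm_kick_cpu(int apicid)
    {
            /* a0 = 0 (reserved), a1 = APIC ID of the halted vcpu */
            kvm_hypercall2(KVM_HC_KICK_CPU, 0, apicid);
    }
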
diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig
index 62e968cac9dc..1f36b823905f 100644
--- a/arch/arm/configs/keystone_defconfig
+++ b/arch/arm/configs/keystone_defconfig
@@ -104,6 +104,7 @@ CONFIG_IP_SCTP=y
 CONFIG_VLAN_8021Q=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_MTD=y
 CONFIG_MTD_CMDLINE_PARTS=y
 CONFIG_MTD_BLOCK=y
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 5339e6a4d639..5465f564fdf3 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -78,6 +78,7 @@ CONFIG_MAC80211_RC_PID=y
 CONFIG_MAC80211_RC_DEFAULT_PID=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_CONNECTOR=y
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
diff --git a/arch/arm/configs/tegra_defconfig b/arch/arm/configs/tegra_defconfig
index 1effb43dab80..92d0a149aeb5 100644
--- a/arch/arm/configs/tegra_defconfig
+++ b/arch/arm/configs/tegra_defconfig
@@ -79,6 +79,7 @@ CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 # CONFIG_FIRMWARE_IN_KERNEL is not set
 CONFIG_CMA=y
+CONFIG_DMA_CMA=y
 CONFIG_MTD=y
 CONFIG_MTD_M25P80=y
 CONFIG_PROC_DEVICETREE=y
diff --git a/arch/arm/include/asm/dma-contiguous.h b/arch/arm/include/asm/dma-contiguous.h
index 3ed37b4d93da..e072bb2ba1b1 100644
--- a/arch/arm/include/asm/dma-contiguous.h
+++ b/arch/arm/include/asm/dma-contiguous.h
@@ -2,7 +2,7 @@
 #define ASMARM_DMA_CONTIGUOUS_H
 
 #ifdef __KERNEL__
-#ifdef CONFIG_CMA
+#ifdef CONFIG_DMA_CMA
 
 #include <linux/types.h>
 #include <asm-generic/dma-contiguous.h>
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 472ac7091003..9b28c41f4ba9 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -64,7 +64,7 @@ void kvm_clear_hyp_idmap(void);
 
 static inline void kvm_set_pte(pte_t *pte, pte_t new_pte)
 {
-        pte_val(*pte) = new_pte;
+        *pte = new_pte;
         /*
          * flush_pmd_entry just takes a void pointer and cleans the necessary
          * cache entries, so we can reuse the function for ptes.
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 741f66a2edbd..9c697db2787e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -219,6 +219,10 @@ long kvm_arch_dev_ioctl(struct file *filp,
         return -EINVAL;
 }
 
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_memory_slot *memslot,
                                    struct kvm_userspace_memory_region *mem,
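
The empty stub here (and the matching ones added for ia64 and MIPS below)
satisfies a new arch hook: generic KVM now notifies the architecture once a
new memslot array has been published so that, e.g., x86 can refresh its MMIO
generation. A simplified sketch of the generic-side caller, not the literal
virt/kvm/kvm_main.c code:

    /* Simplified: publish the new memslots, wait out readers, notify arch. */
    static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
                                                     struct kvm_memslots *slots)
    {
            struct kvm_memslots *old_memslots = kvm->memslots;

            rcu_assign_pointer(kvm->memslots, slots);
            synchronize_srcu_expedited(&kvm->srcu);

            kvm_arch_memslots_updated(kvm);         /* the new hook */

            return old_memslots;
    }
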
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 16cd4ba5d7fd..85dd84b10687 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -492,10 +492,10 @@ __kvm_hyp_code_end:
         .section ".rodata"
 
 und_die_str:
-        .ascii  "unexpected undefined exception in Hyp mode at: %#08x"
+        .ascii  "unexpected undefined exception in Hyp mode at: %#08x\n"
 pabt_die_str:
-        .ascii  "unexpected prefetch abort in Hyp mode at: %#08x"
+        .ascii  "unexpected prefetch abort in Hyp mode at: %#08x\n"
 dabt_die_str:
-        .ascii  "unexpected data abort in Hyp mode at: %#08x"
+        .ascii  "unexpected data abort in Hyp mode at: %#08x\n"
 svc_die_str:
-        .ascii  "unexpected HVC/SVC trap in Hyp mode at: %#08x"
+        .ascii  "unexpected HVC/SVC trap in Hyp mode at: %#08x\n"
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index b7840e7aa452..71e08baee209 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -40,7 +40,7 @@ static struct kvm_regs a15_regs_reset = {
 };
 
 static const struct kvm_irq_level a15_vtimer_irq = {
-        .irq = 27,
+        { .irq = 27 },
         .level = 1,
 };
 
diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
index a8e73ed5ad5b..b1d640f78623 100644
--- a/arch/arm/kvm/trace.h
+++ b/arch/arm/kvm/trace.h
@@ -59,10 +59,9 @@ TRACE_EVENT(kvm_guest_fault,
                 __entry->ipa = ipa;
         ),
 
-        TP_printk("guest fault at PC %#08lx (hxfar %#08lx, "
-                  "ipa %#16llx, hsr %#08lx",
-                  __entry->vcpu_pc, __entry->hxfar,
-                  __entry->ipa, __entry->hsr)
+        TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+                  __entry->ipa, __entry->hsr,
+                  __entry->hxfar, __entry->vcpu_pc)
 );
 
 TRACE_EVENT(kvm_irq_line,
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7f9b1798c6cf..dbddc07a3bbd 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -358,7 +358,7 @@ static int __init atomic_pool_init(void)
         if (!pages)
                 goto no_pages;
 
-        if (IS_ENABLED(CONFIG_CMA))
+        if (IS_ENABLED(CONFIG_DMA_CMA))
                 ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page,
                                               atomic_pool_init);
         else
@@ -670,7 +670,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                 addr = __alloc_simple_buffer(dev, size, gfp, &page);
         else if (!(gfp & __GFP_WAIT))
                 addr = __alloc_from_pool(size, &page);
-        else if (!IS_ENABLED(CONFIG_CMA))
+        else if (!IS_ENABLED(CONFIG_DMA_CMA))
                 addr = __alloc_remap_buffer(dev, size, gfp, prot, &page, caller);
         else
                 addr = __alloc_from_contiguous(dev, size, prot, &page, caller);
@@ -759,7 +759,7 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
                 __dma_free_buffer(page, size);
         } else if (__free_from_pool(cpu_addr, size)) {
                 return;
-        } else if (!IS_ENABLED(CONFIG_CMA)) {
+        } else if (!IS_ENABLED(CONFIG_DMA_CMA)) {
                 __dma_free_remap(cpu_addr, size);
                 __dma_free_buffer(page, size);
         } else {
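
The CONFIG_CMA -> CONFIG_DMA_CMA switch in these branches is purely mechanical
because IS_ENABLED() evaluates to a compile-time 0 or 1, letting the compiler
discard the untaken allocation path while still type-checking it. A
stand-alone sketch of the pattern with hypothetical helpers (not from this
patch):

    #include <linux/kconfig.h>      /* IS_ENABLED() */
    #include <linux/device.h>
    #include <linux/gfp.h>

    static void *alloc_dma_buffer(struct device *dev, size_t size, gfp_t gfp)
    {
            if (IS_ENABLED(CONFIG_DMA_CMA))
                    return alloc_from_contiguous_area(dev, size);   /* hypothetical */

            return alloc_by_remapping(dev, size, gfp);              /* hypothetical */
    }
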
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5b2dc0d10c8f..bdfd8789b376 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1560,6 +1560,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
         return 0;
 }
 
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                 struct kvm_memory_slot *memslot,
                 struct kvm_userspace_memory_region *mem,
diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index dca2aa665993..bbace092ad0a 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -1,13 +1,13 @@
 /*
-* This file is subject to the terms and conditions of the GNU General Public
-* License. See the file "COPYING" in the main directory of this archive
-* for more details.
-*
-* Main entry point for the guest, exception handling.
-*
-* Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
-* Authors: Sanjay Lal <sanjayl@kymasys.com>
-*/
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
 
 #include <asm/asm.h>
 #include <asm/asmmacro.h>
@@ -55,195 +55,193 @@
55 * a0: run 55 * a0: run
56 * a1: vcpu 56 * a1: vcpu
57 */ 57 */
58 .set noreorder
59 .set noat
58 60
59FEXPORT(__kvm_mips_vcpu_run) 61FEXPORT(__kvm_mips_vcpu_run)
60 .set push 62 /* k0/k1 not being used in host kernel context */
61 .set noreorder 63 INT_ADDIU k1, sp, -PT_SIZE
62 .set noat 64 LONG_S $0, PT_R0(k1)
63 65 LONG_S $1, PT_R1(k1)
64 /* k0/k1 not being used in host kernel context */ 66 LONG_S $2, PT_R2(k1)
65 addiu k1,sp, -PT_SIZE 67 LONG_S $3, PT_R3(k1)
66 LONG_S $0, PT_R0(k1) 68
67 LONG_S $1, PT_R1(k1) 69 LONG_S $4, PT_R4(k1)
68 LONG_S $2, PT_R2(k1) 70 LONG_S $5, PT_R5(k1)
69 LONG_S $3, PT_R3(k1) 71 LONG_S $6, PT_R6(k1)
70 72 LONG_S $7, PT_R7(k1)
71 LONG_S $4, PT_R4(k1) 73
72 LONG_S $5, PT_R5(k1) 74 LONG_S $8, PT_R8(k1)
73 LONG_S $6, PT_R6(k1) 75 LONG_S $9, PT_R9(k1)
74 LONG_S $7, PT_R7(k1) 76 LONG_S $10, PT_R10(k1)
75 77 LONG_S $11, PT_R11(k1)
76 LONG_S $8, PT_R8(k1) 78 LONG_S $12, PT_R12(k1)
77 LONG_S $9, PT_R9(k1) 79 LONG_S $13, PT_R13(k1)
78 LONG_S $10, PT_R10(k1) 80 LONG_S $14, PT_R14(k1)
79 LONG_S $11, PT_R11(k1) 81 LONG_S $15, PT_R15(k1)
80 LONG_S $12, PT_R12(k1) 82 LONG_S $16, PT_R16(k1)
81 LONG_S $13, PT_R13(k1) 83 LONG_S $17, PT_R17(k1)
82 LONG_S $14, PT_R14(k1) 84
83 LONG_S $15, PT_R15(k1) 85 LONG_S $18, PT_R18(k1)
84 LONG_S $16, PT_R16(k1) 86 LONG_S $19, PT_R19(k1)
85 LONG_S $17, PT_R17(k1) 87 LONG_S $20, PT_R20(k1)
86 88 LONG_S $21, PT_R21(k1)
87 LONG_S $18, PT_R18(k1) 89 LONG_S $22, PT_R22(k1)
88 LONG_S $19, PT_R19(k1) 90 LONG_S $23, PT_R23(k1)
89 LONG_S $20, PT_R20(k1) 91 LONG_S $24, PT_R24(k1)
90 LONG_S $21, PT_R21(k1) 92 LONG_S $25, PT_R25(k1)
91 LONG_S $22, PT_R22(k1)
92 LONG_S $23, PT_R23(k1)
93 LONG_S $24, PT_R24(k1)
94 LONG_S $25, PT_R25(k1)
95 93
96 /* XXXKYMA k0/k1 not saved, not being used if we got here through an ioctl() */ 94 /* XXXKYMA k0/k1 not saved, not being used if we got here through an ioctl() */
97 95
98 LONG_S $28, PT_R28(k1) 96 LONG_S $28, PT_R28(k1)
99 LONG_S $29, PT_R29(k1) 97 LONG_S $29, PT_R29(k1)
100 LONG_S $30, PT_R30(k1) 98 LONG_S $30, PT_R30(k1)
101 LONG_S $31, PT_R31(k1) 99 LONG_S $31, PT_R31(k1)
102 100
103 /* Save hi/lo */ 101 /* Save hi/lo */
104 mflo v0 102 mflo v0
105 LONG_S v0, PT_LO(k1) 103 LONG_S v0, PT_LO(k1)
106 mfhi v1 104 mfhi v1
107 LONG_S v1, PT_HI(k1) 105 LONG_S v1, PT_HI(k1)
108 106
109 /* Save host status */ 107 /* Save host status */
110 mfc0 v0, CP0_STATUS 108 mfc0 v0, CP0_STATUS
111 LONG_S v0, PT_STATUS(k1) 109 LONG_S v0, PT_STATUS(k1)
112 110
113 /* Save host ASID, shove it into the BVADDR location */ 111 /* Save host ASID, shove it into the BVADDR location */
114 mfc0 v1,CP0_ENTRYHI 112 mfc0 v1, CP0_ENTRYHI
115 andi v1, 0xff 113 andi v1, 0xff
116 LONG_S v1, PT_HOST_ASID(k1) 114 LONG_S v1, PT_HOST_ASID(k1)
117 115
118 /* Save DDATA_LO, will be used to store pointer to vcpu */ 116 /* Save DDATA_LO, will be used to store pointer to vcpu */
119 mfc0 v1, CP0_DDATA_LO 117 mfc0 v1, CP0_DDATA_LO
120 LONG_S v1, PT_HOST_USERLOCAL(k1) 118 LONG_S v1, PT_HOST_USERLOCAL(k1)
121 119
122 /* DDATA_LO has pointer to vcpu */ 120 /* DDATA_LO has pointer to vcpu */
123 mtc0 a1,CP0_DDATA_LO 121 mtc0 a1, CP0_DDATA_LO
124 122
125 /* Offset into vcpu->arch */ 123 /* Offset into vcpu->arch */
126 addiu k1, a1, VCPU_HOST_ARCH 124 INT_ADDIU k1, a1, VCPU_HOST_ARCH
127 125
128 /* Save the host stack to VCPU, used for exception processing when we exit from the Guest */ 126 /*
129 LONG_S sp, VCPU_HOST_STACK(k1) 127 * Save the host stack to VCPU, used for exception processing
128 * when we exit from the Guest
129 */
130 LONG_S sp, VCPU_HOST_STACK(k1)
130 131
131 /* Save the kernel gp as well */ 132 /* Save the kernel gp as well */
132 LONG_S gp, VCPU_HOST_GP(k1) 133 LONG_S gp, VCPU_HOST_GP(k1)
133 134
134 /* Setup status register for running the guest in UM, interrupts are disabled */ 135 /* Setup status register for running the guest in UM, interrupts are disabled */
135 li k0,(ST0_EXL | KSU_USER| ST0_BEV) 136 li k0, (ST0_EXL | KSU_USER | ST0_BEV)
136 mtc0 k0,CP0_STATUS 137 mtc0 k0, CP0_STATUS
137 ehb 138 ehb
138 139
139 /* load up the new EBASE */ 140 /* load up the new EBASE */
140 LONG_L k0, VCPU_GUEST_EBASE(k1) 141 LONG_L k0, VCPU_GUEST_EBASE(k1)
141 mtc0 k0,CP0_EBASE 142 mtc0 k0, CP0_EBASE
142 143
143 /* Now that the new EBASE has been loaded, unset BEV, set interrupt mask as it was 144 /*
144 * but make sure that timer interrupts are enabled 145 * Now that the new EBASE has been loaded, unset BEV, set
145 */ 146 * interrupt mask as it was but make sure that timer interrupts
146 li k0,(ST0_EXL | KSU_USER | ST0_IE) 147 * are enabled
147 andi v0, v0, ST0_IM 148 */
148 or k0, k0, v0 149 li k0, (ST0_EXL | KSU_USER | ST0_IE)
149 mtc0 k0,CP0_STATUS 150 andi v0, v0, ST0_IM
150 ehb 151 or k0, k0, v0
152 mtc0 k0, CP0_STATUS
153 ehb
151 154
152 155
153 /* Set Guest EPC */ 156 /* Set Guest EPC */
154 LONG_L t0, VCPU_PC(k1) 157 LONG_L t0, VCPU_PC(k1)
155 mtc0 t0, CP0_EPC 158 mtc0 t0, CP0_EPC
156 159
157FEXPORT(__kvm_mips_load_asid) 160FEXPORT(__kvm_mips_load_asid)
158 /* Set the ASID for the Guest Kernel */ 161 /* Set the ASID for the Guest Kernel */
159 sll t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */ 162 INT_SLL t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */
160 /* addresses shift to 0x80000000 */ 163 /* addresses shift to 0x80000000 */
161 bltz t0, 1f /* If kernel */ 164 bltz t0, 1f /* If kernel */
162 addiu t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ 165 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */
163 addiu t1, k1, VCPU_GUEST_USER_ASID /* else user */ 166 INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */
1641: 1671:
165 /* t1: contains the base of the ASID array, need to get the cpu id */ 168 /* t1: contains the base of the ASID array, need to get the cpu id */
166 LONG_L t2, TI_CPU($28) /* smp_processor_id */ 169 LONG_L t2, TI_CPU($28) /* smp_processor_id */
167 sll t2, t2, 2 /* x4 */ 170 INT_SLL t2, t2, 2 /* x4 */
168 addu t3, t1, t2 171 REG_ADDU t3, t1, t2
169 LONG_L k0, (t3) 172 LONG_L k0, (t3)
170 andi k0, k0, 0xff 173 andi k0, k0, 0xff
171 mtc0 k0,CP0_ENTRYHI 174 mtc0 k0, CP0_ENTRYHI
172 ehb 175 ehb
173 176
174 /* Disable RDHWR access */ 177 /* Disable RDHWR access */
175 mtc0 zero, CP0_HWRENA 178 mtc0 zero, CP0_HWRENA
176 179
177 /* Now load up the Guest Context from VCPU */ 180 /* Now load up the Guest Context from VCPU */
178 LONG_L $1, VCPU_R1(k1) 181 LONG_L $1, VCPU_R1(k1)
179 LONG_L $2, VCPU_R2(k1) 182 LONG_L $2, VCPU_R2(k1)
180 LONG_L $3, VCPU_R3(k1) 183 LONG_L $3, VCPU_R3(k1)
181 184
182 LONG_L $4, VCPU_R4(k1) 185 LONG_L $4, VCPU_R4(k1)
183 LONG_L $5, VCPU_R5(k1) 186 LONG_L $5, VCPU_R5(k1)
184 LONG_L $6, VCPU_R6(k1) 187 LONG_L $6, VCPU_R6(k1)
185 LONG_L $7, VCPU_R7(k1) 188 LONG_L $7, VCPU_R7(k1)
186 189
187 LONG_L $8, VCPU_R8(k1) 190 LONG_L $8, VCPU_R8(k1)
188 LONG_L $9, VCPU_R9(k1) 191 LONG_L $9, VCPU_R9(k1)
189 LONG_L $10, VCPU_R10(k1) 192 LONG_L $10, VCPU_R10(k1)
190 LONG_L $11, VCPU_R11(k1) 193 LONG_L $11, VCPU_R11(k1)
191 LONG_L $12, VCPU_R12(k1) 194 LONG_L $12, VCPU_R12(k1)
192 LONG_L $13, VCPU_R13(k1) 195 LONG_L $13, VCPU_R13(k1)
193 LONG_L $14, VCPU_R14(k1) 196 LONG_L $14, VCPU_R14(k1)
194 LONG_L $15, VCPU_R15(k1) 197 LONG_L $15, VCPU_R15(k1)
195 LONG_L $16, VCPU_R16(k1) 198 LONG_L $16, VCPU_R16(k1)
196 LONG_L $17, VCPU_R17(k1) 199 LONG_L $17, VCPU_R17(k1)
197 LONG_L $18, VCPU_R18(k1) 200 LONG_L $18, VCPU_R18(k1)
198 LONG_L $19, VCPU_R19(k1) 201 LONG_L $19, VCPU_R19(k1)
199 LONG_L $20, VCPU_R20(k1) 202 LONG_L $20, VCPU_R20(k1)
200 LONG_L $21, VCPU_R21(k1) 203 LONG_L $21, VCPU_R21(k1)
201 LONG_L $22, VCPU_R22(k1) 204 LONG_L $22, VCPU_R22(k1)
202 LONG_L $23, VCPU_R23(k1) 205 LONG_L $23, VCPU_R23(k1)
203 LONG_L $24, VCPU_R24(k1) 206 LONG_L $24, VCPU_R24(k1)
204 LONG_L $25, VCPU_R25(k1) 207 LONG_L $25, VCPU_R25(k1)
205 208
206 /* k0/k1 loaded up later */ 209 /* k0/k1 loaded up later */
207 210
208 LONG_L $28, VCPU_R28(k1) 211 LONG_L $28, VCPU_R28(k1)
209 LONG_L $29, VCPU_R29(k1) 212 LONG_L $29, VCPU_R29(k1)
210 LONG_L $30, VCPU_R30(k1) 213 LONG_L $30, VCPU_R30(k1)
211 LONG_L $31, VCPU_R31(k1) 214 LONG_L $31, VCPU_R31(k1)
212 215
213 /* Restore hi/lo */ 216 /* Restore hi/lo */
214 LONG_L k0, VCPU_LO(k1) 217 LONG_L k0, VCPU_LO(k1)
215 mtlo k0 218 mtlo k0
216 219
217 LONG_L k0, VCPU_HI(k1) 220 LONG_L k0, VCPU_HI(k1)
218 mthi k0 221 mthi k0
219 222
220FEXPORT(__kvm_mips_load_k0k1) 223FEXPORT(__kvm_mips_load_k0k1)
221 /* Restore the guest's k0/k1 registers */ 224 /* Restore the guest's k0/k1 registers */
222 LONG_L k0, VCPU_R26(k1) 225 LONG_L k0, VCPU_R26(k1)
223 LONG_L k1, VCPU_R27(k1) 226 LONG_L k1, VCPU_R27(k1)
224 227
225 /* Jump to guest */ 228 /* Jump to guest */
226 eret 229 eret
227 .set pop
228 230
229VECTOR(MIPSX(exception), unknown) 231VECTOR(MIPSX(exception), unknown)
230/* 232/*
231 * Find out what mode we came from and jump to the proper handler. 233 * Find out what mode we came from and jump to the proper handler.
232 */ 234 */
233 .set push 235 mtc0 k0, CP0_ERROREPC #01: Save guest k0
234 .set noat 236 ehb #02:
235 .set noreorder 237
236 mtc0 k0, CP0_ERROREPC #01: Save guest k0 238 mfc0 k0, CP0_EBASE #02: Get EBASE
237 ehb #02: 239 INT_SRL k0, k0, 10 #03: Get rid of CPUNum
238 240 INT_SLL k0, k0, 10 #04
239 mfc0 k0, CP0_EBASE #02: Get EBASE 241 LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000
240 srl k0, k0, 10 #03: Get rid of CPUNum 242 INT_ADDIU k0, k0, 0x2000 #06: Exception handler is installed @ offset 0x2000
241 sll k0, k0, 10 #04 243 j k0 #07: jump to the function
242 LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000 244 nop #08: branch delay slot
243 addiu k0, k0, 0x2000 #06: Exception handler is installed @ offset 0x2000
244 j k0 #07: jump to the function
245 nop #08: branch delay slot
246 .set push
247VECTOR_END(MIPSX(exceptionEnd)) 245VECTOR_END(MIPSX(exceptionEnd))
248.end MIPSX(exception) 246.end MIPSX(exception)
249 247
@@ -253,329 +251,327 @@ VECTOR_END(MIPSX(exceptionEnd))
253 * 251 *
254 */ 252 */
255NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) 253NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
256 .set push 254 /* Get the VCPU pointer from DDTATA_LO */
257 .set noat 255 mfc0 k1, CP0_DDATA_LO
258 .set noreorder 256 INT_ADDIU k1, k1, VCPU_HOST_ARCH
259 257
260 /* Get the VCPU pointer from DDTATA_LO */ 258 /* Start saving Guest context to VCPU */
261 mfc0 k1, CP0_DDATA_LO 259 LONG_S $0, VCPU_R0(k1)
262 addiu k1, k1, VCPU_HOST_ARCH 260 LONG_S $1, VCPU_R1(k1)
263 261 LONG_S $2, VCPU_R2(k1)
264 /* Start saving Guest context to VCPU */ 262 LONG_S $3, VCPU_R3(k1)
265 LONG_S $0, VCPU_R0(k1) 263 LONG_S $4, VCPU_R4(k1)
266 LONG_S $1, VCPU_R1(k1) 264 LONG_S $5, VCPU_R5(k1)
267 LONG_S $2, VCPU_R2(k1) 265 LONG_S $6, VCPU_R6(k1)
268 LONG_S $3, VCPU_R3(k1) 266 LONG_S $7, VCPU_R7(k1)
269 LONG_S $4, VCPU_R4(k1) 267 LONG_S $8, VCPU_R8(k1)
270 LONG_S $5, VCPU_R5(k1) 268 LONG_S $9, VCPU_R9(k1)
271 LONG_S $6, VCPU_R6(k1) 269 LONG_S $10, VCPU_R10(k1)
272 LONG_S $7, VCPU_R7(k1) 270 LONG_S $11, VCPU_R11(k1)
273 LONG_S $8, VCPU_R8(k1) 271 LONG_S $12, VCPU_R12(k1)
274 LONG_S $9, VCPU_R9(k1) 272 LONG_S $13, VCPU_R13(k1)
275 LONG_S $10, VCPU_R10(k1) 273 LONG_S $14, VCPU_R14(k1)
276 LONG_S $11, VCPU_R11(k1) 274 LONG_S $15, VCPU_R15(k1)
277 LONG_S $12, VCPU_R12(k1) 275 LONG_S $16, VCPU_R16(k1)
278 LONG_S $13, VCPU_R13(k1) 276 LONG_S $17, VCPU_R17(k1)
279 LONG_S $14, VCPU_R14(k1) 277 LONG_S $18, VCPU_R18(k1)
280 LONG_S $15, VCPU_R15(k1) 278 LONG_S $19, VCPU_R19(k1)
281 LONG_S $16, VCPU_R16(k1) 279 LONG_S $20, VCPU_R20(k1)
282 LONG_S $17,VCPU_R17(k1) 280 LONG_S $21, VCPU_R21(k1)
283 LONG_S $18, VCPU_R18(k1) 281 LONG_S $22, VCPU_R22(k1)
284 LONG_S $19, VCPU_R19(k1) 282 LONG_S $23, VCPU_R23(k1)
285 LONG_S $20, VCPU_R20(k1) 283 LONG_S $24, VCPU_R24(k1)
286 LONG_S $21, VCPU_R21(k1) 284 LONG_S $25, VCPU_R25(k1)
287 LONG_S $22, VCPU_R22(k1) 285
288 LONG_S $23, VCPU_R23(k1) 286 /* Guest k0/k1 saved later */
289 LONG_S $24, VCPU_R24(k1) 287
290 LONG_S $25, VCPU_R25(k1) 288 LONG_S $28, VCPU_R28(k1)
291 289 LONG_S $29, VCPU_R29(k1)
292 /* Guest k0/k1 saved later */ 290 LONG_S $30, VCPU_R30(k1)
293 291 LONG_S $31, VCPU_R31(k1)
294 LONG_S $28, VCPU_R28(k1) 292
295 LONG_S $29, VCPU_R29(k1) 293 /* We need to save hi/lo and restore them on
296 LONG_S $30, VCPU_R30(k1) 294 * the way out
297 LONG_S $31, VCPU_R31(k1) 295 */
298 296 mfhi t0
299 /* We need to save hi/lo and restore them on 297 LONG_S t0, VCPU_HI(k1)
300 * the way out 298
301 */ 299 mflo t0
302 mfhi t0 300 LONG_S t0, VCPU_LO(k1)
303 LONG_S t0, VCPU_HI(k1) 301
304 302 /* Finally save guest k0/k1 to VCPU */
305 mflo t0 303 mfc0 t0, CP0_ERROREPC
306 LONG_S t0, VCPU_LO(k1) 304 LONG_S t0, VCPU_R26(k1)
307 305
308 /* Finally save guest k0/k1 to VCPU */ 306 /* Get GUEST k1 and save it in VCPU */
309 mfc0 t0, CP0_ERROREPC 307 PTR_LI t1, ~0x2ff
310 LONG_S t0, VCPU_R26(k1) 308 mfc0 t0, CP0_EBASE
311 309 and t0, t0, t1
312 /* Get GUEST k1 and save it in VCPU */ 310 LONG_L t0, 0x3000(t0)
313 la t1, ~0x2ff 311 LONG_S t0, VCPU_R27(k1)
314 mfc0 t0, CP0_EBASE 312
315 and t0, t0, t1 313 /* Now that context has been saved, we can use other registers */
316 LONG_L t0, 0x3000(t0) 314
317 LONG_S t0, VCPU_R27(k1) 315 /* Restore vcpu */
318 316 mfc0 a1, CP0_DDATA_LO
319 /* Now that context has been saved, we can use other registers */ 317 move s1, a1
320 318
321 /* Restore vcpu */ 319 /* Restore run (vcpu->run) */
322 mfc0 a1, CP0_DDATA_LO 320 LONG_L a0, VCPU_RUN(a1)
323 move s1, a1 321 /* Save pointer to run in s0, will be saved by the compiler */
324 322 move s0, a0
325 /* Restore run (vcpu->run) */ 323
326 LONG_L a0, VCPU_RUN(a1) 324 /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to
327 /* Save pointer to run in s0, will be saved by the compiler */ 325 * process the exception */
328 move s0, a0 326 mfc0 k0,CP0_EPC
329 327 LONG_S k0, VCPU_PC(k1)
330 328
331 /* Save Host level EPC, BadVaddr and Cause to VCPU, useful to process the exception */ 329 mfc0 k0, CP0_BADVADDR
332 mfc0 k0,CP0_EPC 330 LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1)
333 LONG_S k0, VCPU_PC(k1) 331
334 332 mfc0 k0, CP0_CAUSE
335 mfc0 k0, CP0_BADVADDR 333 LONG_S k0, VCPU_HOST_CP0_CAUSE(k1)
336 LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) 334
337 335 mfc0 k0, CP0_ENTRYHI
338 mfc0 k0, CP0_CAUSE 336 LONG_S k0, VCPU_HOST_ENTRYHI(k1)
339 LONG_S k0, VCPU_HOST_CP0_CAUSE(k1) 337
340 338 /* Now restore the host state just enough to run the handlers */
341 mfc0 k0, CP0_ENTRYHI 339
342 LONG_S k0, VCPU_HOST_ENTRYHI(k1) 340 /* Swtich EBASE to the one used by Linux */
343 341 /* load up the host EBASE */
344 /* Now restore the host state just enough to run the handlers */ 342 mfc0 v0, CP0_STATUS
345 343
346 /* Swtich EBASE to the one used by Linux */ 344 .set at
347 /* load up the host EBASE */ 345 or k0, v0, ST0_BEV
348 mfc0 v0, CP0_STATUS 346 .set noat
349 347
350 .set at 348 mtc0 k0, CP0_STATUS
351 or k0, v0, ST0_BEV 349 ehb
352 .set noat 350
353 351 LONG_L k0, VCPU_HOST_EBASE(k1)
354 mtc0 k0, CP0_STATUS 352 mtc0 k0,CP0_EBASE
355 ehb 353
356
357 LONG_L k0, VCPU_HOST_EBASE(k1)
358 mtc0 k0,CP0_EBASE
359
360
361 /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
362 .set at
363 and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
364 or v0, v0, ST0_CU0
365 .set noat
366 mtc0 v0, CP0_STATUS
367 ehb
368
369 /* Load up host GP */
370 LONG_L gp, VCPU_HOST_GP(k1)
371
372 /* Need a stack before we can jump to "C" */
373 LONG_L sp, VCPU_HOST_STACK(k1)
374
375 /* Saved host state */
376 addiu sp,sp, -PT_SIZE
377 354
378 /* XXXKYMA do we need to load the host ASID, maybe not because the 355 /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
379 * kernel entries are marked GLOBAL, need to verify 356 .set at
380 */ 357 and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
358 or v0, v0, ST0_CU0
359 .set noat
360 mtc0 v0, CP0_STATUS
361 ehb
362
363 /* Load up host GP */
364 LONG_L gp, VCPU_HOST_GP(k1)
365
366 /* Need a stack before we can jump to "C" */
367 LONG_L sp, VCPU_HOST_STACK(k1)
368
369 /* Saved host state */
370 INT_ADDIU sp, sp, -PT_SIZE
381 371
382 /* Restore host DDATA_LO */ 372 /* XXXKYMA do we need to load the host ASID, maybe not because the
383 LONG_L k0, PT_HOST_USERLOCAL(sp) 373 * kernel entries are marked GLOBAL, need to verify
384 mtc0 k0, CP0_DDATA_LO 374 */
385 375
386 /* Restore RDHWR access */ 376 /* Restore host DDATA_LO */
387 la k0, 0x2000000F 377 LONG_L k0, PT_HOST_USERLOCAL(sp)
388 mtc0 k0, CP0_HWRENA 378 mtc0 k0, CP0_DDATA_LO
389 379
390 /* Jump to handler */ 380 /* Restore RDHWR access */
381 PTR_LI k0, 0x2000000F
382 mtc0 k0, CP0_HWRENA
383
384 /* Jump to handler */
391FEXPORT(__kvm_mips_jump_to_handler) 385FEXPORT(__kvm_mips_jump_to_handler)
392 /* XXXKYMA: not sure if this is safe, how large is the stack?? */ 386 /* XXXKYMA: not sure if this is safe, how large is the stack??
393 /* Now jump to the kvm_mips_handle_exit() to see if we can deal with this in the kernel */ 387 * Now jump to the kvm_mips_handle_exit() to see if we can deal
394 la t9,kvm_mips_handle_exit 388 * with this in the kernel */
395 jalr.hb t9 389 PTR_LA t9, kvm_mips_handle_exit
396 addiu sp,sp, -CALLFRAME_SIZ /* BD Slot */ 390 jalr.hb t9
397 391 INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */
398 /* Return from handler Make sure interrupts are disabled */ 392
399 di 393 /* Return from handler Make sure interrupts are disabled */
400 ehb 394 di
401 395 ehb
402 /* XXXKYMA: k0/k1 could have been blown away if we processed an exception 396
403 * while we were handling the exception from the guest, reload k1 397 /* XXXKYMA: k0/k1 could have been blown away if we processed
404 */ 398 * an exception while we were handling the exception from the
405 move k1, s1 399 * guest, reload k1
406 addiu k1, k1, VCPU_HOST_ARCH 400 */
407 401
408 /* Check return value, should tell us if we are returning to the host (handle I/O etc) 402 move k1, s1
409 * or resuming the guest 403 INT_ADDIU k1, k1, VCPU_HOST_ARCH
410 */ 404
411 andi t0, v0, RESUME_HOST 405 /* Check return value, should tell us if we are returning to the
412 bnez t0, __kvm_mips_return_to_host 406 * host (handle I/O etc)or resuming the guest
413 nop 407 */
408 andi t0, v0, RESUME_HOST
409 bnez t0, __kvm_mips_return_to_host
410 nop
414 411
415__kvm_mips_return_to_guest: 412__kvm_mips_return_to_guest:
416 /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ 413 /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
417 mtc0 s1, CP0_DDATA_LO 414 mtc0 s1, CP0_DDATA_LO
418
419 /* Load up the Guest EBASE to minimize the window where BEV is set */
420 LONG_L t0, VCPU_GUEST_EBASE(k1)
421
422 /* Switch EBASE back to the one used by KVM */
423 mfc0 v1, CP0_STATUS
424 .set at
425 or k0, v1, ST0_BEV
426 .set noat
427 mtc0 k0, CP0_STATUS
428 ehb
429 mtc0 t0,CP0_EBASE
430
431 /* Setup status register for running guest in UM */
432 .set at
433 or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
434 and v1, v1, ~ST0_CU0
435 .set noat
436 mtc0 v1, CP0_STATUS
437 ehb
438 415
416 /* Load up the Guest EBASE to minimize the window where BEV is set */
417 LONG_L t0, VCPU_GUEST_EBASE(k1)
418
419 /* Switch EBASE back to the one used by KVM */
420 mfc0 v1, CP0_STATUS
421 .set at
422 or k0, v1, ST0_BEV
423 .set noat
424 mtc0 k0, CP0_STATUS
425 ehb
426 mtc0 t0, CP0_EBASE
427
428 /* Setup status register for running guest in UM */
429 .set at
430 or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
431 and v1, v1, ~ST0_CU0
432 .set noat
433 mtc0 v1, CP0_STATUS
434 ehb
439 435
440 /* Set Guest EPC */ 436 /* Set Guest EPC */
441 LONG_L t0, VCPU_PC(k1) 437 LONG_L t0, VCPU_PC(k1)
442 mtc0 t0, CP0_EPC 438 mtc0 t0, CP0_EPC
443 439
444 /* Set the ASID for the Guest Kernel */ 440 /* Set the ASID for the Guest Kernel */
445 sll t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */ 441 INT_SLL t0, t0, 1 /* with kseg0 @ 0x40000000, kernel */
446 /* addresses shift to 0x80000000 */ 442 /* addresses shift to 0x80000000 */
447 bltz t0, 1f /* If kernel */ 443 bltz t0, 1f /* If kernel */
448 addiu t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ 444 INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */
449 addiu t1, k1, VCPU_GUEST_USER_ASID /* else user */ 445 INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */
4501: 4461:
451 /* t1: contains the base of the ASID array, need to get the cpu id */ 447 /* t1: contains the base of the ASID array, need to get the cpu id */
452 LONG_L t2, TI_CPU($28) /* smp_processor_id */ 448 LONG_L t2, TI_CPU($28) /* smp_processor_id */
453 sll t2, t2, 2 /* x4 */ 449 INT_SLL t2, t2, 2 /* x4 */
454 addu t3, t1, t2 450 REG_ADDU t3, t1, t2
455 LONG_L k0, (t3) 451 LONG_L k0, (t3)
456 andi k0, k0, 0xff 452 andi k0, k0, 0xff
457 mtc0 k0,CP0_ENTRYHI 453 mtc0 k0,CP0_ENTRYHI
458 ehb 454 ehb
459 455
460 /* Disable RDHWR access */ 456 /* Disable RDHWR access */
461 mtc0 zero, CP0_HWRENA 457 mtc0 zero, CP0_HWRENA
462 458
463 /* load the guest context from VCPU and return */ 459 /* load the guest context from VCPU and return */
464 LONG_L $0, VCPU_R0(k1) 460 LONG_L $0, VCPU_R0(k1)
465 LONG_L $1, VCPU_R1(k1) 461 LONG_L $1, VCPU_R1(k1)
466 LONG_L $2, VCPU_R2(k1) 462 LONG_L $2, VCPU_R2(k1)
467 LONG_L $3, VCPU_R3(k1) 463 LONG_L $3, VCPU_R3(k1)
468 LONG_L $4, VCPU_R4(k1) 464 LONG_L $4, VCPU_R4(k1)
469 LONG_L $5, VCPU_R5(k1) 465 LONG_L $5, VCPU_R5(k1)
470 LONG_L $6, VCPU_R6(k1) 466 LONG_L $6, VCPU_R6(k1)
471 LONG_L $7, VCPU_R7(k1) 467 LONG_L $7, VCPU_R7(k1)
472 LONG_L $8, VCPU_R8(k1) 468 LONG_L $8, VCPU_R8(k1)
473 LONG_L $9, VCPU_R9(k1) 469 LONG_L $9, VCPU_R9(k1)
474 LONG_L $10, VCPU_R10(k1) 470 LONG_L $10, VCPU_R10(k1)
475 LONG_L $11, VCPU_R11(k1) 471 LONG_L $11, VCPU_R11(k1)
476 LONG_L $12, VCPU_R12(k1) 472 LONG_L $12, VCPU_R12(k1)
477 LONG_L $13, VCPU_R13(k1) 473 LONG_L $13, VCPU_R13(k1)
478 LONG_L $14, VCPU_R14(k1) 474 LONG_L $14, VCPU_R14(k1)
479 LONG_L $15, VCPU_R15(k1) 475 LONG_L $15, VCPU_R15(k1)
480 LONG_L $16, VCPU_R16(k1) 476 LONG_L $16, VCPU_R16(k1)
481 LONG_L $17, VCPU_R17(k1) 477 LONG_L $17, VCPU_R17(k1)
482 LONG_L $18, VCPU_R18(k1) 478 LONG_L $18, VCPU_R18(k1)
483 LONG_L $19, VCPU_R19(k1) 479 LONG_L $19, VCPU_R19(k1)
484 LONG_L $20, VCPU_R20(k1) 480 LONG_L $20, VCPU_R20(k1)
485 LONG_L $21, VCPU_R21(k1) 481 LONG_L $21, VCPU_R21(k1)
486 LONG_L $22, VCPU_R22(k1) 482 LONG_L $22, VCPU_R22(k1)
487 LONG_L $23, VCPU_R23(k1) 483 LONG_L $23, VCPU_R23(k1)
488 LONG_L $24, VCPU_R24(k1) 484 LONG_L $24, VCPU_R24(k1)
489 LONG_L $25, VCPU_R25(k1) 485 LONG_L $25, VCPU_R25(k1)
490 486
491 /* $/k1 loaded later */ 487 /* $/k1 loaded later */
492 LONG_L $28, VCPU_R28(k1) 488 LONG_L $28, VCPU_R28(k1)
493 LONG_L $29, VCPU_R29(k1) 489 LONG_L $29, VCPU_R29(k1)
494 LONG_L $30, VCPU_R30(k1) 490 LONG_L $30, VCPU_R30(k1)
495 LONG_L $31, VCPU_R31(k1) 491 LONG_L $31, VCPU_R31(k1)
496 492
497FEXPORT(__kvm_mips_skip_guest_restore) 493FEXPORT(__kvm_mips_skip_guest_restore)
498 LONG_L k0, VCPU_HI(k1) 494 LONG_L k0, VCPU_HI(k1)
499 mthi k0 495 mthi k0
500 496
501 LONG_L k0, VCPU_LO(k1) 497 LONG_L k0, VCPU_LO(k1)
502 mtlo k0 498 mtlo k0
503 499
504 LONG_L k0, VCPU_R26(k1) 500 LONG_L k0, VCPU_R26(k1)
505 LONG_L k1, VCPU_R27(k1) 501 LONG_L k1, VCPU_R27(k1)
506 502
507 eret 503 eret
508 504
509__kvm_mips_return_to_host: 505__kvm_mips_return_to_host:
510 /* EBASE is already pointing to Linux */ 506 /* EBASE is already pointing to Linux */
511 LONG_L k1, VCPU_HOST_STACK(k1) 507 LONG_L k1, VCPU_HOST_STACK(k1)
512 addiu k1,k1, -PT_SIZE 508 INT_ADDIU k1,k1, -PT_SIZE
513 509
514 /* Restore host DDATA_LO */ 510 /* Restore host DDATA_LO */
515 LONG_L k0, PT_HOST_USERLOCAL(k1) 511 LONG_L k0, PT_HOST_USERLOCAL(k1)
516 mtc0 k0, CP0_DDATA_LO 512 mtc0 k0, CP0_DDATA_LO
517 513
518 /* Restore host ASID */ 514 /* Restore host ASID */
519 LONG_L k0, PT_HOST_ASID(sp) 515 LONG_L k0, PT_HOST_ASID(sp)
520 andi k0, 0xff 516 andi k0, 0xff
521 mtc0 k0,CP0_ENTRYHI 517 mtc0 k0,CP0_ENTRYHI
522 ehb 518 ehb
523 519
524 /* Load context saved on the host stack */ 520 /* Load context saved on the host stack */
525 LONG_L $0, PT_R0(k1) 521 LONG_L $0, PT_R0(k1)
526 LONG_L $1, PT_R1(k1) 522 LONG_L $1, PT_R1(k1)
527 523
528 /* r2/v0 is the return code, shift it down by 2 (arithmetic) to recover the err code */ 524 /* r2/v0 is the return code, shift it down by 2 (arithmetic)
529 sra k0, v0, 2 525 * to recover the err code */
530 move $2, k0 526 INT_SRA k0, v0, 2
531 527 move $2, k0
532 LONG_L $3, PT_R3(k1) 528
533 LONG_L $4, PT_R4(k1) 529 LONG_L $3, PT_R3(k1)
534 LONG_L $5, PT_R5(k1) 530 LONG_L $4, PT_R4(k1)
535 LONG_L $6, PT_R6(k1) 531 LONG_L $5, PT_R5(k1)
536 LONG_L $7, PT_R7(k1) 532 LONG_L $6, PT_R6(k1)
537 LONG_L $8, PT_R8(k1) 533 LONG_L $7, PT_R7(k1)
538 LONG_L $9, PT_R9(k1) 534 LONG_L $8, PT_R8(k1)
539 LONG_L $10, PT_R10(k1) 535 LONG_L $9, PT_R9(k1)
540 LONG_L $11, PT_R11(k1) 536 LONG_L $10, PT_R10(k1)
541 LONG_L $12, PT_R12(k1) 537 LONG_L $11, PT_R11(k1)
542 LONG_L $13, PT_R13(k1) 538 LONG_L $12, PT_R12(k1)
543 LONG_L $14, PT_R14(k1) 539 LONG_L $13, PT_R13(k1)
544 LONG_L $15, PT_R15(k1) 540 LONG_L $14, PT_R14(k1)
545 LONG_L $16, PT_R16(k1) 541 LONG_L $15, PT_R15(k1)
546 LONG_L $17, PT_R17(k1) 542 LONG_L $16, PT_R16(k1)
547 LONG_L $18, PT_R18(k1) 543 LONG_L $17, PT_R17(k1)
548 LONG_L $19, PT_R19(k1) 544 LONG_L $18, PT_R18(k1)
549 LONG_L $20, PT_R20(k1) 545 LONG_L $19, PT_R19(k1)
550 LONG_L $21, PT_R21(k1) 546 LONG_L $20, PT_R20(k1)
551 LONG_L $22, PT_R22(k1) 547 LONG_L $21, PT_R21(k1)
552 LONG_L $23, PT_R23(k1) 548 LONG_L $22, PT_R22(k1)
553 LONG_L $24, PT_R24(k1) 549 LONG_L $23, PT_R23(k1)
554 LONG_L $25, PT_R25(k1) 550 LONG_L $24, PT_R24(k1)
555 551 LONG_L $25, PT_R25(k1)
556 /* Host k0/k1 were not saved */ 552
557 553 /* Host k0/k1 were not saved */
558 LONG_L $28, PT_R28(k1) 554
559 LONG_L $29, PT_R29(k1) 555 LONG_L $28, PT_R28(k1)
560 LONG_L $30, PT_R30(k1) 556 LONG_L $29, PT_R29(k1)
561 557 LONG_L $30, PT_R30(k1)
562 LONG_L k0, PT_HI(k1) 558
563 mthi k0 559 LONG_L k0, PT_HI(k1)
564 560 mthi k0
565 LONG_L k0, PT_LO(k1) 561
566 mtlo k0 562 LONG_L k0, PT_LO(k1)
567 563 mtlo k0
568 /* Restore RDHWR access */ 564
569 la k0, 0x2000000F 565 /* Restore RDHWR access */
570 mtc0 k0, CP0_HWRENA 566 PTR_LI k0, 0x2000000F
571 567 mtc0 k0, CP0_HWRENA
572 568
573 /* Restore RA, which is the address we will return to */ 569
574 LONG_L ra, PT_R31(k1) 570 /* Restore RA, which is the address we will return to */
575 j ra 571 LONG_L ra, PT_R31(k1)
576 nop 572 j ra
577 573 nop
578 .set pop 574
579VECTOR_END(MIPSX(GuestExceptionEnd)) 575VECTOR_END(MIPSX(GuestExceptionEnd))
580.end MIPSX(GuestException) 576.end MIPSX(GuestException)
581 577
@@ -627,24 +623,23 @@ MIPSX(exceptions):
 
 #define HW_SYNCI_Step  $1
 LEAF(MIPSX(SyncICache))
         .set    push
         .set    mips32r2
         beq     a1, zero, 20f
         nop
-        addu    a1, a0, a1
+        REG_ADDU a1, a0, a1
         rdhwr   v0, HW_SYNCI_Step
         beq     v0, zero, 20f
         nop
-
 10:
         synci   0(a0)
-        addu    a0, a0, v0
+        REG_ADDU a0, a0, v0
         sltu    v1, a0, a1
         bne     v1, zero, 10b
         nop
         sync
 20:
         jr.hb   ra
         nop
         .set    pop
 END(MIPSX(SyncICache))
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index dd203e59e6fd..a7b044536de4 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -208,6 +208,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
         return 0;
 }
 
+void kvm_arch_memslots_updated(struct kvm *kvm)
+{
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot,
                                 struct kvm_userspace_memory_region *mem,
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 08891d07aeb6..fa19e2f1a874 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -334,6 +334,27 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
         return r;
 }
 
+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+        ulong pc = kvmppc_get_pc(vcpu) - 4;
+        struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+        u32 r;
+
+        /* Load the instruction manually if it failed to do so in the
+         * exit path */
+        if (svcpu->last_inst == KVM_INST_FETCH_FAILED)
+                kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false);
+
+        r = svcpu->last_inst;
+        svcpu_put(svcpu);
+        return r;
+}
+
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
         struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
@@ -446,6 +467,23 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
         return vcpu->arch.last_inst;
 }
 
+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+        ulong pc = kvmppc_get_pc(vcpu) - 4;
+
+        /* Load the instruction manually if it failed to do so in the
+         * exit path */
+        if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+                kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+        return vcpu->arch.last_inst;
+}
+
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
         return vcpu->arch.fault_dar;
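
A sketch of how the exit path can use this helper: on a system-call interrupt
the guest's SRR0 (and therefore kvmppc_get_pc()) already points past the sc,
so the handler fetches the trapping instruction with kvmppc_get_last_sc()
rather than kvmppc_get_last_inst(). The surrounding function is illustrative,
not the literal book3s_pr.c code:

    /* Illustrative: decide what to do with the sc that just trapped. */
    static int handle_guest_syscall(struct kvm_vcpu *vcpu)
    {
            u32 last_sc = kvmppc_get_last_sc(vcpu); /* instruction at pc - 4 */

            if (last_sc == KVM_INST_FETCH_FAILED)
                    return EMULATE_AGAIN;   /* retry after the page is mapped in */

            /* decode the sc level / hypercall number from last_sc here ... */
            return EMULATE_DONE;
    }
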
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index a1ecb14e4442..86d638a3b359 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -37,7 +37,7 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 #define KVM_DEFAULT_HPT_ORDER  24      /* 16MB HPT by default */
-extern int kvm_hpt_order;              /* order of preallocated HPTs */
+extern unsigned long kvm_rma_pages;
 #endif
 
 #define VRMA_VSID      0x1ffffffUL     /* 1TB VSID reserved for VRMA */
@@ -100,7 +100,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
                         /* (masks depend on page size) */
                         rb |= 0x1000;           /* page encoding in LP field */
                         rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-                        rb |= (va_low & 0xfe);  /* AVAL field (P7 doesn't seem to care) */
+                        rb |= ((va_low << 4) & 0xf0);   /* AVAL field (P7 doesn't seem to care) */
                 }
         } else {
                 /* 4kB page */
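
The one-line change moves different va_low bits into the AVAL byte of rb,
which is easier to see with a concrete value. A tiny self-check using only the
two expressions from the hunk:

    #include <stdio.h>

    int main(void)
    {
            unsigned long va_low = 0x5678;  /* arbitrary example */

            /* old: keep va_low bits 1..7 in place */
            unsigned long old_aval = va_low & 0xfe;         /* = 0x78 */

            /* new: shift va_low bits 0..3 up into rb bits 4..7 */
            unsigned long new_aval = (va_low << 4) & 0xf0;  /* = 0x80 */

            printf("old=%#lx new=%#lx\n", old_aval, new_aval);
            return 0;
    }
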
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index af326cde7cb6..33283532e9d8 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -183,13 +183,9 @@ struct kvmppc_spapr_tce_table {
         struct page *pages[0];
 };
 
-struct kvmppc_linear_info {
-        void            *base_virt;
+struct kvm_rma_info {
+        atomic_t         use_count;
         unsigned long    base_pfn;
-        unsigned long    npages;
-        struct list_head list;
-        atomic_t         use_count;
-        int              type;
 };
 
 /* XICS components, defined in book3s_xics.c */
@@ -246,7 +242,7 @@ struct kvm_arch {
         int tlbie_lock;
         unsigned long lpcr;
         unsigned long rmor;
-        struct kvmppc_linear_info *rma;
+        struct kvm_rma_info *rma;
         unsigned long vrma_slb_v;
         int rma_setup_done;
         int using_mmu_notifiers;
@@ -259,7 +255,7 @@ struct kvm_arch {
         spinlock_t slot_phys_lock;
         cpumask_t need_tlb_flush;
         struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
-        struct kvmppc_linear_info *hpt_li;
+        int hpt_cma_alloc;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #ifdef CONFIG_PPC_BOOK3S_64
         struct list_head spapr_tce_tables;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a5287fe03d77..b15554a26c20 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -137,10 +137,10 @@ extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                                 unsigned long ioba, unsigned long tce);
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
                                 struct kvm_allocate_rma *rma);
-extern struct kvmppc_linear_info *kvm_alloc_rma(void);
-extern void kvm_release_rma(struct kvmppc_linear_info *ri);
-extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
-extern void kvm_release_hpt(struct kvmppc_linear_info *li);
+extern struct kvm_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvm_rma_info *ri);
+extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
+extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
@@ -261,6 +261,7 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 struct openpic;
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
+extern void kvm_cma_reserve(void) __init;
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
         paca[cpu].kvm_hstate.xics_phys = addr;
@@ -281,13 +282,12 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
 }
 
 extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
-extern void kvm_linear_init(void);
 
 #else
-static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+static inline void __init kvm_cma_reserve(void)
 {}
 
-static inline void kvm_linear_init(void)
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {}
 
 static inline u32 kvmppc_get_xics_latch(void)
@@ -394,10 +394,15 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
         }
 }
 
-/* Please call after prepare_to_enter. This function puts the lazy ee state
-   back to normal mode, without actually enabling interrupts. */
-static inline void kvmppc_lazy_ee_enable(void)
+/*
+ * Please call after prepare_to_enter. This function puts the lazy ee and irq
+ * disabled tracking state back to normal mode, without actually enabling
+ * interrupts.
+ */
+static inline void kvmppc_fix_ee_before_entry(void)
 {
+        trace_hardirqs_on();
+
 #ifdef CONFIG_PPC64
         /* Only need to enable IRQs by hard enabling them after this */
         local_paca->irq_happened = 0;
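
The rename also documents the calling convention: kvmppc_prepare_to_enter()
returns with interrupts hard-disabled, and the entry paths then call this
helper so the lazy-EE bookkeeping and irq-trace state match what the guest
will effectively run with. Roughly (an illustrative sequence, not the literal
booke.c or book3s_pr.c code):

    /* Illustrative guest-entry sequence; error handling trimmed. */
    static int enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu)
    {
            int r;

            r = kvmppc_prepare_to_enter(vcpu);      /* hard-disables interrupts */
            if (r <= 0)
                    return r;                       /* pending signal/work: bail out */

            kvmppc_fix_ee_before_entry();           /* fix lazy-EE/irq-trace state */
            return __kvmppc_vcpu_run(run, vcpu);    /* low-level entry */
    }
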
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8207459efe56..d8958be5f31a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -454,6 +454,7 @@ int main(void)
         DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
         DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
 #endif
+        DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
         DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
         DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
         DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 389fb8077cc9..fe6a58c9f0b7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -229,6 +229,8 @@ void __init early_setup(unsigned long dt_ptr)
         /* Initialize the hash table or TLB handling */
         early_init_mmu();
 
+        kvm_cma_reserve();
+
         /*
          * Reserve any gigantic pages requested on the command line.
          * memblock needs to have been initialized by the time this is
@@ -609,8 +611,6 @@ void __init setup_arch(char **cmdline_p)
         /* Initialize the MMU context management stuff */
         mmu_context_init();
 
-        kvm_linear_init();
-
         /* Interrupt code needs to be 64K-aligned */
         if ((unsigned long)_stext & 0xffff)
                 panic("Kernelbase not 64K-aligned (0x%lx)!\n",
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index eb643f862579..ffaef2cb101a 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -72,6 +72,7 @@ config KVM_BOOK3S_64_HV
         bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
         depends on KVM_BOOK3S_64
         select MMU_NOTIFIER
+        select CMA
         ---help---
           Support running unmodified book3s_64 guest kernels in
           virtual machines on POWER7 and PPC970 processors that have
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 008cd856c5b5..6646c952c5e3 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -81,6 +81,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
         book3s_64_vio_hv.o \
         book3s_hv_ras.o \
         book3s_hv_builtin.o \
+        book3s_hv_cma.o \
         $(kvm-book3s_64-builtin-xics-objs-y)
 
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 739bfbadb85e..7e345e00661a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -182,10 +182,13 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
182 hva_t ptegp; 182 hva_t ptegp;
183 u64 pteg[16]; 183 u64 pteg[16];
184 u64 avpn = 0; 184 u64 avpn = 0;
185 u64 v, r;
186 u64 v_val, v_mask;
187 u64 eaddr_mask;
185 int i; 188 int i;
186 u8 key = 0; 189 u8 pp, key = 0;
187 bool found = false; 190 bool found = false;
188 int second = 0; 191 bool second = false;
189 ulong mp_ea = vcpu->arch.magic_page_ea; 192 ulong mp_ea = vcpu->arch.magic_page_ea;
190 193
191 /* Magic page override */ 194 /* Magic page override */
@@ -208,8 +211,16 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
208 goto no_seg_found; 211 goto no_seg_found;
209 212
210 avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); 213 avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
214 v_val = avpn & HPTE_V_AVPN;
215
211 if (slbe->tb) 216 if (slbe->tb)
212 avpn |= SLB_VSID_B_1T; 217 v_val |= SLB_VSID_B_1T;
218 if (slbe->large)
219 v_val |= HPTE_V_LARGE;
220 v_val |= HPTE_V_VALID;
221
222 v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID |
223 HPTE_V_SECONDARY;
213 224
214do_second: 225do_second:
215 ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); 226 ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
@@ -227,91 +238,74 @@ do_second:
227 key = 4; 238 key = 4;
228 239
229 for (i=0; i<16; i+=2) { 240 for (i=0; i<16; i+=2) {
230 u64 v = pteg[i]; 241 /* Check all relevant fields of 1st dword */
231 u64 r = pteg[i+1]; 242 if ((pteg[i] & v_mask) == v_val) {
232
233 /* Valid check */
234 if (!(v & HPTE_V_VALID))
235 continue;
236 /* Hash check */
237 if ((v & HPTE_V_SECONDARY) != second)
238 continue;
239
240 /* AVPN compare */
241 if (HPTE_V_COMPARE(avpn, v)) {
242 u8 pp = (r & HPTE_R_PP) | key;
243 int eaddr_mask = 0xFFF;
244
245 gpte->eaddr = eaddr;
246 gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu,
247 eaddr,
248 data);
249 if (slbe->large)
250 eaddr_mask = 0xFFFFFF;
251 gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask);
252 gpte->may_execute = ((r & HPTE_R_N) ? false : true);
253 gpte->may_read = false;
254 gpte->may_write = false;
255
256 switch (pp) {
257 case 0:
258 case 1:
259 case 2:
260 case 6:
261 gpte->may_write = true;
262 /* fall through */
263 case 3:
264 case 5:
265 case 7:
266 gpte->may_read = true;
267 break;
268 }
269
270 dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
271 "-> 0x%lx\n",
272 eaddr, avpn, gpte->vpage, gpte->raddr);
273 found = true; 243 found = true;
274 break; 244 break;
275 } 245 }
276 } 246 }
277 247
278 /* Update PTE R and C bits, so the guest's swapper knows we used the 248 if (!found) {
279 * page */ 249 if (second)
280 if (found) { 250 goto no_page_found;
281 u32 oldr = pteg[i+1]; 251 v_val |= HPTE_V_SECONDARY;
252 second = true;
253 goto do_second;
254 }
282 255
283 if (gpte->may_read) { 256 v = pteg[i];
284 /* Set the accessed flag */ 257 r = pteg[i+1];
285 pteg[i+1] |= HPTE_R_R; 258 pp = (r & HPTE_R_PP) | key;
286 } 259 eaddr_mask = 0xFFF;
287 if (gpte->may_write) { 260
288 /* Set the dirty flag */ 261 gpte->eaddr = eaddr;
289 pteg[i+1] |= HPTE_R_C; 262 gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
290 } else { 263 if (slbe->large)
291 dprintk("KVM: Mapping read-only page!\n"); 264 eaddr_mask = 0xFFFFFF;
292 } 265 gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask);
266 gpte->may_execute = ((r & HPTE_R_N) ? false : true);
267 gpte->may_read = false;
268 gpte->may_write = false;
269
270 switch (pp) {
271 case 0:
272 case 1:
273 case 2:
274 case 6:
275 gpte->may_write = true;
276 /* fall through */
277 case 3:
278 case 5:
279 case 7:
280 gpte->may_read = true;
281 break;
282 }
293 283
294 /* Write back into the PTEG */ 284 dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
295 if (pteg[i+1] != oldr) 285 "-> 0x%lx\n",
296 copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); 286 eaddr, avpn, gpte->vpage, gpte->raddr);
297 287
298 if (!gpte->may_read) 288 /* Update PTE R and C bits, so the guest's swapper knows we used the
299 return -EPERM; 289 * page */
300 return 0; 290 if (gpte->may_read) {
301 } else { 291 /* Set the accessed flag */
302 dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " 292 r |= HPTE_R_R;
303 "ptegp=0x%lx)\n", 293 }
304 eaddr, to_book3s(vcpu)->sdr1, ptegp); 294 if (data && gpte->may_write) {
305 for (i = 0; i < 16; i += 2) 295 /* Set the dirty flag -- XXX even if not writing */
306 dprintk(" %02d: 0x%llx - 0x%llx (0x%llx)\n", 296 r |= HPTE_R_C;
307 i, pteg[i], pteg[i+1], avpn); 297 }
308 298
309 if (!second) { 299 /* Write back into the PTEG */
310 second = HPTE_V_SECONDARY; 300 if (pteg[i+1] != r) {
311 goto do_second; 301 pteg[i+1] = r;
312 } 302 copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
313 } 303 }
314 304
305 if (!gpte->may_read)
306 return -EPERM;
307 return 0;
308
315no_page_found: 309no_page_found:
316 return -ENOENT; 310 return -ENOENT;
317 311
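
The rewritten translation loop above now locates a matching HPTE first (comparing the whole first doubleword against v_val/v_mask) and only then decodes permissions from the second doubleword; the pp value it switches on is the HPTE PP field OR-ed with the key bit (0 or 4). A minimal, userspace-compilable sketch of just that decode, mirroring the switch in the hunk; the harness around it is illustrative, not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* pp = (r & HPTE_R_PP) | key, exactly as computed in the hunk above */
static void decode_pp(unsigned int pp, bool *may_read, bool *may_write)
{
        *may_read = false;
        *may_write = false;

        switch (pp) {
        case 0:
        case 1:
        case 2:
        case 6:
                *may_write = true;
                /* fall through */
        case 3:
        case 5:
        case 7:
                *may_read = true;
                break;
        }
}

int main(void)
{
        for (unsigned int pp = 0; pp < 8; pp++) {
                bool r, w;

                decode_pp(pp, &r, &w);
                printf("pp=%u read=%d write=%d\n", pp, r, w);
        }
        return 0;
}

pp = 4 matches no case, so it decodes to no access at all, which is why gpte->may_read and gpte->may_write both start out false.
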
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 710d31317d81..043eec8461e7 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -37,6 +37,8 @@
37#include <asm/ppc-opcode.h> 37#include <asm/ppc-opcode.h>
38#include <asm/cputable.h> 38#include <asm/cputable.h>
39 39
40#include "book3s_hv_cma.h"
41
40/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ 42/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
41#define MAX_LPID_970 63 43#define MAX_LPID_970 63
42 44
@@ -52,8 +54,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
52{ 54{
53 unsigned long hpt; 55 unsigned long hpt;
54 struct revmap_entry *rev; 56 struct revmap_entry *rev;
55 struct kvmppc_linear_info *li; 57 struct page *page = NULL;
56 long order = kvm_hpt_order; 58 long order = KVM_DEFAULT_HPT_ORDER;
57 59
58 if (htab_orderp) { 60 if (htab_orderp) {
59 order = *htab_orderp; 61 order = *htab_orderp;
@@ -61,26 +63,23 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
61 order = PPC_MIN_HPT_ORDER; 63 order = PPC_MIN_HPT_ORDER;
62 } 64 }
63 65
66 kvm->arch.hpt_cma_alloc = 0;
64 /* 67 /*
65 * If the user wants a different size from default,
66 * try first to allocate it from the kernel page allocator. 68 * try first to allocate it from the kernel page allocator.
69 * We keep the CMA reserved for failed allocation.
67 */ 70 */
68 hpt = 0; 71 hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT |
69 if (order != kvm_hpt_order) { 72 __GFP_NOWARN, order - PAGE_SHIFT);
70 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
71 __GFP_NOWARN, order - PAGE_SHIFT);
72 if (!hpt)
73 --order;
74 }
75 73
76 /* Next try to allocate from the preallocated pool */ 74 /* Next try to allocate from the preallocated pool */
77 if (!hpt) { 75 if (!hpt) {
78 li = kvm_alloc_hpt(); 76 VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
79 if (li) { 77 page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
80 hpt = (ulong)li->base_virt; 78 if (page) {
81 kvm->arch.hpt_li = li; 79 hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
82 order = kvm_hpt_order; 80 kvm->arch.hpt_cma_alloc = 1;
83 } 81 } else
82 --order;
84 } 83 }
85 84
86 /* Lastly try successively smaller sizes from the page allocator */ 85 /* Lastly try successively smaller sizes from the page allocator */
@@ -118,8 +117,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
118 return 0; 117 return 0;
119 118
120 out_freehpt: 119 out_freehpt:
121 if (kvm->arch.hpt_li) 120 if (kvm->arch.hpt_cma_alloc)
122 kvm_release_hpt(kvm->arch.hpt_li); 121 kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
123 else 122 else
124 free_pages(hpt, order - PAGE_SHIFT); 123 free_pages(hpt, order - PAGE_SHIFT);
125 return -ENOMEM; 124 return -ENOMEM;
@@ -165,8 +164,9 @@ void kvmppc_free_hpt(struct kvm *kvm)
165{ 164{
166 kvmppc_free_lpid(kvm->arch.lpid); 165 kvmppc_free_lpid(kvm->arch.lpid);
167 vfree(kvm->arch.revmap); 166 vfree(kvm->arch.revmap);
168 if (kvm->arch.hpt_li) 167 if (kvm->arch.hpt_cma_alloc)
169 kvm_release_hpt(kvm->arch.hpt_li); 168 kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
169 1 << (kvm->arch.hpt_order - PAGE_SHIFT));
170 else 170 else
171 free_pages(kvm->arch.hpt_virt, 171 free_pages(kvm->arch.hpt_virt,
172 kvm->arch.hpt_order - PAGE_SHIFT); 172 kvm->arch.hpt_order - PAGE_SHIFT);
@@ -1579,7 +1579,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1579 ctx->first_pass = 1; 1579 ctx->first_pass = 1;
1580 1580
1581 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; 1581 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1582 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag); 1582 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
1583 if (ret < 0) { 1583 if (ret < 0) {
1584 kvm_put_kvm(kvm); 1584 kvm_put_kvm(kvm);
1585 return ret; 1585 return ret;
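
The reworked kvmppc_alloc_hpt() above flips the old policy: it always tries the kernel page allocator at the requested order first, keeps the CMA reserve as the fallback, and only then steps down to smaller orders. A simplified, userspace-compilable model of that ordering (my sketch, not the kernel function); try_buddy() and try_cma() stand in for __get_free_pages() and kvm_alloc_hpt(), and MIN_ORDER is a placeholder for PPC_MIN_HPT_ORDER, whose value is not shown in this hunk:

#include <stdbool.h>
#include <stdio.h>

#define MIN_ORDER 18                    /* placeholder for PPC_MIN_HPT_ORDER */

static bool try_buddy(long order)       /* stands in for __get_free_pages() */
{
        return order <= 20;             /* pretend only smaller blocks are free */
}

static bool try_cma(long order)         /* stands in for kvm_alloc_hpt()/CMA */
{
        (void)order;
        return false;                   /* pretend the CMA pool is exhausted */
}

static long alloc_hpt(long order, bool *cma_alloc)
{
        *cma_alloc = false;

        if (try_buddy(order))           /* 1. page allocator, requested order */
                return order;

        if (try_cma(order)) {           /* 2. CMA reserve kept for this case */
                *cma_alloc = true;
                return order;
        }

        /* 3. successively smaller orders from the page allocator */
        for (--order; order >= MIN_ORDER; --order)
                if (try_buddy(order))
                        return order;

        return -1;                      /* -ENOMEM in the kernel */
}

int main(void)
{
        bool cma;
        long order = alloc_hpt(24, &cma);

        printf("got order %ld (%s)\n", order, cma ? "CMA" : "buddy");
        return 0;
}

The hpt_cma_alloc flag the patch adds plays the role of cma_alloc here: the free path has to know whether to hand the pages back to CMA or to free_pages().
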
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index b2d3f3b2de72..54cf9bc94dad 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -136,7 +136,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
136 mutex_unlock(&kvm->lock); 136 mutex_unlock(&kvm->lock);
137 137
138 return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, 138 return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
139 stt, O_RDWR); 139 stt, O_RDWR | O_CLOEXEC);
140 140
141fail: 141fail:
142 if (stt) { 142 if (stt) {
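
The only functional change to the TCE-table ioctl is O_CLOEXEC on the anon inode descriptor, so the fd no longer leaks into children that exec() (the same flag is added to the kvm-htab and kvm-rma fds elsewhere in this series). A small userspace illustration of the property being bought; the fcntl() route below is the after-the-fact alternative that passing O_CLOEXEC at creation time makes unnecessary:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Mark an already-open descriptor close-on-exec. In a threaded program
 * another thread can fork() and exec() between the open and the F_SETFD,
 * which is the window that O_CLOEXEC at creation time closes. */
static int set_cloexec(int fd)
{
        int flags = fcntl(fd, F_GETFD);

        if (flags < 0)
                return -1;
        return fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
}

int main(void)
{
        int fd = open("/dev/null", O_RDWR);     /* stand-in for the anon fd */

        if (fd >= 0 && set_cloexec(fd) == 0)
                printf("fd %d now closes on exec\n", fd);
        if (fd >= 0)
                close(fd);
        return 0;
}
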
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 1f6344c4408d..360ce68c9809 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -458,6 +458,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
458 case SPRN_PMC4_GEKKO: 458 case SPRN_PMC4_GEKKO:
459 case SPRN_WPAR_GEKKO: 459 case SPRN_WPAR_GEKKO:
460 case SPRN_MSSSR0: 460 case SPRN_MSSSR0:
461 case SPRN_DABR:
461 break; 462 break;
462unprivileged: 463unprivileged:
463 default: 464 default:
@@ -555,6 +556,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
555 case SPRN_PMC4_GEKKO: 556 case SPRN_PMC4_GEKKO:
556 case SPRN_WPAR_GEKKO: 557 case SPRN_WPAR_GEKKO:
557 case SPRN_MSSSR0: 558 case SPRN_MSSSR0:
559 case SPRN_DABR:
558 *spr_val = 0; 560 *spr_val = 0;
559 break; 561 break;
560 default: 562 default:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7629cd3eb91a..b0ee3bc9ca76 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -680,13 +680,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
680} 680}
681 681
682int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 682int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
683 struct kvm_sregs *sregs) 683 struct kvm_sregs *sregs)
684{ 684{
685 int i; 685 int i;
686 686
687 sregs->pvr = vcpu->arch.pvr;
688
689 memset(sregs, 0, sizeof(struct kvm_sregs)); 687 memset(sregs, 0, sizeof(struct kvm_sregs));
688 sregs->pvr = vcpu->arch.pvr;
690 for (i = 0; i < vcpu->arch.slb_max; i++) { 689 for (i = 0; i < vcpu->arch.slb_max; i++) {
691 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; 690 sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
692 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; 691 sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
@@ -696,7 +695,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
696} 695}
697 696
698int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 697int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
699 struct kvm_sregs *sregs) 698 struct kvm_sregs *sregs)
700{ 699{
701 int i, j; 700 int i, j;
702 701
@@ -1511,10 +1510,10 @@ static inline int lpcr_rmls(unsigned long rma_size)
1511 1510
1512static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1511static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1513{ 1512{
1514 struct kvmppc_linear_info *ri = vma->vm_file->private_data;
1515 struct page *page; 1513 struct page *page;
1514 struct kvm_rma_info *ri = vma->vm_file->private_data;
1516 1515
1517 if (vmf->pgoff >= ri->npages) 1516 if (vmf->pgoff >= kvm_rma_pages)
1518 return VM_FAULT_SIGBUS; 1517 return VM_FAULT_SIGBUS;
1519 1518
1520 page = pfn_to_page(ri->base_pfn + vmf->pgoff); 1519 page = pfn_to_page(ri->base_pfn + vmf->pgoff);
@@ -1536,7 +1535,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
1536 1535
1537static int kvm_rma_release(struct inode *inode, struct file *filp) 1536static int kvm_rma_release(struct inode *inode, struct file *filp)
1538{ 1537{
1539 struct kvmppc_linear_info *ri = filp->private_data; 1538 struct kvm_rma_info *ri = filp->private_data;
1540 1539
1541 kvm_release_rma(ri); 1540 kvm_release_rma(ri);
1542 return 0; 1541 return 0;
@@ -1549,18 +1548,27 @@ static const struct file_operations kvm_rma_fops = {
1549 1548
1550long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) 1549long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
1551{ 1550{
1552 struct kvmppc_linear_info *ri;
1553 long fd; 1551 long fd;
1552 struct kvm_rma_info *ri;
1553 /*
1554 * Only do this on PPC970 in HV mode
1555 */
1556 if (!cpu_has_feature(CPU_FTR_HVMODE) ||
1557 !cpu_has_feature(CPU_FTR_ARCH_201))
1558 return -EINVAL;
1559
1560 if (!kvm_rma_pages)
1561 return -EINVAL;
1554 1562
1555 ri = kvm_alloc_rma(); 1563 ri = kvm_alloc_rma();
1556 if (!ri) 1564 if (!ri)
1557 return -ENOMEM; 1565 return -ENOMEM;
1558 1566
1559 fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR); 1567 fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC);
1560 if (fd < 0) 1568 if (fd < 0)
1561 kvm_release_rma(ri); 1569 kvm_release_rma(ri);
1562 1570
1563 ret->rma_size = ri->npages << PAGE_SHIFT; 1571 ret->rma_size = kvm_rma_pages << PAGE_SHIFT;
1564 return fd; 1572 return fd;
1565} 1573}
1566 1574
@@ -1725,7 +1733,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1725{ 1733{
1726 int err = 0; 1734 int err = 0;
1727 struct kvm *kvm = vcpu->kvm; 1735 struct kvm *kvm = vcpu->kvm;
1728 struct kvmppc_linear_info *ri = NULL; 1736 struct kvm_rma_info *ri = NULL;
1729 unsigned long hva; 1737 unsigned long hva;
1730 struct kvm_memory_slot *memslot; 1738 struct kvm_memory_slot *memslot;
1731 struct vm_area_struct *vma; 1739 struct vm_area_struct *vma;
@@ -1803,7 +1811,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1803 1811
1804 } else { 1812 } else {
1805 /* Set up to use an RMO region */ 1813 /* Set up to use an RMO region */
1806 rma_size = ri->npages; 1814 rma_size = kvm_rma_pages;
1807 if (rma_size > memslot->npages) 1815 if (rma_size > memslot->npages)
1808 rma_size = memslot->npages; 1816 rma_size = memslot->npages;
1809 rma_size <<= PAGE_SHIFT; 1817 rma_size <<= PAGE_SHIFT;
@@ -1831,14 +1839,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1831 /* POWER7 */ 1839 /* POWER7 */
1832 lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); 1840 lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
1833 lpcr |= rmls << LPCR_RMLS_SH; 1841 lpcr |= rmls << LPCR_RMLS_SH;
1834 kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; 1842 kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT;
1835 } 1843 }
1836 kvm->arch.lpcr = lpcr; 1844 kvm->arch.lpcr = lpcr;
1837 pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", 1845 pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
1838 ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); 1846 ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
1839 1847
1840 /* Initialize phys addrs of pages in RMO */ 1848 /* Initialize phys addrs of pages in RMO */
1841 npages = ri->npages; 1849 npages = kvm_rma_pages;
1842 porder = __ilog2(npages); 1850 porder = __ilog2(npages);
1843 physp = memslot->arch.slot_phys; 1851 physp = memslot->arch.slot_phys;
1844 if (physp) { 1852 if (physp) {
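
With the RMA now carved out of the CMA pool, its size is fixed by kvm_rma_pages, which defaults to (1 << 27) >> PAGE_SHIFT, i.e. 128 MiB regardless of the page size; that is the value kvm_vm_ioctl_allocate_rma() reports back in rma_size above. A quick arithmetic check, assuming 4 KiB pages for the example:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages assumed; ppc64 also supports 64 KiB */

int main(void)
{
        unsigned long kvm_rma_pages = (1UL << 27) >> PAGE_SHIFT;  /* 32768 pages */
        unsigned long rma_size = kvm_rma_pages << PAGE_SHIFT;     /* bytes */

        printf("kvm_rma_pages=%lu rma_size=%lu MiB\n",
               kvm_rma_pages, rma_size >> 20);
        return 0;
}

The kvm_rma_size= boot parameter can still override the default, but early_parse_rma_size() now rejects any value lpcr_rmls() does not recognize instead of silently accepting it.
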
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index ec0a9e5de100..8cd0daebb82d 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -13,33 +13,34 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/memblock.h>
17#include <linux/sizes.h>
16 18
17#include <asm/cputable.h> 19#include <asm/cputable.h>
18#include <asm/kvm_ppc.h> 20#include <asm/kvm_ppc.h>
19#include <asm/kvm_book3s.h> 21#include <asm/kvm_book3s.h>
20 22
21#define KVM_LINEAR_RMA 0 23#include "book3s_hv_cma.h"
22#define KVM_LINEAR_HPT 1 24/*
 23 25 * Hash page table alignment on newer CPUs (CPU_FTR_ARCH_206)
 24static void __init kvm_linear_init_one(ulong size, int count, int type); 26 * should be a power of 2.
25static struct kvmppc_linear_info *kvm_alloc_linear(int type); 27 */
26static void kvm_release_linear(struct kvmppc_linear_info *ri); 28#define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */
27 29/*
28int kvm_hpt_order = KVM_DEFAULT_HPT_ORDER; 30 * By default we reserve 5% of memory for hash pagetable allocation.
29EXPORT_SYMBOL_GPL(kvm_hpt_order); 31 */
30 32static unsigned long kvm_cma_resv_ratio = 5;
31/*************** RMA *************/
32
33/* 33/*
34 * This maintains a list of RMAs (real mode areas) for KVM guests to use. 34 * We allocate RMAs (real mode areas) for KVM guests from the KVM CMA area.
35 * Each RMA has to be physically contiguous and of a size that the 35 * Each RMA has to be physically contiguous and of a size that the
36 * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, 36 * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB,
 37 * and other larger sizes. Since we are unlikely to be able to allocate that 37 * and other larger sizes. Since we are unlikely to be able to allocate that
38 * much physically contiguous memory after the system is up and running, 38 * much physically contiguous memory after the system is up and running,
39 * we preallocate a set of RMAs in early boot for KVM to use. 39 * we preallocate a set of RMAs in early boot using CMA.
40 * should be power of 2.
40 */ 41 */
41static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ 42unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */
42static unsigned long kvm_rma_count; 43EXPORT_SYMBOL_GPL(kvm_rma_pages);
43 44
44/* Work out RMLS (real mode limit selector) field value for a given RMA size. 45/* Work out RMLS (real mode limit selector) field value for a given RMA size.
45 Assumes POWER7 or PPC970. */ 46 Assumes POWER7 or PPC970. */
@@ -69,165 +70,114 @@ static inline int lpcr_rmls(unsigned long rma_size)
69 70
70static int __init early_parse_rma_size(char *p) 71static int __init early_parse_rma_size(char *p)
71{ 72{
72 if (!p) 73 unsigned long kvm_rma_size;
73 return 1;
74 74
75 pr_debug("%s(%s)\n", __func__, p);
76 if (!p)
77 return -EINVAL;
75 kvm_rma_size = memparse(p, &p); 78 kvm_rma_size = memparse(p, &p);
76 79 /*
80 * Check that the requested size is one supported in hardware
81 */
82 if (lpcr_rmls(kvm_rma_size) < 0) {
83 pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
84 return -EINVAL;
85 }
86 kvm_rma_pages = kvm_rma_size >> PAGE_SHIFT;
77 return 0; 87 return 0;
78} 88}
79early_param("kvm_rma_size", early_parse_rma_size); 89early_param("kvm_rma_size", early_parse_rma_size);
80 90
81static int __init early_parse_rma_count(char *p) 91struct kvm_rma_info *kvm_alloc_rma()
82{ 92{
83 if (!p) 93 struct page *page;
84 return 1; 94 struct kvm_rma_info *ri;
85 95
86 kvm_rma_count = simple_strtoul(p, NULL, 0); 96 ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL);
87 97 if (!ri)
88 return 0; 98 return NULL;
89} 99 page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages);
90early_param("kvm_rma_count", early_parse_rma_count); 100 if (!page)
91 101 goto err_out;
92struct kvmppc_linear_info *kvm_alloc_rma(void) 102 atomic_set(&ri->use_count, 1);
93{ 103 ri->base_pfn = page_to_pfn(page);
94 return kvm_alloc_linear(KVM_LINEAR_RMA); 104 return ri;
105err_out:
106 kfree(ri);
107 return NULL;
95} 108}
96EXPORT_SYMBOL_GPL(kvm_alloc_rma); 109EXPORT_SYMBOL_GPL(kvm_alloc_rma);
97 110
98void kvm_release_rma(struct kvmppc_linear_info *ri) 111void kvm_release_rma(struct kvm_rma_info *ri)
99{ 112{
100 kvm_release_linear(ri); 113 if (atomic_dec_and_test(&ri->use_count)) {
114 kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages);
115 kfree(ri);
116 }
101} 117}
102EXPORT_SYMBOL_GPL(kvm_release_rma); 118EXPORT_SYMBOL_GPL(kvm_release_rma);
103 119
104/*************** HPT *************/ 120static int __init early_parse_kvm_cma_resv(char *p)
105
106/*
107 * This maintains a list of big linear HPT tables that contain the GVA->HPA
108 * memory mappings. If we don't reserve those early on, we might not be able
109 * to get a big (usually 16MB) linear memory region from the kernel anymore.
110 */
111
112static unsigned long kvm_hpt_count;
113
114static int __init early_parse_hpt_count(char *p)
115{ 121{
122 pr_debug("%s(%s)\n", __func__, p);
116 if (!p) 123 if (!p)
117 return 1; 124 return -EINVAL;
118 125 return kstrtoul(p, 0, &kvm_cma_resv_ratio);
119 kvm_hpt_count = simple_strtoul(p, NULL, 0);
120
121 return 0;
122} 126}
123early_param("kvm_hpt_count", early_parse_hpt_count); 127early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
124 128
125struct kvmppc_linear_info *kvm_alloc_hpt(void) 129struct page *kvm_alloc_hpt(unsigned long nr_pages)
126{ 130{
127 return kvm_alloc_linear(KVM_LINEAR_HPT); 131 unsigned long align_pages = HPT_ALIGN_PAGES;
132
 133 /* Old CPUs require the HPT to be aligned on a multiple of its size */
134 if (!cpu_has_feature(CPU_FTR_ARCH_206))
135 align_pages = nr_pages;
136 return kvm_alloc_cma(nr_pages, align_pages);
128} 137}
129EXPORT_SYMBOL_GPL(kvm_alloc_hpt); 138EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
130 139
131void kvm_release_hpt(struct kvmppc_linear_info *li) 140void kvm_release_hpt(struct page *page, unsigned long nr_pages)
132{ 141{
133 kvm_release_linear(li); 142 kvm_release_cma(page, nr_pages);
134} 143}
135EXPORT_SYMBOL_GPL(kvm_release_hpt); 144EXPORT_SYMBOL_GPL(kvm_release_hpt);
136 145
137/*************** generic *************/ 146/**
138 147 * kvm_cma_reserve() - reserve area for kvm hash pagetable
139static LIST_HEAD(free_linears); 148 *
 140static DEFINE_SPINLOCK(linear_lock); 149 * This function reserves memory from the early allocator. It should be
141 150 * called by arch specific code once the early allocator (memblock or bootmem)
142static void __init kvm_linear_init_one(ulong size, int count, int type) 151 * has been activated and all other subsystems have already allocated/reserved
143{ 152 * memory.
144 unsigned long i;
145 unsigned long j, npages;
146 void *linear;
147 struct page *pg;
148 const char *typestr;
149 struct kvmppc_linear_info *linear_info;
150
151 if (!count)
152 return;
153
154 typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT";
155
156 npages = size >> PAGE_SHIFT;
157 linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
158 for (i = 0; i < count; ++i) {
159 linear = alloc_bootmem_align(size, size);
160 pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
161 size >> 20);
162 linear_info[i].base_virt = linear;
163 linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
164 linear_info[i].npages = npages;
165 linear_info[i].type = type;
166 list_add_tail(&linear_info[i].list, &free_linears);
167 atomic_set(&linear_info[i].use_count, 0);
168
169 pg = pfn_to_page(linear_info[i].base_pfn);
170 for (j = 0; j < npages; ++j) {
171 atomic_inc(&pg->_count);
172 ++pg;
173 }
174 }
175}
176
177static struct kvmppc_linear_info *kvm_alloc_linear(int type)
178{
179 struct kvmppc_linear_info *ri, *ret;
180
181 ret = NULL;
182 spin_lock(&linear_lock);
183 list_for_each_entry(ri, &free_linears, list) {
184 if (ri->type != type)
185 continue;
186
187 list_del(&ri->list);
188 atomic_inc(&ri->use_count);
189 memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT);
190 ret = ri;
191 break;
192 }
193 spin_unlock(&linear_lock);
194 return ret;
195}
196
197static void kvm_release_linear(struct kvmppc_linear_info *ri)
198{
199 if (atomic_dec_and_test(&ri->use_count)) {
200 spin_lock(&linear_lock);
201 list_add_tail(&ri->list, &free_linears);
202 spin_unlock(&linear_lock);
203
204 }
205}
206
207/*
208 * Called at boot time while the bootmem allocator is active,
209 * to allocate contiguous physical memory for the hash page
210 * tables for guests.
211 */ 153 */
212void __init kvm_linear_init(void) 154void __init kvm_cma_reserve(void)
213{ 155{
214 /* HPT */ 156 unsigned long align_size;
215 kvm_linear_init_one(1 << kvm_hpt_order, kvm_hpt_count, KVM_LINEAR_HPT); 157 struct memblock_region *reg;
216 158 phys_addr_t selected_size = 0;
217 /* RMA */ 159 /*
218 /* Only do this on PPC970 in HV mode */ 160 * We cannot use memblock_phys_mem_size() here, because
219 if (!cpu_has_feature(CPU_FTR_HVMODE) || 161 * memblock_analyze() has not been called yet.
220 !cpu_has_feature(CPU_FTR_ARCH_201)) 162 */
221 return; 163 for_each_memblock(memory, reg)
222 164 selected_size += memblock_region_memory_end_pfn(reg) -
223 if (!kvm_rma_size || !kvm_rma_count) 165 memblock_region_memory_base_pfn(reg);
224 return; 166
225 167 selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT;
226 /* Check that the requested size is one supported in hardware */ 168 if (selected_size) {
227 if (lpcr_rmls(kvm_rma_size) < 0) { 169 pr_debug("%s: reserving %ld MiB for global area\n", __func__,
228 pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); 170 (unsigned long)selected_size / SZ_1M);
229 return; 171 /*
 172 * Old CPUs require the HPT to be aligned on a multiple of its size,
 173 * so for them make the alignment the maximum size we could request.
174 */
175 if (!cpu_has_feature(CPU_FTR_ARCH_206))
176 align_size = __rounddown_pow_of_two(selected_size);
177 else
178 align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
179
180 align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size);
181 kvm_cma_declare_contiguous(selected_size, align_size);
230 } 182 }
231
232 kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA);
233} 183}
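
kvm_cma_reserve() above sizes the boot-time pool as kvm_cma_resv_ratio percent of system memory (5% by default) and picks an alignment: 256 KiB worth of pages on CPU_FTR_ARCH_206 machines, or a power of two of the whole reservation on older CPUs, never smaller than one RMA. A standalone model of just that sizing math (my sketch; the memory size and CPU flag are made-up inputs, and 4 KiB pages are assumed):

#include <stdbool.h>
#include <stdio.h>

#define SZ_1M           (1UL << 20)
#define HPT_ALIGN_BYTES (1UL << 18)     /* HPT_ALIGN_PAGES << PAGE_SHIFT = 256 KiB */
#define RMA_BYTES       (1UL << 27)     /* kvm_rma_pages << PAGE_SHIFT = 128 MiB */

static unsigned long rounddown_pow_of_two(unsigned long x)
{
        unsigned long r = 1;

        while (r <= x / 2)
                r <<= 1;
        return r;
}

int main(void)
{
        unsigned long mem = 64UL << 30;                 /* pretend 64 GiB of RAM */
        unsigned long kvm_cma_resv_ratio = 5;           /* default from the patch */
        unsigned long selected_size = mem * kvm_cma_resv_ratio / 100;
        bool has_arch_206 = true;                       /* POWER7-class CPU */
        unsigned long align_size;

        /* Old CPUs need the HPT aligned to a power of two of its size */
        align_size = has_arch_206 ? HPT_ALIGN_BYTES
                                  : rounddown_pow_of_two(selected_size);

        /* The RMAs come out of the same pool, so never align below one RMA */
        if (align_size < RMA_BYTES)
                align_size = RMA_BYTES;

        printf("reserve %lu MiB of CMA, aligned to %lu MiB\n",
               selected_size / SZ_1M, align_size / SZ_1M);
        return 0;
}

The reservation itself then goes through kvm_cma_declare_contiguous() in the new book3s_hv_cma.c below.
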
diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c
new file mode 100644
index 000000000000..d9d3d8553d51
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_cma.c
@@ -0,0 +1,240 @@
1/*
2 * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA
3 * for DMA mapping framework
4 *
5 * Copyright IBM Corporation, 2013
6 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
 11 * License, or (at your option) any later version of the license.
12 *
13 */
14#define pr_fmt(fmt) "kvm_cma: " fmt
15
16#ifdef CONFIG_CMA_DEBUG
17#ifndef DEBUG
18# define DEBUG
19#endif
20#endif
21
22#include <linux/memblock.h>
23#include <linux/mutex.h>
24#include <linux/sizes.h>
25#include <linux/slab.h>
26
27#include "book3s_hv_cma.h"
28
29struct kvm_cma {
30 unsigned long base_pfn;
31 unsigned long count;
32 unsigned long *bitmap;
33};
34
35static DEFINE_MUTEX(kvm_cma_mutex);
36static struct kvm_cma kvm_cma_area;
37
38/**
39 * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling
40 * for kvm hash pagetable
41 * @size: Size of the reserved memory.
42 * @alignment: Alignment for the contiguous memory area
43 *
 44 * This function reserves memory for the kvm cma area. It should be
 45 * called by arch code while the early allocator (memblock or bootmem)
 46 * is still active.
47 */
48long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment)
49{
50 long base_pfn;
51 phys_addr_t addr;
52 struct kvm_cma *cma = &kvm_cma_area;
53
54 pr_debug("%s(size %lx)\n", __func__, (unsigned long)size);
55
56 if (!size)
57 return -EINVAL;
58 /*
59 * Sanitise input arguments.
60 * We should be pageblock aligned for CMA.
61 */
62 alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order));
63 size = ALIGN(size, alignment);
64 /*
65 * Reserve memory
66 * Use __memblock_alloc_base() since
67 * memblock_alloc_base() panic()s.
68 */
69 addr = __memblock_alloc_base(size, alignment, 0);
70 if (!addr) {
71 base_pfn = -ENOMEM;
72 goto err;
73 } else
74 base_pfn = PFN_DOWN(addr);
75
76 /*
77 * Each reserved area must be initialised later, when more kernel
78 * subsystems (like slab allocator) are available.
79 */
80 cma->base_pfn = base_pfn;
81 cma->count = size >> PAGE_SHIFT;
82 pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M);
83 return 0;
84err:
85 pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
86 return base_pfn;
87}
88
89/**
90 * kvm_alloc_cma() - allocate pages from contiguous area
91 * @nr_pages: Requested number of pages.
92 * @align_pages: Requested alignment in number of pages
93 *
 94 * This function allocates a memory buffer for the hash pagetable.
95 */
96struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages)
97{
98 int ret;
99 struct page *page = NULL;
100 struct kvm_cma *cma = &kvm_cma_area;
101 unsigned long chunk_count, nr_chunk;
102 unsigned long mask, pfn, pageno, start = 0;
103
104
105 if (!cma || !cma->count)
106 return NULL;
107
108 pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__,
109 (void *)cma, nr_pages, align_pages);
110
111 if (!nr_pages)
112 return NULL;
113 /*
 114 * Align the mask to the chunk size; each bitmap bit tracks one chunk of pages.
115 */
116 VM_BUG_ON(!is_power_of_2(align_pages));
117 mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1;
118 BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER);
119
120 chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
121 nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
122
123 mutex_lock(&kvm_cma_mutex);
124 for (;;) {
125 pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count,
126 start, nr_chunk, mask);
127 if (pageno >= chunk_count)
128 break;
129
130 pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT));
131 ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA);
132 if (ret == 0) {
133 bitmap_set(cma->bitmap, pageno, nr_chunk);
134 page = pfn_to_page(pfn);
135 memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT);
136 break;
137 } else if (ret != -EBUSY) {
138 break;
139 }
140 pr_debug("%s(): memory range at %p is busy, retrying\n",
141 __func__, pfn_to_page(pfn));
142 /* try again with a bit different memory target */
143 start = pageno + mask + 1;
144 }
145 mutex_unlock(&kvm_cma_mutex);
146 pr_debug("%s(): returned %p\n", __func__, page);
147 return page;
148}
149
150/**
151 * kvm_release_cma() - release allocated pages for hash pagetable
152 * @pages: Allocated pages.
153 * @nr_pages: Number of allocated pages.
154 *
155 * This function releases memory allocated by kvm_alloc_cma().
 156 * It returns false when the provided pages do not belong to the contiguous
 157 * area, and true otherwise.
158 */
159bool kvm_release_cma(struct page *pages, unsigned long nr_pages)
160{
161 unsigned long pfn;
162 unsigned long nr_chunk;
163 struct kvm_cma *cma = &kvm_cma_area;
164
165 if (!cma || !pages)
166 return false;
167
168 pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages);
169
170 pfn = page_to_pfn(pages);
171
172 if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
173 return false;
174
175 VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count);
176 nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
177
178 mutex_lock(&kvm_cma_mutex);
179 bitmap_clear(cma->bitmap,
180 (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT),
181 nr_chunk);
182 free_contig_range(pfn, nr_pages);
183 mutex_unlock(&kvm_cma_mutex);
184
185 return true;
186}
187
188static int __init kvm_cma_activate_area(unsigned long base_pfn,
189 unsigned long count)
190{
191 unsigned long pfn = base_pfn;
192 unsigned i = count >> pageblock_order;
193 struct zone *zone;
194
195 WARN_ON_ONCE(!pfn_valid(pfn));
196 zone = page_zone(pfn_to_page(pfn));
197 do {
198 unsigned j;
199 base_pfn = pfn;
200 for (j = pageblock_nr_pages; j; --j, pfn++) {
201 WARN_ON_ONCE(!pfn_valid(pfn));
202 /*
203 * alloc_contig_range requires the pfn range
204 * specified to be in the same zone. Make this
205 * simple by forcing the entire CMA resv range
206 * to be in the same zone.
207 */
208 if (page_zone(pfn_to_page(pfn)) != zone)
209 return -EINVAL;
210 }
211 init_cma_reserved_pageblock(pfn_to_page(base_pfn));
212 } while (--i);
213 return 0;
214}
215
216static int __init kvm_cma_init_reserved_areas(void)
217{
218 int bitmap_size, ret;
219 unsigned long chunk_count;
220 struct kvm_cma *cma = &kvm_cma_area;
221
222 pr_debug("%s()\n", __func__);
223 if (!cma->count)
224 return 0;
225 chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
226 bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long);
227 cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
228 if (!cma->bitmap)
229 return -ENOMEM;
230
231 ret = kvm_cma_activate_area(cma->base_pfn, cma->count);
232 if (ret)
233 goto error;
234 return 0;
235
236error:
237 kfree(cma->bitmap);
238 return ret;
239}
240core_initcall(kvm_cma_init_reserved_areas);
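
The allocator in this new file tracks the reserved area in KVM_CMA_CHUNK_ORDER (2^18 = 256 KiB) chunks rather than single pages, so both the request and the alignment are shifted down to chunk units before bitmap_find_next_zero_area() runs. A short arithmetic sketch of that conversion, assuming 4 KiB pages; the 16 MiB request is just an example figure:

#include <stdio.h>

#define PAGE_SHIFT              12      /* 4 KiB pages assumed */
#define KVM_CMA_CHUNK_ORDER     18      /* 256 KiB chunks, from the patch */

int main(void)
{
        unsigned long nr_pages = 1UL << (24 - PAGE_SHIFT);      /* a 16 MiB HPT */
        unsigned long align_pages = nr_pages;                   /* pre-POWER7 case */

        unsigned long nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
        unsigned long mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1;

        printf("%lu pages -> %lu chunks, alignment mask 0x%lx\n",
               nr_pages, nr_chunk, mask);
        return 0;
}

Tracking chunks instead of pages keeps the bitmap 64 times smaller (with 4 KiB pages) while still matching the 256 KiB granularity that both the RMA and HPT sizes are multiples of.
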
diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h
new file mode 100644
index 000000000000..655144f75fa5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_cma.h
@@ -0,0 +1,27 @@
1/*
2 * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA
3 * for DMA mapping framework
4 *
5 * Copyright IBM Corporation, 2013
6 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
 11 * License, or (at your option) any later version of the license.
12 *
13 */
14
15#ifndef __POWERPC_KVM_CMA_ALLOC_H__
16#define __POWERPC_KVM_CMA_ALLOC_H__
17/*
18 * Both RMA and Hash page allocation will be multiple of 256K.
19 */
20#define KVM_CMA_CHUNK_ORDER 18
21
22extern struct page *kvm_alloc_cma(unsigned long nr_pages,
23 unsigned long align_pages);
24extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages);
25extern long kvm_cma_declare_contiguous(phys_addr_t size,
26 phys_addr_t alignment) __init;
27#endif
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fc25689a9f35..45e30d6e462b 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -383,6 +383,80 @@ static inline int try_lock_tlbie(unsigned int *lock)
383 return old == 0; 383 return old == 0;
384} 384}
385 385
386/*
387 * tlbie/tlbiel is a bit different on the PPC970 compared to later
388 * processors such as POWER7; the large page bit is in the instruction
389 * not RB, and the top 16 bits and the bottom 12 bits of the VA
390 * in RB must be 0.
391 */
392static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues,
393 long npages, int global, bool need_sync)
394{
395 long i;
396
397 if (global) {
398 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
399 cpu_relax();
400 if (need_sync)
401 asm volatile("ptesync" : : : "memory");
402 for (i = 0; i < npages; ++i) {
403 unsigned long rb = rbvalues[i];
404
405 if (rb & 1) /* large page */
406 asm volatile("tlbie %0,1" : :
407 "r" (rb & 0x0000fffffffff000ul));
408 else
409 asm volatile("tlbie %0,0" : :
410 "r" (rb & 0x0000fffffffff000ul));
411 }
412 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
413 kvm->arch.tlbie_lock = 0;
414 } else {
415 if (need_sync)
416 asm volatile("ptesync" : : : "memory");
417 for (i = 0; i < npages; ++i) {
418 unsigned long rb = rbvalues[i];
419
420 if (rb & 1) /* large page */
421 asm volatile("tlbiel %0,1" : :
422 "r" (rb & 0x0000fffffffff000ul));
423 else
424 asm volatile("tlbiel %0,0" : :
425 "r" (rb & 0x0000fffffffff000ul));
426 }
427 asm volatile("ptesync" : : : "memory");
428 }
429}
430
431static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
432 long npages, int global, bool need_sync)
433{
434 long i;
435
436 if (cpu_has_feature(CPU_FTR_ARCH_201)) {
437 /* PPC970 tlbie instruction is a bit different */
438 do_tlbies_970(kvm, rbvalues, npages, global, need_sync);
439 return;
440 }
441 if (global) {
442 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
443 cpu_relax();
444 if (need_sync)
445 asm volatile("ptesync" : : : "memory");
446 for (i = 0; i < npages; ++i)
447 asm volatile(PPC_TLBIE(%1,%0) : :
448 "r" (rbvalues[i]), "r" (kvm->arch.lpid));
449 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
450 kvm->arch.tlbie_lock = 0;
451 } else {
452 if (need_sync)
453 asm volatile("ptesync" : : : "memory");
454 for (i = 0; i < npages; ++i)
455 asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
456 asm volatile("ptesync" : : : "memory");
457 }
458}
459
386long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, 460long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
387 unsigned long pte_index, unsigned long avpn, 461 unsigned long pte_index, unsigned long avpn,
388 unsigned long *hpret) 462 unsigned long *hpret)
@@ -408,19 +482,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
408 if (v & HPTE_V_VALID) { 482 if (v & HPTE_V_VALID) {
409 hpte[0] &= ~HPTE_V_VALID; 483 hpte[0] &= ~HPTE_V_VALID;
410 rb = compute_tlbie_rb(v, hpte[1], pte_index); 484 rb = compute_tlbie_rb(v, hpte[1], pte_index);
411 if (global_invalidates(kvm, flags)) { 485 do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
412 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
413 cpu_relax();
414 asm volatile("ptesync" : : : "memory");
415 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
416 : : "r" (rb), "r" (kvm->arch.lpid));
417 asm volatile("ptesync" : : : "memory");
418 kvm->arch.tlbie_lock = 0;
419 } else {
420 asm volatile("ptesync" : : : "memory");
421 asm volatile("tlbiel %0" : : "r" (rb));
422 asm volatile("ptesync" : : : "memory");
423 }
424 /* Read PTE low word after tlbie to get final R/C values */ 486 /* Read PTE low word after tlbie to get final R/C values */
425 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); 487 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
426 } 488 }
@@ -448,12 +510,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
448 unsigned long *hp, *hptes[4], tlbrb[4]; 510 unsigned long *hp, *hptes[4], tlbrb[4];
449 long int i, j, k, n, found, indexes[4]; 511 long int i, j, k, n, found, indexes[4];
450 unsigned long flags, req, pte_index, rcbits; 512 unsigned long flags, req, pte_index, rcbits;
451 long int local = 0; 513 int global;
452 long int ret = H_SUCCESS; 514 long int ret = H_SUCCESS;
453 struct revmap_entry *rev, *revs[4]; 515 struct revmap_entry *rev, *revs[4];
454 516
455 if (atomic_read(&kvm->online_vcpus) == 1) 517 global = global_invalidates(kvm, 0);
456 local = 1;
457 for (i = 0; i < 4 && ret == H_SUCCESS; ) { 518 for (i = 0; i < 4 && ret == H_SUCCESS; ) {
458 n = 0; 519 n = 0;
459 for (; i < 4; ++i) { 520 for (; i < 4; ++i) {
@@ -529,22 +590,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
529 break; 590 break;
530 591
531 /* Now that we've collected a batch, do the tlbies */ 592 /* Now that we've collected a batch, do the tlbies */
532 if (!local) { 593 do_tlbies(kvm, tlbrb, n, global, true);
533 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
534 cpu_relax();
535 asm volatile("ptesync" : : : "memory");
536 for (k = 0; k < n; ++k)
537 asm volatile(PPC_TLBIE(%1,%0) : :
538 "r" (tlbrb[k]),
539 "r" (kvm->arch.lpid));
540 asm volatile("eieio; tlbsync; ptesync" : : : "memory");
541 kvm->arch.tlbie_lock = 0;
542 } else {
543 asm volatile("ptesync" : : : "memory");
544 for (k = 0; k < n; ++k)
545 asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
546 asm volatile("ptesync" : : : "memory");
547 }
548 594
549 /* Read PTE low words after tlbie to get final R/C values */ 595 /* Read PTE low words after tlbie to get final R/C values */
550 for (k = 0; k < n; ++k) { 596 for (k = 0; k < n; ++k) {
@@ -603,19 +649,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
603 if (v & HPTE_V_VALID) { 649 if (v & HPTE_V_VALID) {
604 rb = compute_tlbie_rb(v, r, pte_index); 650 rb = compute_tlbie_rb(v, r, pte_index);
605 hpte[0] = v & ~HPTE_V_VALID; 651 hpte[0] = v & ~HPTE_V_VALID;
606 if (global_invalidates(kvm, flags)) { 652 do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
607 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
608 cpu_relax();
609 asm volatile("ptesync" : : : "memory");
610 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
611 : : "r" (rb), "r" (kvm->arch.lpid));
612 asm volatile("ptesync" : : : "memory");
613 kvm->arch.tlbie_lock = 0;
614 } else {
615 asm volatile("ptesync" : : : "memory");
616 asm volatile("tlbiel %0" : : "r" (rb));
617 asm volatile("ptesync" : : : "memory");
618 }
619 /* 653 /*
620 * If the host has this page as readonly but the guest 654 * If the host has this page as readonly but the guest
621 * wants to make it read/write, reduce the permissions. 655 * wants to make it read/write, reduce the permissions.
@@ -686,13 +720,7 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
686 720
687 hptep[0] &= ~HPTE_V_VALID; 721 hptep[0] &= ~HPTE_V_VALID;
688 rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); 722 rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
689 while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) 723 do_tlbies(kvm, &rb, 1, 1, true);
690 cpu_relax();
691 asm volatile("ptesync" : : : "memory");
692 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
693 : : "r" (rb), "r" (kvm->arch.lpid));
694 asm volatile("ptesync" : : : "memory");
695 kvm->arch.tlbie_lock = 0;
696} 724}
697EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); 725EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
698 726
@@ -706,12 +734,7 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
706 rbyte = (hptep[1] & ~HPTE_R_R) >> 8; 734 rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
707 /* modify only the second-last byte, which contains the ref bit */ 735 /* modify only the second-last byte, which contains the ref bit */
708 *((char *)hptep + 14) = rbyte; 736 *((char *)hptep + 14) = rbyte;
709 while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) 737 do_tlbies(kvm, &rb, 1, 1, false);
710 cpu_relax();
711 asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
712 : : "r" (rb), "r" (kvm->arch.lpid));
713 asm volatile("ptesync" : : : "memory");
714 kvm->arch.tlbie_lock = 0;
715} 738}
716EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); 739EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);
717 740
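
The do_tlbies() consolidation above replaces four open-coded tlbie/tlbiel sequences with one helper that takes the batch of RB values, whether the invalidation must be global (other cores may hold the translation), and whether a leading ptesync is needed to order a preceding HPTE store. The call sites in the hunks show the pattern: single-entry calls pass &rb with npages = 1, kvmppc_h_bulk_remove() batches up to four RB values, and only kvmppc_clear_ref_hpte() passes need_sync = false because it has no prior HPTE update to order.
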
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b02f91e4c70d..60dce5bfab3f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1381,7 +1381,7 @@ hcall_try_real_mode:
1381 cmpldi r3,hcall_real_table_end - hcall_real_table 1381 cmpldi r3,hcall_real_table_end - hcall_real_table
1382 bge guest_exit_cont 1382 bge guest_exit_cont
1383 LOAD_REG_ADDR(r4, hcall_real_table) 1383 LOAD_REG_ADDR(r4, hcall_real_table)
1384 lwzx r3,r3,r4 1384 lwax r3,r3,r4
1385 cmpwi r3,0 1385 cmpwi r3,0
1386 beq guest_exit_cont 1386 beq guest_exit_cont
1387 add r3,r3,r4 1387 add r3,r3,r4
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index 48cbbf862958..17cfae5497a3 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -92,6 +92,11 @@ kvm_start_lightweight:
92 PPC_LL r3, VCPU_HFLAGS(r4) 92 PPC_LL r3, VCPU_HFLAGS(r4)
93 rldicl r3, r3, 0, 63 /* r3 &= 1 */ 93 rldicl r3, r3, 0, 63 /* r3 &= 1 */
94 stb r3, HSTATE_RESTORE_HID5(r13) 94 stb r3, HSTATE_RESTORE_HID5(r13)
95
96 /* Load up guest SPRG3 value, since it's user readable */
97 ld r3, VCPU_SHARED(r4)
98 ld r3, VCPU_SHARED_SPRG3(r3)
99 mtspr SPRN_SPRG3, r3
95#endif /* CONFIG_PPC_BOOK3S_64 */ 100#endif /* CONFIG_PPC_BOOK3S_64 */
96 101
97 PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ 102 PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */
@@ -123,6 +128,15 @@ kvmppc_handler_highmem:
123 /* R7 = vcpu */ 128 /* R7 = vcpu */
124 PPC_LL r7, GPR4(r1) 129 PPC_LL r7, GPR4(r1)
125 130
131#ifdef CONFIG_PPC_BOOK3S_64
132 /*
133 * Reload kernel SPRG3 value.
134 * No need to save guest value as usermode can't modify SPRG3.
135 */
136 ld r3, PACA_SPRG3(r13)
137 mtspr SPRN_SPRG3, r3
138#endif /* CONFIG_PPC_BOOK3S_64 */
139
126 PPC_STL r14, VCPU_GPR(R14)(r7) 140 PPC_STL r14, VCPU_GPR(R14)(r7)
127 PPC_STL r15, VCPU_GPR(R15)(r7) 141 PPC_STL r15, VCPU_GPR(R15)(r7)
128 PPC_STL r16, VCPU_GPR(R16)(r7) 142 PPC_STL r16, VCPU_GPR(R16)(r7)
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index c6e13d9a9e15..27db1e665959 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -468,7 +468,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
468 * both the traditional FP registers and the added VSX 468 * both the traditional FP registers and the added VSX
469 * registers into thread.fpr[]. 469 * registers into thread.fpr[].
470 */ 470 */
471 giveup_fpu(current); 471 if (current->thread.regs->msr & MSR_FP)
472 giveup_fpu(current);
472 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 473 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
473 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; 474 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
474 475
@@ -483,7 +484,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
483 484
484#ifdef CONFIG_ALTIVEC 485#ifdef CONFIG_ALTIVEC
485 if (msr & MSR_VEC) { 486 if (msr & MSR_VEC) {
486 giveup_altivec(current); 487 if (current->thread.regs->msr & MSR_VEC)
488 giveup_altivec(current);
487 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); 489 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
488 vcpu->arch.vscr = t->vscr; 490 vcpu->arch.vscr = t->vscr;
489 } 491 }
@@ -575,8 +577,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
575 printk(KERN_INFO "Loading up ext 0x%lx\n", msr); 577 printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
576#endif 578#endif
577 579
578 current->thread.regs->msr |= msr;
579
580 if (msr & MSR_FP) { 580 if (msr & MSR_FP) {
581 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 581 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
582 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; 582 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
@@ -598,12 +598,32 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
598#endif 598#endif
599 } 599 }
600 600
601 current->thread.regs->msr |= msr;
601 vcpu->arch.guest_owned_ext |= msr; 602 vcpu->arch.guest_owned_ext |= msr;
602 kvmppc_recalc_shadow_msr(vcpu); 603 kvmppc_recalc_shadow_msr(vcpu);
603 604
604 return RESUME_GUEST; 605 return RESUME_GUEST;
605} 606}
606 607
608/*
609 * Kernel code using FP or VMX could have flushed guest state to
610 * the thread_struct; if so, get it back now.
611 */
612static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
613{
614 unsigned long lost_ext;
615
616 lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr;
617 if (!lost_ext)
618 return;
619
620 if (lost_ext & MSR_FP)
621 kvmppc_load_up_fpu();
622 if (lost_ext & MSR_VEC)
623 kvmppc_load_up_altivec();
624 current->thread.regs->msr |= lost_ext;
625}
626
607int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, 627int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
608 unsigned int exit_nr) 628 unsigned int exit_nr)
609{ 629{
@@ -772,7 +792,7 @@ program_interrupt:
772 } 792 }
773 case BOOK3S_INTERRUPT_SYSCALL: 793 case BOOK3S_INTERRUPT_SYSCALL:
774 if (vcpu->arch.papr_enabled && 794 if (vcpu->arch.papr_enabled &&
775 (kvmppc_get_last_inst(vcpu) == 0x44000022) && 795 (kvmppc_get_last_sc(vcpu) == 0x44000022) &&
776 !(vcpu->arch.shared->msr & MSR_PR)) { 796 !(vcpu->arch.shared->msr & MSR_PR)) {
777 /* SC 1 papr hypercalls */ 797 /* SC 1 papr hypercalls */
778 ulong cmd = kvmppc_get_gpr(vcpu, 3); 798 ulong cmd = kvmppc_get_gpr(vcpu, 3);
@@ -890,8 +910,9 @@ program_interrupt:
890 local_irq_enable(); 910 local_irq_enable();
891 r = s; 911 r = s;
892 } else { 912 } else {
893 kvmppc_lazy_ee_enable(); 913 kvmppc_fix_ee_before_entry();
894 } 914 }
915 kvmppc_handle_lost_ext(vcpu);
895 } 916 }
896 917
897 trace_kvm_book3s_reenter(r, vcpu); 918 trace_kvm_book3s_reenter(r, vcpu);
@@ -1162,7 +1183,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1162 if (vcpu->arch.shared->msr & MSR_FP) 1183 if (vcpu->arch.shared->msr & MSR_FP)
1163 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 1184 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
1164 1185
1165 kvmppc_lazy_ee_enable(); 1186 kvmppc_fix_ee_before_entry();
1166 1187
1167 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 1188 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
1168 1189
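
The new kvmppc_handle_lost_ext() closes a hole where host kernel code used FP or VMX between exits, flushing the guest's register state into the thread struct and clearing the MSR bits while guest_owned_ext still said the guest held them. A small sketch of the mask computation it performs; the MSR bit values below are placeholders rather than the real PowerPC layout:

#include <stdio.h>

#define MSR_FP  (1UL << 13)     /* placeholder bit positions */
#define MSR_VEC (1UL << 25)

int main(void)
{
        unsigned long guest_owned_ext = MSR_FP | MSR_VEC;
        unsigned long thread_msr = MSR_VEC;     /* kernel use of FP cleared MSR_FP */
        unsigned long lost_ext = guest_owned_ext & ~thread_msr;

        if (lost_ext & MSR_FP)
                printf("reload FP state (kvmppc_load_up_fpu)\n");
        if (lost_ext & MSR_VEC)
                printf("reload Altivec state (kvmppc_load_up_altivec)\n");
        return 0;
}
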
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 94c1dd46b83d..a3a5cb8ee7ea 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -19,6 +19,7 @@
19#include <asm/hvcall.h> 19#include <asm/hvcall.h>
20#include <asm/xics.h> 20#include <asm/xics.h>
21#include <asm/debug.h> 21#include <asm/debug.h>
22#include <asm/time.h>
22 23
23#include <linux/debugfs.h> 24#include <linux/debugfs.h>
24#include <linux/seq_file.h> 25#include <linux/seq_file.h>
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index dcc94f016007..17722d82f1d1 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -674,8 +674,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
674 goto out; 674 goto out;
675 } 675 }
676 676
677 kvm_guest_enter();
678
679#ifdef CONFIG_PPC_FPU 677#ifdef CONFIG_PPC_FPU
680 /* Save userspace FPU state in stack */ 678 /* Save userspace FPU state in stack */
681 enable_kernel_fp(); 679 enable_kernel_fp();
@@ -698,7 +696,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
698 kvmppc_load_guest_fp(vcpu); 696 kvmppc_load_guest_fp(vcpu);
699#endif 697#endif
700 698
701 kvmppc_lazy_ee_enable(); 699 kvmppc_fix_ee_before_entry();
702 700
703 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 701 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
704 702
@@ -1168,7 +1166,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
1168 local_irq_enable(); 1166 local_irq_enable();
1169 r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); 1167 r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
1170 } else { 1168 } else {
1171 kvmppc_lazy_ee_enable(); 1169 kvmppc_fix_ee_before_entry();
1172 } 1170 }
1173 } 1171 }
1174 1172
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6316ee336e88..f55e14cd1762 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -117,8 +117,6 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
117 kvm_guest_exit(); 117 kvm_guest_exit();
118 continue; 118 continue;
119 } 119 }
120
121 trace_hardirqs_on();
122#endif 120#endif
123 121
124 kvm_guest_enter(); 122 kvm_guest_enter();
@@ -420,6 +418,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
420 return kvmppc_core_create_memslot(slot, npages); 418 return kvmppc_core_create_memslot(slot, npages);
421} 419}
422 420
421void kvm_arch_memslots_updated(struct kvm *kvm)
422{
423}
424
423int kvm_arch_prepare_memory_region(struct kvm *kvm, 425int kvm_arch_prepare_memory_region(struct kvm *kvm,
424 struct kvm_memory_slot *memslot, 426 struct kvm_memory_slot *memslot,
425 struct kvm_userspace_memory_region *mem, 427 struct kvm_userspace_memory_region *mem,
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 3238d4004e84..e87ecaa2c569 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -274,6 +274,14 @@ struct kvm_arch{
274 int css_support; 274 int css_support;
275}; 275};
276 276
277#define KVM_HVA_ERR_BAD (-1UL)
278#define KVM_HVA_ERR_RO_BAD (-2UL)
279
280static inline bool kvm_is_error_hva(unsigned long addr)
281{
282 return IS_ERR_VALUE(addr);
283}
284
277extern int sie64a(struct kvm_s390_sie_block *, u64 *); 285extern int sie64a(struct kvm_s390_sie_block *, u64 *);
278extern char sie_exit; 286extern char sie_exit;
279#endif 287#endif
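
The new s390 kvm_is_error_hva() works because IS_ERR_VALUE() treats anything in the last page of the address space as an error code, so both KVM_HVA_ERR_BAD (-1UL) and KVM_HVA_ERR_RO_BAD (-2UL) are caught by a single comparison. A userspace demo with the macro re-implemented locally (the kernel's definition minus the unlikely() hint):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ERRNO        4095
#define IS_ERR_VALUE(x)  ((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

#define KVM_HVA_ERR_BAD         (-1UL)
#define KVM_HVA_ERR_RO_BAD      (-2UL)

static bool kvm_is_error_hva(unsigned long addr)
{
        return IS_ERR_VALUE(addr);
}

int main(void)
{
        printf("BAD: %d, RO_BAD: %d, ordinary hva: %d\n",
               kvm_is_error_hva(KVM_HVA_ERR_BAD),
               kvm_is_error_hva(KVM_HVA_ERR_RO_BAD),
               kvm_is_error_hva(0x7f0000000000UL));
        return 0;
}
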
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 6340178748bf..ff132ac64ddd 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -12,8 +12,6 @@ typedef struct {
12 unsigned long asce_bits; 12 unsigned long asce_bits;
13 unsigned long asce_limit; 13 unsigned long asce_limit;
14 unsigned long vdso_base; 14 unsigned long vdso_base;
15 /* Cloned contexts will be created with extended page tables. */
16 unsigned int alloc_pgste:1;
17 /* The mmu context has extended page tables. */ 15 /* The mmu context has extended page tables. */
18 unsigned int has_pgste:1; 16 unsigned int has_pgste:1;
19} mm_context_t; 17} mm_context_t;
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 7b7fce4e8469..9f973d8de90e 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -21,24 +21,7 @@ static inline int init_new_context(struct task_struct *tsk,
21#ifdef CONFIG_64BIT 21#ifdef CONFIG_64BIT
22 mm->context.asce_bits |= _ASCE_TYPE_REGION3; 22 mm->context.asce_bits |= _ASCE_TYPE_REGION3;
23#endif 23#endif
24 if (current->mm && current->mm->context.alloc_pgste) { 24 mm->context.has_pgste = 0;
25 /*
26 * alloc_pgste indicates, that any NEW context will be created
27 * with extended page tables. The old context is unchanged. The
28 * page table allocation and the page table operations will
29 * look at has_pgste to distinguish normal and extended page
30 * tables. The only way to create extended page tables is to
31 * set alloc_pgste and then create a new context (e.g. dup_mm).
32 * The page table allocation is called after init_new_context
33 * and if has_pgste is set, it will create extended page
34 * tables.
35 */
36 mm->context.has_pgste = 1;
37 mm->context.alloc_pgste = 1;
38 } else {
39 mm->context.has_pgste = 0;
40 mm->context.alloc_pgste = 0;
41 }
42 mm->context.asce_limit = STACK_TOP_MAX; 25 mm->context.asce_limit = STACK_TOP_MAX;
43 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); 26 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
44 return 0; 27 return 0;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 9f215b40109e..9b60a36c348d 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1442,6 +1442,17 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
1442} 1442}
1443#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */ 1443#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLB_PAGE */
1444 1444
1445static inline void pmdp_flush_lazy(struct mm_struct *mm,
1446 unsigned long address, pmd_t *pmdp)
1447{
1448 int active = (mm == current->active_mm) ? 1 : 0;
1449
1450 if ((atomic_read(&mm->context.attach_count) & 0xffff) > active)
1451 __pmd_idte(address, pmdp);
1452 else
1453 mm->context.flush_mm = 1;
1454}
1455
1445#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1456#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1446 1457
1447#define __HAVE_ARCH_PGTABLE_DEPOSIT 1458#define __HAVE_ARCH_PGTABLE_DEPOSIT
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index b0e6435b2f02..0eb37505cab1 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -43,6 +43,7 @@ extern void execve_tail(void);
43#ifndef CONFIG_64BIT 43#ifndef CONFIG_64BIT
44 44
45#define TASK_SIZE (1UL << 31) 45#define TASK_SIZE (1UL << 31)
46#define TASK_MAX_SIZE (1UL << 31)
46#define TASK_UNMAPPED_BASE (1UL << 30) 47#define TASK_UNMAPPED_BASE (1UL << 30)
47 48
48#else /* CONFIG_64BIT */ 49#else /* CONFIG_64BIT */
@@ -51,6 +52,7 @@ extern void execve_tail(void);
51#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \ 52#define TASK_UNMAPPED_BASE (test_thread_flag(TIF_31BIT) ? \
52 (1UL << 30) : (1UL << 41)) 53 (1UL << 30) : (1UL << 41))
53#define TASK_SIZE TASK_SIZE_OF(current) 54#define TASK_SIZE TASK_SIZE_OF(current)
55#define TASK_MAX_SIZE (1UL << 53)
54 56
55#endif /* CONFIG_64BIT */ 57#endif /* CONFIG_64BIT */
56 58
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 3074475c8ae0..3a74d8af0d69 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -119,12 +119,21 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
119 * The layout is as follows: 119 * The layout is as follows:
120 * - gpr 2 contains the subchannel id (passed as addr) 120 * - gpr 2 contains the subchannel id (passed as addr)
121 * - gpr 3 contains the virtqueue index (passed as datamatch) 121 * - gpr 3 contains the virtqueue index (passed as datamatch)
122 * - gpr 4 contains the index on the bus (optionally)
122 */ 123 */
123 ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS, 124 ret = kvm_io_bus_write_cookie(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
124 vcpu->run->s.regs.gprs[2], 125 vcpu->run->s.regs.gprs[2],
125 8, &vcpu->run->s.regs.gprs[3]); 126 8, &vcpu->run->s.regs.gprs[3],
127 vcpu->run->s.regs.gprs[4]);
126 srcu_read_unlock(&vcpu->kvm->srcu, idx); 128 srcu_read_unlock(&vcpu->kvm->srcu, idx);
127 /* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */ 129
130 /*
131 * Return cookie in gpr 2, but don't overwrite the register if the
132 * diagnose will be handled by userspace.
133 */
134 if (ret != -EOPNOTSUPP)
135 vcpu->run->s.regs.gprs[2] = ret;
136 /* kvm_io_bus_write_cookie returns -EOPNOTSUPP if it found no match. */
128 return ret < 0 ? ret : 0; 137 return ret < 0 ? ret : 0;
129} 138}
130 139
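
The cookie returned in gpr 2 and passed back in gpr 4 lets kvm_io_bus_write_cookie try the previously matched device before searching the notify bus again. A toy sketch of that cached-index lookup, with made-up names rather than the kernel API:

#include <stdio.h>

#define NDEV 4

struct toy_dev { unsigned long addr; const char *name; };

static struct toy_dev bus[NDEV] = {
        { 0x10, "vq-a" }, { 0x20, "vq-b" }, { 0x30, "vq-c" }, { 0x40, "vq-d" },
};

/* Look up a device, trying the caller-supplied cookie (a previous index)
 * before falling back to a linear search.  Returns the index to be used
 * as next time's cookie, or -1 if nothing matched. */
static long bus_notify(unsigned long addr, long cookie)
{
        long i;

        if (cookie >= 0 && cookie < NDEV && bus[cookie].addr == addr) {
                printf("fast path: %s\n", bus[cookie].name);
                return cookie;
        }
        for (i = 0; i < NDEV; i++)
                if (bus[i].addr == addr) {
                        printf("slow path: %s\n", bus[i].name);
                        return i;
                }
        return -1;
}

int main(void)
{
        long cookie = -1;

        cookie = bus_notify(0x30, cookie);      /* slow path, learn cookie 2 */
        cookie = bus_notify(0x30, cookie);      /* fast path */
        return 0;
}
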
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 34c1c9a90be2..776dafe918db 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -28,6 +28,7 @@
28#include <asm/pgtable.h> 28#include <asm/pgtable.h>
29#include <asm/nmi.h> 29#include <asm/nmi.h>
30#include <asm/switch_to.h> 30#include <asm/switch_to.h>
31#include <asm/facility.h>
31#include <asm/sclp.h> 32#include <asm/sclp.h>
32#include "kvm-s390.h" 33#include "kvm-s390.h"
33#include "gaccess.h" 34#include "gaccess.h"
@@ -84,9 +85,15 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
84 { NULL } 85 { NULL }
85}; 86};
86 87
87static unsigned long long *facilities; 88unsigned long *vfacilities;
88static struct gmap_notifier gmap_notifier; 89static struct gmap_notifier gmap_notifier;
89 90
91/* test availability of vfacility */
92static inline int test_vfacility(unsigned long nr)
93{
94 return __test_facility(nr, (void *) vfacilities);
95}
96
90/* Section: not file related */ 97/* Section: not file related */
91int kvm_arch_hardware_enable(void *garbage) 98int kvm_arch_hardware_enable(void *garbage)
92{ 99{
@@ -387,7 +394,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
387 vcpu->arch.sie_block->ecb = 6; 394 vcpu->arch.sie_block->ecb = 6;
388 vcpu->arch.sie_block->ecb2 = 8; 395 vcpu->arch.sie_block->ecb2 = 8;
389 vcpu->arch.sie_block->eca = 0xC1002001U; 396 vcpu->arch.sie_block->eca = 0xC1002001U;
390 vcpu->arch.sie_block->fac = (int) (long) facilities; 397 vcpu->arch.sie_block->fac = (int) (long) vfacilities;
391 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 398 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
392 tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet, 399 tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
393 (unsigned long) vcpu); 400 (unsigned long) vcpu);
@@ -1063,6 +1070,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
1063 return 0; 1070 return 0;
1064} 1071}
1065 1072
1073void kvm_arch_memslots_updated(struct kvm *kvm)
1074{
1075}
1076
1066/* Section: memory related */ 1077/* Section: memory related */
1067int kvm_arch_prepare_memory_region(struct kvm *kvm, 1078int kvm_arch_prepare_memory_region(struct kvm *kvm,
1068 struct kvm_memory_slot *memslot, 1079 struct kvm_memory_slot *memslot,
@@ -1129,20 +1140,20 @@ static int __init kvm_s390_init(void)
1129 * to hold the maximum amount of facilities. On the other hand, we 1140 * to hold the maximum amount of facilities. On the other hand, we
1130 * only set facilities that are known to work in KVM. 1141 * only set facilities that are known to work in KVM.
1131 */ 1142 */
1132 facilities = (unsigned long long *) get_zeroed_page(GFP_KERNEL|GFP_DMA); 1143 vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA);
1133 if (!facilities) { 1144 if (!vfacilities) {
1134 kvm_exit(); 1145 kvm_exit();
1135 return -ENOMEM; 1146 return -ENOMEM;
1136 } 1147 }
1137 memcpy(facilities, S390_lowcore.stfle_fac_list, 16); 1148 memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16);
1138 facilities[0] &= 0xff82fff3f47c0000ULL; 1149 vfacilities[0] &= 0xff82fff3f47c0000UL;
1139 facilities[1] &= 0x001c000000000000ULL; 1150 vfacilities[1] &= 0x001c000000000000UL;
1140 return 0; 1151 return 0;
1141} 1152}
1142 1153
1143static void __exit kvm_s390_exit(void) 1154static void __exit kvm_s390_exit(void)
1144{ 1155{
1145 free_page((unsigned long) facilities); 1156 free_page((unsigned long) vfacilities);
1146 kvm_exit(); 1157 kvm_exit();
1147} 1158}
1148 1159
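
vfacilities is a raw STFLE bit string, so facility n lives at the most significant end of byte n/8, and test_vfacility relies on that numbering. An illustrative stand-alone test using the same bit order (not the kernel's __test_facility):

#include <stdio.h>

/* Facility 0 is bit 0x80 of byte 0; facility 9 is bit 0x40 of byte 1, etc. */
static int fac_test(const unsigned char *fac, unsigned long nr, unsigned long maxbytes)
{
        if (nr >= maxbytes * 8)
                return 0;
        return (fac[nr >> 3] & (0x80 >> (nr & 7))) != 0;
}

int main(void)
{
        /* byte view of the 0xff82fff3f47c0000 mask applied to vfacilities[0] */
        unsigned char fac[8] = { 0xff, 0x82, 0xff, 0xf3, 0xf4, 0x7c, 0x00, 0x00 };

        printf("facility 0:  %d\n", fac_test(fac, 0, sizeof(fac)));   /* 1 */
        printf("facility 9:  %d\n", fac_test(fac, 9, sizeof(fac)));   /* 0 */
        printf("facility 76: %d\n", fac_test(fac, 76, sizeof(fac)));  /* 0: out of range */
        return 0;
}
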
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 028ca9fd2158..dc99f1ca4267 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -24,6 +24,9 @@
24 24
25typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); 25typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
26 26
27/* declare vfacilities extern */
28extern unsigned long *vfacilities;
29
27/* negativ values are error codes, positive values for internal conditions */ 30/* negativ values are error codes, positive values for internal conditions */
28#define SIE_INTERCEPT_RERUNVCPU (1<<0) 31#define SIE_INTERCEPT_RERUNVCPU (1<<0)
29#define SIE_INTERCEPT_UCONTROL (1<<1) 32#define SIE_INTERCEPT_UCONTROL (1<<1)
@@ -112,6 +115,13 @@ static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
112 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; 115 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
113} 116}
114 117
118/* Set the condition code in the guest program status word */
119static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc)
120{
121 vcpu->arch.sie_block->gpsw.mask &= ~(3UL << 44);
122 vcpu->arch.sie_block->gpsw.mask |= cc << 44;
123}
124
115int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); 125int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
116enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); 126enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
117void kvm_s390_tasklet(unsigned long parm); 127void kvm_s390_tasklet(unsigned long parm);
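
kvm_s390_set_psw_cc centralizes the clear-then-set update of the two-bit condition-code field at bit position 44 of the PSW mask; its callers are trusted to pass a value in 0..3. A generic sketch of the same read-modify-write, masking the input defensively:

#include <stdint.h>
#include <stdio.h>

#define CC_SHIFT 44
#define CC_MASK  (3ULL << CC_SHIFT)

/* Clear the two-bit field, then OR in the new condition code. */
static void set_cc(uint64_t *psw_mask, unsigned int cc)
{
        *psw_mask &= ~CC_MASK;
        *psw_mask |= ((uint64_t)cc & 3) << CC_SHIFT;
}

int main(void)
{
        uint64_t mask = 0x0705100180000000ULL;  /* arbitrary example value */

        set_cc(&mask, 3);
        printf("cc=%llu\n", (unsigned long long)((mask & CC_MASK) >> CC_SHIFT));
        set_cc(&mask, 0);
        printf("cc=%llu\n", (unsigned long long)((mask & CC_MASK) >> CC_SHIFT));
        return 0;
}
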
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 4cdc54e63ebc..59200ee275e5 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -164,8 +164,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
164 kfree(inti); 164 kfree(inti);
165no_interrupt: 165no_interrupt:
166 /* Set condition code and we're done. */ 166 /* Set condition code and we're done. */
167 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 167 kvm_s390_set_psw_cc(vcpu, cc);
168 vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
169 return 0; 168 return 0;
170} 169}
171 170
@@ -220,15 +219,13 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
220 * Set condition code 3 to stop the guest from issueing channel 219 * Set condition code 3 to stop the guest from issueing channel
221 * I/O instructions. 220 * I/O instructions.
222 */ 221 */
223 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 222 kvm_s390_set_psw_cc(vcpu, 3);
224 vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
225 return 0; 223 return 0;
226 } 224 }
227} 225}
228 226
229static int handle_stfl(struct kvm_vcpu *vcpu) 227static int handle_stfl(struct kvm_vcpu *vcpu)
230{ 228{
231 unsigned int facility_list;
232 int rc; 229 int rc;
233 230
234 vcpu->stat.instruction_stfl++; 231 vcpu->stat.instruction_stfl++;
@@ -236,15 +233,13 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
236 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 233 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
237 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 234 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
238 235
239 /* only pass the facility bits, which we can handle */
240 facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
241
242 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), 236 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
243 &facility_list, sizeof(facility_list)); 237 vfacilities, 4);
244 if (rc) 238 if (rc)
245 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 239 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
246 VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list); 240 VCPU_EVENT(vcpu, 5, "store facility list value %x",
247 trace_kvm_s390_handle_stfl(vcpu, facility_list); 241 *(unsigned int *) vfacilities);
242 trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities);
248 return 0; 243 return 0;
249} 244}
250 245
@@ -387,7 +382,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
387 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 382 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
388 383
389 if (fc > 3) { 384 if (fc > 3) {
390 vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; /* cc 3 */ 385 kvm_s390_set_psw_cc(vcpu, 3);
391 return 0; 386 return 0;
392 } 387 }
393 388
@@ -397,7 +392,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
397 392
398 if (fc == 0) { 393 if (fc == 0) {
399 vcpu->run->s.regs.gprs[0] = 3 << 28; 394 vcpu->run->s.regs.gprs[0] = 3 << 28;
400 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); /* cc 0 */ 395 kvm_s390_set_psw_cc(vcpu, 0);
401 return 0; 396 return 0;
402 } 397 }
403 398
@@ -431,12 +426,11 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
431 } 426 }
432 trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); 427 trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
433 free_page(mem); 428 free_page(mem);
434 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 429 kvm_s390_set_psw_cc(vcpu, 0);
435 vcpu->run->s.regs.gprs[0] = 0; 430 vcpu->run->s.regs.gprs[0] = 0;
436 return 0; 431 return 0;
437out_no_data: 432out_no_data:
438 /* condition code 3 */ 433 kvm_s390_set_psw_cc(vcpu, 3);
439 vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
440out_exception: 434out_exception:
441 free_page(mem); 435 free_page(mem);
442 return rc; 436 return rc;
@@ -494,12 +488,12 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
494 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2); 488 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
495 489
496 /* This basically extracts the mask half of the psw. */ 490 /* This basically extracts the mask half of the psw. */
497 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000; 491 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000UL;
498 vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32; 492 vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
499 if (reg2) { 493 if (reg2) {
500 vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000; 494 vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000UL;
501 vcpu->run->s.regs.gprs[reg2] |= 495 vcpu->run->s.regs.gprs[reg2] |=
502 vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff; 496 vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffffUL;
503 } 497 }
504 return 0; 498 return 0;
505} 499}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 6d16132d0850..bf7c0dc64a76 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -335,7 +335,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
335 335
336 if ((from | to | len) & (PMD_SIZE - 1)) 336 if ((from | to | len) & (PMD_SIZE - 1))
337 return -EINVAL; 337 return -EINVAL;
338 if (len == 0 || from + len > PGDIR_SIZE || 338 if (len == 0 || from + len > TASK_MAX_SIZE ||
339 from + len < from || to + len < to) 339 from + len < from || to + len < to)
340 return -EINVAL; 340 return -EINVAL;
341 341
@@ -732,6 +732,11 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
732 spin_unlock(&gmap_notifier_lock); 732 spin_unlock(&gmap_notifier_lock);
733} 733}
734 734
735static inline int page_table_with_pgste(struct page *page)
736{
737 return atomic_read(&page->_mapcount) == 0;
738}
739
735static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, 740static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
736 unsigned long vmaddr) 741 unsigned long vmaddr)
737{ 742{
@@ -751,7 +756,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
751 mp->vmaddr = vmaddr & PMD_MASK; 756 mp->vmaddr = vmaddr & PMD_MASK;
752 INIT_LIST_HEAD(&mp->mapper); 757 INIT_LIST_HEAD(&mp->mapper);
753 page->index = (unsigned long) mp; 758 page->index = (unsigned long) mp;
754 atomic_set(&page->_mapcount, 3); 759 atomic_set(&page->_mapcount, 0);
755 table = (unsigned long *) page_to_phys(page); 760 table = (unsigned long *) page_to_phys(page);
756 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); 761 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
757 clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT, 762 clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
@@ -818,6 +823,11 @@ EXPORT_SYMBOL(set_guest_storage_key);
818 823
819#else /* CONFIG_PGSTE */ 824#else /* CONFIG_PGSTE */
820 825
826static inline int page_table_with_pgste(struct page *page)
827{
828 return 0;
829}
830
821static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm, 831static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
822 unsigned long vmaddr) 832 unsigned long vmaddr)
823{ 833{
@@ -894,12 +904,12 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
894 struct page *page; 904 struct page *page;
895 unsigned int bit, mask; 905 unsigned int bit, mask;
896 906
897 if (mm_has_pgste(mm)) { 907 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
908 if (page_table_with_pgste(page)) {
898 gmap_disconnect_pgtable(mm, table); 909 gmap_disconnect_pgtable(mm, table);
899 return page_table_free_pgste(table); 910 return page_table_free_pgste(table);
900 } 911 }
901 /* Free 1K/2K page table fragment of a 4K page */ 912 /* Free 1K/2K page table fragment of a 4K page */
902 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
903 bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); 913 bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
904 spin_lock_bh(&mm->context.list_lock); 914 spin_lock_bh(&mm->context.list_lock);
905 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) 915 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
@@ -937,14 +947,14 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
937 unsigned int bit, mask; 947 unsigned int bit, mask;
938 948
939 mm = tlb->mm; 949 mm = tlb->mm;
940 if (mm_has_pgste(mm)) { 950 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
951 if (page_table_with_pgste(page)) {
941 gmap_disconnect_pgtable(mm, table); 952 gmap_disconnect_pgtable(mm, table);
942 table = (unsigned long *) (__pa(table) | FRAG_MASK); 953 table = (unsigned long *) (__pa(table) | FRAG_MASK);
943 tlb_remove_table(tlb, table); 954 tlb_remove_table(tlb, table);
944 return; 955 return;
945 } 956 }
946 bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t))); 957 bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
947 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
948 spin_lock_bh(&mm->context.list_lock); 958 spin_lock_bh(&mm->context.list_lock);
949 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) 959 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
950 list_del(&page->lru); 960 list_del(&page->lru);
@@ -1030,36 +1040,120 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
1030} 1040}
1031 1041
1032#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1042#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1033void thp_split_vma(struct vm_area_struct *vma) 1043static inline void thp_split_vma(struct vm_area_struct *vma)
1034{ 1044{
1035 unsigned long addr; 1045 unsigned long addr;
1036 struct page *page;
1037 1046
1038 for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { 1047 for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
1039 page = follow_page(vma, addr, FOLL_SPLIT); 1048 follow_page(vma, addr, FOLL_SPLIT);
1040 }
1041} 1049}
1042 1050
1043void thp_split_mm(struct mm_struct *mm) 1051static inline void thp_split_mm(struct mm_struct *mm)
1044{ 1052{
1045 struct vm_area_struct *vma = mm->mmap; 1053 struct vm_area_struct *vma;
1046 1054
1047 while (vma != NULL) { 1055 for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
1048 thp_split_vma(vma); 1056 thp_split_vma(vma);
1049 vma->vm_flags &= ~VM_HUGEPAGE; 1057 vma->vm_flags &= ~VM_HUGEPAGE;
1050 vma->vm_flags |= VM_NOHUGEPAGE; 1058 vma->vm_flags |= VM_NOHUGEPAGE;
1051 vma = vma->vm_next;
1052 } 1059 }
1060 mm->def_flags |= VM_NOHUGEPAGE;
1061}
1062#else
1063static inline void thp_split_mm(struct mm_struct *mm)
1064{
1053} 1065}
1054#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1066#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1055 1067
1068static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
1069 struct mm_struct *mm, pud_t *pud,
1070 unsigned long addr, unsigned long end)
1071{
1072 unsigned long next, *table, *new;
1073 struct page *page;
1074 pmd_t *pmd;
1075
1076 pmd = pmd_offset(pud, addr);
1077 do {
1078 next = pmd_addr_end(addr, end);
1079again:
1080 if (pmd_none_or_clear_bad(pmd))
1081 continue;
1082 table = (unsigned long *) pmd_deref(*pmd);
1083 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1084 if (page_table_with_pgste(page))
1085 continue;
1086 /* Allocate new page table with pgstes */
1087 new = page_table_alloc_pgste(mm, addr);
1088 if (!new) {
1089 mm->context.has_pgste = 0;
1090 continue;
1091 }
1092 spin_lock(&mm->page_table_lock);
1093 if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
1094 /* Nuke pmd entry pointing to the "short" page table */
1095 pmdp_flush_lazy(mm, addr, pmd);
1096 pmd_clear(pmd);
1097 /* Copy ptes from old table to new table */
1098 memcpy(new, table, PAGE_SIZE/2);
1099 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
1100 /* Establish new table */
1101 pmd_populate(mm, pmd, (pte_t *) new);
1102 /* Free old table with rcu, there might be a walker! */
1103 page_table_free_rcu(tlb, table);
1104 new = NULL;
1105 }
1106 spin_unlock(&mm->page_table_lock);
1107 if (new) {
1108 page_table_free_pgste(new);
1109 goto again;
1110 }
1111 } while (pmd++, addr = next, addr != end);
1112
1113 return addr;
1114}
1115
1116static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
1117 struct mm_struct *mm, pgd_t *pgd,
1118 unsigned long addr, unsigned long end)
1119{
1120 unsigned long next;
1121 pud_t *pud;
1122
1123 pud = pud_offset(pgd, addr);
1124 do {
1125 next = pud_addr_end(addr, end);
1126 if (pud_none_or_clear_bad(pud))
1127 continue;
1128 next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
1129 } while (pud++, addr = next, addr != end);
1130
1131 return addr;
1132}
1133
1134static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
1135 unsigned long addr, unsigned long end)
1136{
1137 unsigned long next;
1138 pgd_t *pgd;
1139
1140 pgd = pgd_offset(mm, addr);
1141 do {
1142 next = pgd_addr_end(addr, end);
1143 if (pgd_none_or_clear_bad(pgd))
1144 continue;
1145 next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
1146 } while (pgd++, addr = next, addr != end);
1147}
1148
1056/* 1149/*
1057 * switch on pgstes for its userspace process (for kvm) 1150 * switch on pgstes for its userspace process (for kvm)
1058 */ 1151 */
1059int s390_enable_sie(void) 1152int s390_enable_sie(void)
1060{ 1153{
1061 struct task_struct *tsk = current; 1154 struct task_struct *tsk = current;
1062 struct mm_struct *mm, *old_mm; 1155 struct mm_struct *mm = tsk->mm;
1156 struct mmu_gather tlb;
1063 1157
1064 /* Do we have switched amode? If no, we cannot do sie */ 1158 /* Do we have switched amode? If no, we cannot do sie */
1065 if (s390_user_mode == HOME_SPACE_MODE) 1159 if (s390_user_mode == HOME_SPACE_MODE)
@@ -1069,57 +1163,16 @@ int s390_enable_sie(void)
1069 if (mm_has_pgste(tsk->mm)) 1163 if (mm_has_pgste(tsk->mm))
1070 return 0; 1164 return 0;
1071 1165
1072 /* lets check if we are allowed to replace the mm */ 1166 down_write(&mm->mmap_sem);
1073 task_lock(tsk);
1074 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
1075#ifdef CONFIG_AIO
1076 !hlist_empty(&tsk->mm->ioctx_list) ||
1077#endif
1078 tsk->mm != tsk->active_mm) {
1079 task_unlock(tsk);
1080 return -EINVAL;
1081 }
1082 task_unlock(tsk);
1083
1084 /* we copy the mm and let dup_mm create the page tables with_pgstes */
1085 tsk->mm->context.alloc_pgste = 1;
1086 /* make sure that both mms have a correct rss state */
1087 sync_mm_rss(tsk->mm);
1088 mm = dup_mm(tsk);
1089 tsk->mm->context.alloc_pgste = 0;
1090 if (!mm)
1091 return -ENOMEM;
1092
1093#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1094 /* split thp mappings and disable thp for future mappings */ 1167 /* split thp mappings and disable thp for future mappings */
1095 thp_split_mm(mm); 1168 thp_split_mm(mm);
1096 mm->def_flags |= VM_NOHUGEPAGE; 1169 /* Reallocate the page tables with pgstes */
1097#endif 1170 mm->context.has_pgste = 1;
1098 1171 tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
1099 /* Now lets check again if something happened */ 1172 page_table_realloc(&tlb, mm, 0, TASK_SIZE);
1100 task_lock(tsk); 1173 tlb_finish_mmu(&tlb, 0, TASK_SIZE);
1101 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || 1174 up_write(&mm->mmap_sem);
1102#ifdef CONFIG_AIO 1175 return mm->context.has_pgste ? 0 : -ENOMEM;
1103 !hlist_empty(&tsk->mm->ioctx_list) ||
1104#endif
1105 tsk->mm != tsk->active_mm) {
1106 mmput(mm);
1107 task_unlock(tsk);
1108 return -EINVAL;
1109 }
1110
1111 /* ok, we are alone. No ptrace, no threads, etc. */
1112 old_mm = tsk->mm;
1113 tsk->mm = tsk->active_mm = mm;
1114 preempt_disable();
1115 update_mm(mm, tsk);
1116 atomic_inc(&mm->context.attach_count);
1117 atomic_dec(&old_mm->context.attach_count);
1118 cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
1119 preempt_enable();
1120 task_unlock(tsk);
1121 mmput(old_mm);
1122 return 0;
1123} 1176}
1124EXPORT_SYMBOL_GPL(s390_enable_sie); 1177EXPORT_SYMBOL_GPL(s390_enable_sie);
1125 1178
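
page_table_realloc_pmd allocates the pgste-capable table outside the lock, revalidates the pmd under page_table_lock, and either publishes the new table or frees it and retries. A minimal pthreads sketch of that optimistic publish-or-retry shape (toy data, not the kernel walker):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *slot;               /* stands in for the pmd entry */

/* Build the replacement unlocked, publish it only if the slot still holds
 * what it was based on, otherwise discard the work and try again. */
static void upgrade_slot(void)
{
        int *expected, *fresh;

again:
        expected = slot;                        /* snapshot */
        fresh = malloc(sizeof(*fresh));         /* may sleep, so done unlocked */
        *fresh = expected ? *expected + 100 : 100;

        pthread_mutex_lock(&lock);
        if (slot == expected) {                 /* nothing changed underneath us */
                slot = fresh;
                fresh = NULL;
        }
        pthread_mutex_unlock(&lock);

        if (fresh) {                            /* lost the race: clean up, retry */
                free(fresh);
                goto again;
        }
}

int main(void)
{
        upgrade_slot();
        printf("slot value: %d\n", *slot);
        free(slot);
        return 0;
}
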
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f87f7fcefa0a..c76ff74a98f2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
286 u64 *pae_root; 286 u64 *pae_root;
287 u64 *lm_root; 287 u64 *lm_root;
288 u64 rsvd_bits_mask[2][4]; 288 u64 rsvd_bits_mask[2][4];
289 u64 bad_mt_xwr;
289 290
290 /* 291 /*
291 * Bitmap: bit set = last pte in walk 292 * Bitmap: bit set = last pte in walk
@@ -323,6 +324,7 @@ struct kvm_pmu {
323 u64 global_ovf_ctrl; 324 u64 global_ovf_ctrl;
324 u64 counter_bitmask[2]; 325 u64 counter_bitmask[2];
325 u64 global_ctrl_mask; 326 u64 global_ctrl_mask;
327 u64 reserved_bits;
326 u8 version; 328 u8 version;
327 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; 329 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
328 struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; 330 struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
@@ -511,6 +513,14 @@ struct kvm_vcpu_arch {
511 * instruction. 513 * instruction.
512 */ 514 */
513 bool write_fault_to_shadow_pgtable; 515 bool write_fault_to_shadow_pgtable;
516
517 /* set at EPT violation at this point */
518 unsigned long exit_qualification;
519
520 /* pv related host specific info */
521 struct {
522 bool pv_unhalted;
523 } pv;
514}; 524};
515 525
516struct kvm_lpage_info { 526struct kvm_lpage_info {
@@ -802,8 +812,8 @@ extern u32 kvm_min_guest_tsc_khz;
802extern u32 kvm_max_guest_tsc_khz; 812extern u32 kvm_max_guest_tsc_khz;
803 813
804enum emulation_result { 814enum emulation_result {
805 EMULATE_DONE, /* no further processing */ 815 EMULATE_DONE, /* no further processing */
806 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ 816 EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */
807 EMULATE_FAIL, /* can't emulate this instruction */ 817 EMULATE_FAIL, /* can't emulate this instruction */
808}; 818};
809 819
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 109a9dd5d454..be8269b00e2a 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
93 93
94struct pvclock_vsyscall_time_info { 94struct pvclock_vsyscall_time_info {
95 struct pvclock_vcpu_time_info pvti; 95 struct pvclock_vcpu_time_info pvti;
96 u32 migrate_count;
97} __attribute__((__aligned__(SMP_CACHE_BYTES))); 96} __attribute__((__aligned__(SMP_CACHE_BYTES)));
98 97
99#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) 98#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2cbaa1..966502d4682e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
387#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 387#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
388#define VMX_EPT_EXTENT_CONTEXT 1 388#define VMX_EPT_EXTENT_CONTEXT 1
389#define VMX_EPT_EXTENT_GLOBAL 2 389#define VMX_EPT_EXTENT_GLOBAL 2
390#define VMX_EPT_EXTENT_SHIFT 24
390 391
391#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) 392#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
392#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) 393#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
@@ -394,6 +395,7 @@ enum vmcs_field {
394#define VMX_EPTP_WB_BIT (1ull << 14) 395#define VMX_EPTP_WB_BIT (1ull << 14)
395#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 396#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
396#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 397#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
398#define VMX_EPT_INVEPT_BIT (1ull << 20)
397#define VMX_EPT_AD_BIT (1ull << 21) 399#define VMX_EPT_AD_BIT (1ull << 21)
398#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 400#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
399#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 401#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082c7cf7..0e79420376eb 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
65#define EXIT_REASON_EOI_INDUCED 45 65#define EXIT_REASON_EOI_INDUCED 45
66#define EXIT_REASON_EPT_VIOLATION 48 66#define EXIT_REASON_EPT_VIOLATION 48
67#define EXIT_REASON_EPT_MISCONFIG 49 67#define EXIT_REASON_EPT_MISCONFIG 49
68#define EXIT_REASON_INVEPT 50
68#define EXIT_REASON_PREEMPTION_TIMER 52 69#define EXIT_REASON_PREEMPTION_TIMER 52
69#define EXIT_REASON_WBINVD 54 70#define EXIT_REASON_WBINVD 54
70#define EXIT_REASON_XSETBV 55 71#define EXIT_REASON_XSETBV 55
@@ -106,12 +107,13 @@
106 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ 107 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
107 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ 108 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
108 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ 109 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
110 { EXIT_REASON_INVEPT, "INVEPT" }, \
111 { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
109 { EXIT_REASON_WBINVD, "WBINVD" }, \ 112 { EXIT_REASON_WBINVD, "WBINVD" }, \
110 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ 113 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
111 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ 114 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
112 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ 115 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
113 { EXIT_REASON_INVD, "INVD" }, \ 116 { EXIT_REASON_INVD, "INVD" }, \
114 { EXIT_REASON_INVPCID, "INVPCID" }, \ 117 { EXIT_REASON_INVPCID, "INVPCID" }
115 { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }
116 118
117#endif /* _UAPIVMX_H */ 119#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2cb9470ea85b..a16bae3f83b3 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
129} 129}
130 130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64 131#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/* 132/*
172 * Initialize the generic pvclock vsyscall state. This will allocate 133 * Initialize the generic pvclock vsyscall state. This will allocate
173 * a/some page(s) for the per-vcpu pvclock information, set up a 134 * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
181 142
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 143 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183 144
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 145 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 146 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa(i) + (idx*PAGE_SIZE), 147 __pa(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR); 148 PAGE_KERNEL_VVAR);
190 } 149 }
191 150
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0; 151 return 0;
196} 152}
197#endif 153#endif
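
With the migration notifier gone, vsyscall readers depend on the usual pvclock version protocol: the producer makes the version odd while it rewrites the record and even again afterwards, and readers retry on any change. A simplified reader loop under that assumption (real readers also need memory barriers):

#include <stdint.h>
#include <stdio.h>

/* Simplified shape of a pvclock-style time record. */
struct toy_time_info {
        volatile uint32_t version;
        volatile uint64_t system_time;
};

static uint64_t read_time(const struct toy_time_info *ti)
{
        uint32_t v;
        uint64_t t;

        for (;;) {
                v = ti->version;
                if (v & 1)
                        continue;               /* writer is mid-update */
                t = ti->system_time;            /* snapshot the payload */
                if (ti->version == v)
                        break;                  /* nothing changed: snapshot is good */
        }
        return t;
}

int main(void)
{
        struct toy_time_info ti = { .version = 2, .system_time = 123456789 };

        printf("time: %llu\n", (unsigned long long)read_time(&ti));
        return 0;
}
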
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index a20ecb5b6cbf..b110fe6c03d4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -413,7 +413,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
413 (1 << KVM_FEATURE_CLOCKSOURCE2) | 413 (1 << KVM_FEATURE_CLOCKSOURCE2) |
414 (1 << KVM_FEATURE_ASYNC_PF) | 414 (1 << KVM_FEATURE_ASYNC_PF) |
415 (1 << KVM_FEATURE_PV_EOI) | 415 (1 << KVM_FEATURE_PV_EOI) |
416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 416 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
417 (1 << KVM_FEATURE_PV_UNHALT);
417 418
418 if (sched_info_on()) 419 if (sched_info_on())
419 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); 420 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
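
A guest discovers KVM_FEATURE_PV_UNHALT through the KVM feature CPUID leaf before enabling paravirtual spinlock support. A rough probe, assuming the leaf is 0x40000001 and the bit number is 7; a real guest would first verify the hypervisor signature at leaf 0x40000000:

#include <stdio.h>

/* Assumed constants for illustration; check the kernel headers. */
#define KVM_CPUID_FEATURES    0x40000001u
#define KVM_FEATURE_PV_UNHALT 7

static unsigned int cpuid_eax(unsigned int leaf)
{
        unsigned int a, b, c, d;

        __asm__ volatile("cpuid"
                         : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                         : "a"(leaf), "c"(0));
        (void)b; (void)c; (void)d;
        return a;
}

int main(void)
{
        unsigned int eax = cpuid_eax(KVM_CPUID_FEATURES);

        printf("PV_UNHALT %savailable\n",
               (eax & (1u << KVM_FEATURE_PV_UNHALT)) ? "" : "not ");
        return 0;
}
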
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc11245827c..5439117d5c4c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -79,16 +79,6 @@ static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
79 *((u32 *) (apic->regs + reg_off)) = val; 79 *((u32 *) (apic->regs + reg_off)) = val;
80} 80}
81 81
82static inline int apic_test_and_set_vector(int vec, void *bitmap)
83{
84 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
85}
86
87static inline int apic_test_and_clear_vector(int vec, void *bitmap)
88{
89 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
90}
91
92static inline int apic_test_vector(int vec, void *bitmap) 82static inline int apic_test_vector(int vec, void *bitmap)
93{ 83{
94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 84 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -331,10 +321,10 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
331} 321}
332EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 322EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
333 323
334static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 324static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
335{ 325{
336 apic->irr_pending = true; 326 apic->irr_pending = true;
337 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); 327 apic_set_vector(vec, apic->regs + APIC_IRR);
338} 328}
339 329
340static inline int apic_search_irr(struct kvm_lapic *apic) 330static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -681,32 +671,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
681 if (unlikely(!apic_enabled(apic))) 671 if (unlikely(!apic_enabled(apic)))
682 break; 672 break;
683 673
674 result = 1;
675
684 if (dest_map) 676 if (dest_map)
685 __set_bit(vcpu->vcpu_id, dest_map); 677 __set_bit(vcpu->vcpu_id, dest_map);
686 678
687 if (kvm_x86_ops->deliver_posted_interrupt) { 679 if (kvm_x86_ops->deliver_posted_interrupt)
688 result = 1;
689 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); 680 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
690 } else { 681 else {
691 result = !apic_test_and_set_irr(vector, apic); 682 apic_set_irr(vector, apic);
692
693 if (!result) {
694 if (trig_mode)
695 apic_debug("level trig mode repeatedly "
696 "for vector %d", vector);
697 goto out;
698 }
699 683
700 kvm_make_request(KVM_REQ_EVENT, vcpu); 684 kvm_make_request(KVM_REQ_EVENT, vcpu);
701 kvm_vcpu_kick(vcpu); 685 kvm_vcpu_kick(vcpu);
702 } 686 }
703out:
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 687 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
705 trig_mode, vector, !result); 688 trig_mode, vector, false);
706 break; 689 break;
707 690
708 case APIC_DM_REMRD: 691 case APIC_DM_REMRD:
709 apic_debug("Ignoring delivery mode 3\n"); 692 result = 1;
693 vcpu->arch.pv.pv_unhalted = 1;
694 kvm_make_request(KVM_REQ_EVENT, vcpu);
695 kvm_vcpu_kick(vcpu);
710 break; 696 break;
711 697
712 case APIC_DM_SMI: 698 case APIC_DM_SMI:
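
Treating a REMRD-mode IPI as a PV unhalt means latching a flag for the target vcpu and waking it, so a kick is not lost even if it arrives before the vcpu actually halts. A generic flag-plus-wakeup sketch of that idea in pthreads, not KVM's request machinery:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool pv_unhalted;        /* latched kick, consumed by the sleeper */

static void *halted_vcpu(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!pv_unhalted)                    /* "HLT": sleep until kicked */
                pthread_cond_wait(&wake, &lock);
        pv_unhalted = false;                    /* consume the kick */
        pthread_mutex_unlock(&lock);
        puts("vcpu resumed");
        return NULL;
}

static void kick_vcpu(void)
{
        pthread_mutex_lock(&lock);
        pv_unhalted = true;                     /* record the event... */
        pthread_cond_signal(&wake);             /* ...and wake the waiter */
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, halted_vcpu, NULL);
        kick_vcpu();
        pthread_join(t, NULL);
        return 0;
}
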
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e9285ae9b94..6e2d2c8f230b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 132 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
133 * PT32_LEVEL_BITS))) - 1)) 133 * PT32_LEVEL_BITS))) - 1))
134 134
135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 135#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
136 | PT64_NX_MASK) 136 | shadow_x_mask | shadow_nx_mask)
137 137
138#define ACC_EXEC_MASK 1 138#define ACC_EXEC_MASK 1
139#define ACC_WRITE_MASK PT_WRITABLE_MASK 139#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
331 return pte & PT_PAGE_SIZE_MASK; 331 return pte & PT_PAGE_SIZE_MASK;
332} 332}
333 333
334static int is_dirty_gpte(unsigned long pte)
335{
336 return pte & PT_DIRTY_MASK;
337}
338
339static int is_rmap_spte(u64 pte) 334static int is_rmap_spte(u64 pte)
340{ 335{
341 return is_shadow_present_pte(pte); 336 return is_shadow_present_pte(pte);
@@ -2052,12 +2047,18 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2052 return __shadow_walk_next(iterator, *iterator->sptep); 2047 return __shadow_walk_next(iterator, *iterator->sptep);
2053} 2048}
2054 2049
2055static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 2050static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
2056{ 2051{
2057 u64 spte; 2052 u64 spte;
2058 2053
2054 BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
2055 VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2056
2059 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2057 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2060 shadow_user_mask | shadow_x_mask | shadow_accessed_mask; 2058 shadow_user_mask | shadow_x_mask;
2059
2060 if (accessed)
2061 spte |= shadow_accessed_mask;
2061 2062
2062 mmu_spte_set(sptep, spte); 2063 mmu_spte_set(sptep, spte);
2063} 2064}
@@ -2574,14 +2575,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2574 mmu_free_roots(vcpu); 2575 mmu_free_roots(vcpu);
2575} 2576}
2576 2577
2577static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2578{
2579 int bit7;
2580
2581 bit7 = (gpte >> 7) & 1;
2582 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2583}
2584
2585static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2578static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2586 bool no_dirty_log) 2579 bool no_dirty_log)
2587{ 2580{
@@ -2594,26 +2587,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2594 return gfn_to_pfn_memslot_atomic(slot, gfn); 2587 return gfn_to_pfn_memslot_atomic(slot, gfn);
2595} 2588}
2596 2589
2597static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
2598 struct kvm_mmu_page *sp, u64 *spte,
2599 u64 gpte)
2600{
2601 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
2602 goto no_present;
2603
2604 if (!is_present_gpte(gpte))
2605 goto no_present;
2606
2607 if (!(gpte & PT_ACCESSED_MASK))
2608 goto no_present;
2609
2610 return false;
2611
2612no_present:
2613 drop_spte(vcpu->kvm, spte);
2614 return true;
2615}
2616
2617static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2590static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2618 struct kvm_mmu_page *sp, 2591 struct kvm_mmu_page *sp,
2619 u64 *start, u64 *end) 2592 u64 *start, u64 *end)
@@ -2710,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2710 iterator.level - 1, 2683 iterator.level - 1,
2711 1, ACC_ALL, iterator.sptep); 2684 1, ACC_ALL, iterator.sptep);
2712 2685
2713 link_shadow_page(iterator.sptep, sp); 2686 link_shadow_page(iterator.sptep, sp, true);
2714 } 2687 }
2715 } 2688 }
2716 return emulate; 2689 return emulate;
@@ -2808,7 +2781,7 @@ exit:
2808 return ret; 2781 return ret;
2809} 2782}
2810 2783
2811static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) 2784static bool page_fault_can_be_fast(u32 error_code)
2812{ 2785{
2813 /* 2786 /*
2814 * Do not fix the mmio spte with invalid generation number which 2787 * Do not fix the mmio spte with invalid generation number which
@@ -2861,7 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2861 bool ret = false; 2834 bool ret = false;
2862 u64 spte = 0ull; 2835 u64 spte = 0ull;
2863 2836
2864 if (!page_fault_can_be_fast(vcpu, error_code)) 2837 if (!page_fault_can_be_fast(error_code))
2865 return false; 2838 return false;
2866 2839
2867 walk_shadow_page_lockless_begin(vcpu); 2840 walk_shadow_page_lockless_begin(vcpu);
@@ -3209,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3209 mmu_sync_roots(vcpu); 3182 mmu_sync_roots(vcpu);
3210 spin_unlock(&vcpu->kvm->mmu_lock); 3183 spin_unlock(&vcpu->kvm->mmu_lock);
3211} 3184}
3185EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3212 3186
3213static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 3187static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3214 u32 access, struct x86_exception *exception) 3188 u32 access, struct x86_exception *exception)
@@ -3478,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3478 ++vcpu->stat.tlb_flush; 3452 ++vcpu->stat.tlb_flush;
3479 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3453 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3480} 3454}
3455EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
3481 3456
3482static void paging_new_cr3(struct kvm_vcpu *vcpu) 3457static void paging_new_cr3(struct kvm_vcpu *vcpu)
3483{ 3458{
@@ -3501,18 +3476,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
3501 nonpaging_free(vcpu); 3476 nonpaging_free(vcpu);
3502} 3477}
3503 3478
3504static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3505{
3506 unsigned mask;
3507
3508 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3509
3510 mask = (unsigned)~ACC_WRITE_MASK;
3511 /* Allow write access to dirty gptes */
3512 mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3513 *access &= mask;
3514}
3515
3516static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, 3479static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3517 unsigned access, int *nr_present) 3480 unsigned access, int *nr_present)
3518{ 3481{
@@ -3530,16 +3493,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3530 return false; 3493 return false;
3531} 3494}
3532 3495
3533static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3534{
3535 unsigned access;
3536
3537 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3538 access &= ~(gpte >> PT64_NX_SHIFT);
3539
3540 return access;
3541}
3542
3543static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) 3496static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3544{ 3497{
3545 unsigned index; 3498 unsigned index;
@@ -3549,6 +3502,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
3549 return mmu->last_pte_bitmap & (1 << index); 3502 return mmu->last_pte_bitmap & (1 << index);
3550} 3503}
3551 3504
3505#define PTTYPE_EPT 18 /* arbitrary */
3506#define PTTYPE PTTYPE_EPT
3507#include "paging_tmpl.h"
3508#undef PTTYPE
3509
3552#define PTTYPE 64 3510#define PTTYPE 64
3553#include "paging_tmpl.h" 3511#include "paging_tmpl.h"
3554#undef PTTYPE 3512#undef PTTYPE
@@ -3563,6 +3521,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3563 int maxphyaddr = cpuid_maxphyaddr(vcpu); 3521 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3564 u64 exb_bit_rsvd = 0; 3522 u64 exb_bit_rsvd = 0;
3565 3523
3524 context->bad_mt_xwr = 0;
3525
3566 if (!context->nx) 3526 if (!context->nx)
3567 exb_bit_rsvd = rsvd_bits(63, 63); 3527 exb_bit_rsvd = rsvd_bits(63, 63);
3568 switch (context->root_level) { 3528 switch (context->root_level) {
@@ -3618,7 +3578,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3618 } 3578 }
3619} 3579}
3620 3580
3621static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3581static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
3582 struct kvm_mmu *context, bool execonly)
3583{
3584 int maxphyaddr = cpuid_maxphyaddr(vcpu);
3585 int pte;
3586
3587 context->rsvd_bits_mask[0][3] =
3588 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
3589 context->rsvd_bits_mask[0][2] =
3590 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3591 context->rsvd_bits_mask[0][1] =
3592 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
3593 context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
3594
3595 /* large page */
3596 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
3597 context->rsvd_bits_mask[1][2] =
3598 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
3599 context->rsvd_bits_mask[1][1] =
3600 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
3601 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
3602
3603 for (pte = 0; pte < 64; pte++) {
3604 int rwx_bits = pte & 7;
3605 int mt = pte >> 3;
3606 if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
3607 rwx_bits == 0x2 || rwx_bits == 0x6 ||
3608 (rwx_bits == 0x4 && !execonly))
3609 context->bad_mt_xwr |= (1ull << pte);
3610 }
3611}
3612
3613static void update_permission_bitmask(struct kvm_vcpu *vcpu,
3614 struct kvm_mmu *mmu, bool ept)
3622{ 3615{
3623 unsigned bit, byte, pfec; 3616 unsigned bit, byte, pfec;
3624 u8 map; 3617 u8 map;
@@ -3636,12 +3629,16 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
3636 w = bit & ACC_WRITE_MASK; 3629 w = bit & ACC_WRITE_MASK;
3637 u = bit & ACC_USER_MASK; 3630 u = bit & ACC_USER_MASK;
3638 3631
3639 /* Not really needed: !nx will cause pte.nx to fault */ 3632 if (!ept) {
3640 x |= !mmu->nx; 3633 /* Not really needed: !nx will cause pte.nx to fault */
3641 /* Allow supervisor writes if !cr0.wp */ 3634 x |= !mmu->nx;
3642 w |= !is_write_protection(vcpu) && !uf; 3635 /* Allow supervisor writes if !cr0.wp */
3643 /* Disallow supervisor fetches of user code if cr4.smep */ 3636 w |= !is_write_protection(vcpu) && !uf;
3644 x &= !(smep && u && !uf); 3637 /* Disallow supervisor fetches of user code if cr4.smep */
3638 x &= !(smep && u && !uf);
3639 } else
3640 /* Not really needed: no U/S accesses on ept */
3641 u = 1;
3645 3642
3646 fault = (ff && !x) || (uf && !u) || (wf && !w); 3643 fault = (ff && !x) || (uf && !u) || (wf && !w);
3647 map |= fault << bit; 3644 map |= fault << bit;
@@ -3676,7 +3673,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3676 context->root_level = level; 3673 context->root_level = level;
3677 3674
3678 reset_rsvds_bits_mask(vcpu, context); 3675 reset_rsvds_bits_mask(vcpu, context);
3679 update_permission_bitmask(vcpu, context); 3676 update_permission_bitmask(vcpu, context, false);
3680 update_last_pte_bitmap(vcpu, context); 3677 update_last_pte_bitmap(vcpu, context);
3681 3678
3682 ASSERT(is_pae(vcpu)); 3679 ASSERT(is_pae(vcpu));
@@ -3706,7 +3703,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3706 context->root_level = PT32_ROOT_LEVEL; 3703 context->root_level = PT32_ROOT_LEVEL;
3707 3704
3708 reset_rsvds_bits_mask(vcpu, context); 3705 reset_rsvds_bits_mask(vcpu, context);
3709 update_permission_bitmask(vcpu, context); 3706 update_permission_bitmask(vcpu, context, false);
3710 update_last_pte_bitmap(vcpu, context); 3707 update_last_pte_bitmap(vcpu, context);
3711 3708
3712 context->new_cr3 = paging_new_cr3; 3709 context->new_cr3 = paging_new_cr3;
@@ -3768,7 +3765,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3768 context->gva_to_gpa = paging32_gva_to_gpa; 3765 context->gva_to_gpa = paging32_gva_to_gpa;
3769 } 3766 }
3770 3767
3771 update_permission_bitmask(vcpu, context); 3768 update_permission_bitmask(vcpu, context, false);
3772 update_last_pte_bitmap(vcpu, context); 3769 update_last_pte_bitmap(vcpu, context);
3773 3770
3774 return 0; 3771 return 0;
@@ -3800,6 +3797,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3800} 3797}
3801EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3798EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3802 3799
3800int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
3801 bool execonly)
3802{
3803 ASSERT(vcpu);
3804 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3805
3806 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3807
3808 context->nx = true;
3809 context->new_cr3 = paging_new_cr3;
3810 context->page_fault = ept_page_fault;
3811 context->gva_to_gpa = ept_gva_to_gpa;
3812 context->sync_page = ept_sync_page;
3813 context->invlpg = ept_invlpg;
3814 context->update_pte = ept_update_pte;
3815 context->free = paging_free;
3816 context->root_level = context->shadow_root_level;
3817 context->root_hpa = INVALID_PAGE;
3818 context->direct_map = false;
3819
3820 update_permission_bitmask(vcpu, context, true);
3821 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
3822
3823 return 0;
3824}
3825EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3826
3803static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3827static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3804{ 3828{
3805 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3829 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
@@ -3847,7 +3871,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3847 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3871 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3848 } 3872 }
3849 3873
3850 update_permission_bitmask(vcpu, g_context); 3874 update_permission_bitmask(vcpu, g_context, false);
3851 update_last_pte_bitmap(vcpu, g_context); 3875 update_last_pte_bitmap(vcpu, g_context);
3852 3876
3853 return 0; 3877 return 0;
@@ -3923,8 +3947,8 @@ static bool need_remote_flush(u64 old, u64 new)
3923 return true; 3947 return true;
3924 if ((old ^ new) & PT64_BASE_ADDR_MASK) 3948 if ((old ^ new) & PT64_BASE_ADDR_MASK)
3925 return true; 3949 return true;
3926 old ^= PT64_NX_MASK; 3950 old ^= shadow_nx_mask;
3927 new ^= PT64_NX_MASK; 3951 new ^= shadow_nx_mask;
3928 return (old & ~new & PT64_PERM_MASK) != 0; 3952 return (old & ~new & PT64_PERM_MASK) != 0;
3929} 3953}
3930 3954
@@ -4182,7 +4206,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
4182 switch (er) { 4206 switch (er) {
4183 case EMULATE_DONE: 4207 case EMULATE_DONE:
4184 return 1; 4208 return 1;
4185 case EMULATE_DO_MMIO: 4209 case EMULATE_USER_EXIT:
4186 ++vcpu->stat.mmio_exits; 4210 ++vcpu->stat.mmio_exits;
4187 /* fall through */ 4211 /* fall through */
4188 case EMULATE_FAIL: 4212 case EMULATE_FAIL:
@@ -4390,11 +4414,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
4390 /* 4414 /*
4391 * The very rare case: if the generation-number is round, 4415 * The very rare case: if the generation-number is round,
4392 * zap all shadow pages. 4416 * zap all shadow pages.
4393 *
4394 * The max value is MMIO_MAX_GEN - 1 since it is not called
4395 * when mark memslot invalid.
4396 */ 4417 */
4397 if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { 4418 if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
4398 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); 4419 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
4399 kvm_mmu_invalidate_zap_all_pages(kvm); 4420 kvm_mmu_invalidate_zap_all_pages(kvm);
4400 } 4421 }
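
reset_rsvds_bits_mask_ept precomputes bad_mt_xwr, one bit per possible (memory type, XWR) combination in the low six PTE bits, so the walker's reserved-bit check becomes a single bit test. A stand-alone program mirroring that construction:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Bits 0-2 of an EPT PTE are the XWR permissions, bits 3-5 the memory type. */
static uint64_t build_bad_mt_xwr(bool execonly)
{
        uint64_t map = 0;
        int pte;

        for (pte = 0; pte < 64; pte++) {
                int rwx = pte & 7;
                int mt  = pte >> 3;

                if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||      /* reserved types */
                    rwx == 0x2 || rwx == 0x6 ||                 /* W or WX without R */
                    (rwx == 0x4 && !execonly))                  /* X-only unsupported */
                        map |= 1ull << pte;
        }
        return map;
}

static bool pte_low_bits_bad(uint64_t map, uint64_t pte)
{
        return (map >> (pte & 0x3f)) & 1;
}

int main(void)
{
        uint64_t map = build_bad_mt_xwr(false);

        printf("WB read-write (mt=6, rwx=3): %d\n",
               pte_low_bits_bad(map, (6 << 3) | 3));    /* 0: legal */
        printf("write-only (rwx=2):          %d\n",
               pte_low_bits_bad(map, (6 << 3) | 2));    /* 1: bad */
        return 0;
}
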
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c573aba7..77e044a0f5f7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
71 71
72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
74int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
75 bool execonly);
74 76
75static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 77static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
76{ 78{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699d48a8..043330159179 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
23 * so the code in this file is compiled twice, once per pte size. 23 * so the code in this file is compiled twice, once per pte size.
24 */ 24 */
25 25
26/*
27 * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
28 * uses for EPT without A/D paging type.
29 */
30extern u64 __pure __using_nonexistent_pte_bit(void)
31 __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
32
26#if PTTYPE == 64 33#if PTTYPE == 64
27 #define pt_element_t u64 34 #define pt_element_t u64
28 #define guest_walker guest_walker64 35 #define guest_walker guest_walker64
@@ -32,6 +39,10 @@
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 39 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 40 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 41 #define PT_LEVEL_BITS PT64_LEVEL_BITS
42 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
43 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
44 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
45 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
35 #ifdef CONFIG_X86_64 46 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4 47 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg 48 #define CMPXCHG cmpxchg
@@ -49,7 +60,26 @@
49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 60 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
50 #define PT_LEVEL_BITS PT32_LEVEL_BITS 61 #define PT_LEVEL_BITS PT32_LEVEL_BITS
51 #define PT_MAX_FULL_LEVELS 2 62 #define PT_MAX_FULL_LEVELS 2
63 #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
64 #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
65 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
66 #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
52 #define CMPXCHG cmpxchg 67 #define CMPXCHG cmpxchg
68#elif PTTYPE == PTTYPE_EPT
69 #define pt_element_t u64
70 #define guest_walker guest_walkerEPT
71 #define FNAME(name) ept_##name
72 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
73 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
74 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
75 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
76 #define PT_LEVEL_BITS PT64_LEVEL_BITS
77 #define PT_GUEST_ACCESSED_MASK 0
78 #define PT_GUEST_DIRTY_MASK 0
79 #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
80 #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
81 #define CMPXCHG cmpxchg64
82 #define PT_MAX_FULL_LEVELS 4
53#else 83#else
54 #error Invalid PTTYPE value 84 #error Invalid PTTYPE value
55#endif 85#endif
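
The PT_GUEST_*_SHIFT definitions for EPT expand to a call marked with __compiletime_error, so the build breaks only if such a shift survives constant folding. A stand-alone illustration of the underlying GCC error attribute (compile with -O2, since the trick depends on dead-code elimination):

/* The build fails only if a call to this function remains after optimization. */
extern int __attribute__((error("shift used although A/D bits are absent")))
no_ad_bits_here(void);

#define TOY_DIRTY_MASK  0               /* pretend A/D tracking is unsupported */
#define TOY_DIRTY_SHIFT no_ad_bits_here()

static inline unsigned long fold_dirty(unsigned long pte)
{
        /* Constant-folded away because TOY_DIRTY_MASK is 0, so the call to
         * no_ad_bits_here() never reaches code generation. */
        if (!TOY_DIRTY_MASK)
                return 0;
        return pte >> TOY_DIRTY_SHIFT;
}

int main(void)
{
        return (int)fold_dirty(0x60);
}
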
@@ -80,6 +110,40 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
80 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 110 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
81} 111}
82 112
113static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
114{
115 unsigned mask;
116
117 /* dirty bit is not supported, so no need to track it */
118 if (!PT_GUEST_DIRTY_MASK)
119 return;
120
121 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
122
123 mask = (unsigned)~ACC_WRITE_MASK;
124 /* Allow write access to dirty gptes */
125 mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
126 PT_WRITABLE_MASK;
127 *access &= mask;
128}
129
130static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
131{
132 int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
133
134 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
135 ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
136}
137
138static inline int FNAME(is_present_gpte)(unsigned long pte)
139{
140#if PTTYPE != PTTYPE_EPT
141 return is_present_gpte(pte);
142#else
143 return pte & 7;
144#endif
145}
146
83static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 147static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
84 pt_element_t __user *ptep_user, unsigned index, 148 pt_element_t __user *ptep_user, unsigned index,
85 pt_element_t orig_pte, pt_element_t new_pte) 149 pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +167,42 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
103 return (ret != orig_pte); 167 return (ret != orig_pte);
104} 168}
105 169
170static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
171 struct kvm_mmu_page *sp, u64 *spte,
172 u64 gpte)
173{
174 if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
175 goto no_present;
176
177 if (!FNAME(is_present_gpte)(gpte))
178 goto no_present;
179
180 /* if accessed bit is not supported prefetch non accessed gpte */
181 if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
182 goto no_present;
183
184 return false;
185
186no_present:
187 drop_spte(vcpu->kvm, spte);
188 return true;
189}
190
191static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
192{
193 unsigned access;
194#if PTTYPE == PTTYPE_EPT
195 access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
196 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
197 ACC_USER_MASK;
198#else
199 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
200 access &= ~(gpte >> PT64_NX_SHIFT);
201#endif
202
203 return access;
204}
205
106static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, 206static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
107 struct kvm_mmu *mmu, 207 struct kvm_mmu *mmu,
108 struct guest_walker *walker, 208 struct guest_walker *walker,
@@ -114,18 +214,23 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
114 gfn_t table_gfn; 214 gfn_t table_gfn;
115 int ret; 215 int ret;
116 216
217 /* dirty/accessed bits are not supported, so no need to update them */
218 if (!PT_GUEST_DIRTY_MASK)
219 return 0;
220
117 for (level = walker->max_level; level >= walker->level; --level) { 221 for (level = walker->max_level; level >= walker->level; --level) {
118 pte = orig_pte = walker->ptes[level - 1]; 222 pte = orig_pte = walker->ptes[level - 1];
119 table_gfn = walker->table_gfn[level - 1]; 223 table_gfn = walker->table_gfn[level - 1];
120 ptep_user = walker->ptep_user[level - 1]; 224 ptep_user = walker->ptep_user[level - 1];
121 index = offset_in_page(ptep_user) / sizeof(pt_element_t); 225 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
122 if (!(pte & PT_ACCESSED_MASK)) { 226 if (!(pte & PT_GUEST_ACCESSED_MASK)) {
123 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); 227 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
124 pte |= PT_ACCESSED_MASK; 228 pte |= PT_GUEST_ACCESSED_MASK;
125 } 229 }
126 if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { 230 if (level == walker->level && write_fault &&
231 !(pte & PT_GUEST_DIRTY_MASK)) {
127 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
128 pte |= PT_DIRTY_MASK; 233 pte |= PT_GUEST_DIRTY_MASK;
129 } 234 }
130 if (pte == orig_pte) 235 if (pte == orig_pte)
131 continue; 236 continue;
@@ -170,7 +275,7 @@ retry_walk:
170 if (walker->level == PT32E_ROOT_LEVEL) { 275 if (walker->level == PT32E_ROOT_LEVEL) {
171 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); 276 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
172 trace_kvm_mmu_paging_element(pte, walker->level); 277 trace_kvm_mmu_paging_element(pte, walker->level);
173 if (!is_present_gpte(pte)) 278 if (!FNAME(is_present_gpte)(pte))
174 goto error; 279 goto error;
175 --walker->level; 280 --walker->level;
176 } 281 }
@@ -179,7 +284,7 @@ retry_walk:
179 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 284 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
180 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); 285 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
181 286
182 accessed_dirty = PT_ACCESSED_MASK; 287 accessed_dirty = PT_GUEST_ACCESSED_MASK;
183 pt_access = pte_access = ACC_ALL; 288 pt_access = pte_access = ACC_ALL;
184 ++walker->level; 289 ++walker->level;
185 290
@@ -215,17 +320,17 @@ retry_walk:
215 320
216 trace_kvm_mmu_paging_element(pte, walker->level); 321 trace_kvm_mmu_paging_element(pte, walker->level);
217 322
218 if (unlikely(!is_present_gpte(pte))) 323 if (unlikely(!FNAME(is_present_gpte)(pte)))
219 goto error; 324 goto error;
220 325
221 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, 326 if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
222 walker->level))) { 327 walker->level))) {
223 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; 328 errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
224 goto error; 329 goto error;
225 } 330 }
226 331
227 accessed_dirty &= pte; 332 accessed_dirty &= pte;
228 pte_access = pt_access & gpte_access(vcpu, pte); 333 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
229 334
230 walker->ptes[walker->level - 1] = pte; 335 walker->ptes[walker->level - 1] = pte;
231 } while (!is_last_gpte(mmu, walker->level, pte)); 336 } while (!is_last_gpte(mmu, walker->level, pte));
@@ -248,13 +353,15 @@ retry_walk:
248 walker->gfn = real_gpa >> PAGE_SHIFT; 353 walker->gfn = real_gpa >> PAGE_SHIFT;
249 354
250 if (!write_fault) 355 if (!write_fault)
251 protect_clean_gpte(&pte_access, pte); 356 FNAME(protect_clean_gpte)(&pte_access, pte);
252 else 357 else
253 /* 358 /*
254 * On a write fault, fold the dirty bit into accessed_dirty by 359 * On a write fault, fold the dirty bit into accessed_dirty.
 255 * shifting it one place right. 360 * For modes without A/D bit support, accessed_dirty will
 361 * always be clear.
256 */ 362 */
257 accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); 363 accessed_dirty &= pte >>
364 (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
258 365
259 if (unlikely(!accessed_dirty)) { 366 if (unlikely(!accessed_dirty)) {
260 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); 367 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
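The fold above is easier to see with concrete bit positions. A minimal, self-contained sketch (not kernel code; it assumes the classic x86 layout with the accessed bit at position 5 and the dirty bit at position 6, matching the non-EPT PT_GUEST_*_SHIFT values):

#include <assert.h>
#include <stdint.h>

#define ACCESSED_SHIFT 5	/* assumed: PT_GUEST_ACCESSED_SHIFT for legacy PTEs */
#define DIRTY_SHIFT    6	/* assumed: PT_GUEST_DIRTY_SHIFT for legacy PTEs */

int main(void)
{
	uint64_t accessed_dirty = 1ull << ACCESSED_SHIFT;	/* walker starts with the A mask */
	uint64_t dirty_pte = (1ull << DIRTY_SHIFT) | (1ull << ACCESSED_SHIFT);
	uint64_t clean_pte = 1ull << ACCESSED_SHIFT;

	/* Shifting right by DIRTY_SHIFT - ACCESSED_SHIFT moves D into the A position,
	 * so the AND keeps accessed_dirty non-zero only when the leaf was already dirty. */
	assert(accessed_dirty & (dirty_pte >> (DIRTY_SHIFT - ACCESSED_SHIFT)));
	assert(!(accessed_dirty & (clean_pte >> (DIRTY_SHIFT - ACCESSED_SHIFT))));
	return 0;
}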
@@ -279,6 +386,25 @@ error:
279 walker->fault.vector = PF_VECTOR; 386 walker->fault.vector = PF_VECTOR;
280 walker->fault.error_code_valid = true; 387 walker->fault.error_code_valid = true;
281 walker->fault.error_code = errcode; 388 walker->fault.error_code = errcode;
389
390#if PTTYPE == PTTYPE_EPT
391 /*
 392 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
 393 * misconfiguration needs to be injected. The detection is
 394 * done by is_rsvd_bits_set() above.
395 *
396 * We set up the value of exit_qualification to inject:
397 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation
398 * [5:3] - Calculated by the page walk of the guest EPT page tables
 399 * [8:7] - Derived from [8:7] of real exit_qualification
400 *
401 * The other bits are set to 0.
402 */
403 if (!(errcode & PFERR_RSVD_MASK)) {
404 vcpu->arch.exit_qualification &= 0x187;
405 vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
406 }
407#endif
282 walker->fault.address = addr; 408 walker->fault.address = addr;
283 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; 409 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
284 410
@@ -293,6 +419,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
293 access); 419 access);
294} 420}
295 421
422#if PTTYPE != PTTYPE_EPT
296static int FNAME(walk_addr_nested)(struct guest_walker *walker, 423static int FNAME(walk_addr_nested)(struct guest_walker *walker,
297 struct kvm_vcpu *vcpu, gva_t addr, 424 struct kvm_vcpu *vcpu, gva_t addr,
298 u32 access) 425 u32 access)
@@ -300,6 +427,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
300 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, 427 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
301 addr, access); 428 addr, access);
302} 429}
430#endif
303 431
304static bool 432static bool
305FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 433FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -309,14 +437,14 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
309 gfn_t gfn; 437 gfn_t gfn;
310 pfn_t pfn; 438 pfn_t pfn;
311 439
312 if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) 440 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
313 return false; 441 return false;
314 442
315 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 443 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
316 444
317 gfn = gpte_to_gfn(gpte); 445 gfn = gpte_to_gfn(gpte);
318 pte_access = sp->role.access & gpte_access(vcpu, gpte); 446 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
319 protect_clean_gpte(&pte_access, gpte); 447 FNAME(protect_clean_gpte)(&pte_access, gpte);
320 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 448 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
321 no_dirty_log && (pte_access & ACC_WRITE_MASK)); 449 no_dirty_log && (pte_access & ACC_WRITE_MASK));
322 if (is_error_pfn(pfn)) 450 if (is_error_pfn(pfn))
@@ -446,7 +574,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
446 goto out_gpte_changed; 574 goto out_gpte_changed;
447 575
448 if (sp) 576 if (sp)
449 link_shadow_page(it.sptep, sp); 577 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
450 } 578 }
451 579
452 for (; 580 for (;
@@ -466,7 +594,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
466 594
467 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, 595 sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
468 true, direct_access, it.sptep); 596 true, direct_access, it.sptep);
469 link_shadow_page(it.sptep, sp); 597 link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
470 } 598 }
471 599
472 clear_sp_write_flooding_count(it.sptep); 600 clear_sp_write_flooding_count(it.sptep);
@@ -727,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
727 return gpa; 855 return gpa;
728} 856}
729 857
858#if PTTYPE != PTTYPE_EPT
730static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, 859static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
731 u32 access, 860 u32 access,
732 struct x86_exception *exception) 861 struct x86_exception *exception)
@@ -745,6 +874,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
745 874
746 return gpa; 875 return gpa;
747} 876}
877#endif
748 878
749/* 879/*
750 * Using the cached information from sp->gfns is safe because: 880 * Using the cached information from sp->gfns is safe because:
@@ -785,15 +915,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
785 sizeof(pt_element_t))) 915 sizeof(pt_element_t)))
786 return -EINVAL; 916 return -EINVAL;
787 917
788 if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { 918 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
789 vcpu->kvm->tlbs_dirty++; 919 vcpu->kvm->tlbs_dirty++;
790 continue; 920 continue;
791 } 921 }
792 922
793 gfn = gpte_to_gfn(gpte); 923 gfn = gpte_to_gfn(gpte);
794 pte_access = sp->role.access; 924 pte_access = sp->role.access;
795 pte_access &= gpte_access(vcpu, gpte); 925 pte_access &= FNAME(gpte_access)(vcpu, gpte);
796 protect_clean_gpte(&pte_access, gpte); 926 FNAME(protect_clean_gpte)(&pte_access, gpte);
797 927
798 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, 928 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
799 &nr_present)) 929 &nr_present))
@@ -830,3 +960,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
830#undef gpte_to_gfn 960#undef gpte_to_gfn
831#undef gpte_to_gfn_lvl 961#undef gpte_to_gfn_lvl
832#undef CMPXCHG 962#undef CMPXCHG
963#undef PT_GUEST_ACCESSED_MASK
964#undef PT_GUEST_DIRTY_MASK
965#undef PT_GUEST_DIRTY_SHIFT
966#undef PT_GUEST_ACCESSED_SHIFT
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index c53e797e7369..5c4f63151b4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -160,7 +160,7 @@ static void stop_counter(struct kvm_pmc *pmc)
160 160
 161static void reprogram_counter(struct kvm_pmc *pmc, u32 type, 161
 162 unsigned config, bool exclude_user, bool exclude_kernel, 162
163 bool intr) 163 bool intr, bool in_tx, bool in_tx_cp)
164{ 164{
165 struct perf_event *event; 165 struct perf_event *event;
166 struct perf_event_attr attr = { 166 struct perf_event_attr attr = {
@@ -173,6 +173,10 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
173 .exclude_kernel = exclude_kernel, 173 .exclude_kernel = exclude_kernel,
174 .config = config, 174 .config = config,
175 }; 175 };
176 if (in_tx)
177 attr.config |= HSW_IN_TX;
178 if (in_tx_cp)
179 attr.config |= HSW_IN_TX_CHECKPOINTED;
176 180
177 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); 181 attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
178 182
@@ -226,7 +230,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
226 230
227 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | 231 if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
228 ARCH_PERFMON_EVENTSEL_INV | 232 ARCH_PERFMON_EVENTSEL_INV |
229 ARCH_PERFMON_EVENTSEL_CMASK))) { 233 ARCH_PERFMON_EVENTSEL_CMASK |
234 HSW_IN_TX |
235 HSW_IN_TX_CHECKPOINTED))) {
230 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, 236 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
231 unit_mask); 237 unit_mask);
232 if (config != PERF_COUNT_HW_MAX) 238 if (config != PERF_COUNT_HW_MAX)
@@ -239,7 +245,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
239 reprogram_counter(pmc, type, config, 245 reprogram_counter(pmc, type, config,
240 !(eventsel & ARCH_PERFMON_EVENTSEL_USR), 246 !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
241 !(eventsel & ARCH_PERFMON_EVENTSEL_OS), 247 !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
242 eventsel & ARCH_PERFMON_EVENTSEL_INT); 248 eventsel & ARCH_PERFMON_EVENTSEL_INT,
249 (eventsel & HSW_IN_TX),
250 (eventsel & HSW_IN_TX_CHECKPOINTED));
243} 251}
244 252
245static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) 253static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
@@ -256,7 +264,7 @@ static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
256 arch_events[fixed_pmc_events[idx]].event_type, 264 arch_events[fixed_pmc_events[idx]].event_type,
257 !(en & 0x2), /* exclude user */ 265 !(en & 0x2), /* exclude user */
258 !(en & 0x1), /* exclude kernel */ 266 !(en & 0x1), /* exclude kernel */
259 pmi); 267 pmi, false, false);
260} 268}
261 269
262static inline u8 fixed_en_pmi(u64 ctrl, int idx) 270static inline u8 fixed_en_pmi(u64 ctrl, int idx)
@@ -408,7 +416,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
408 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { 416 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
409 if (data == pmc->eventsel) 417 if (data == pmc->eventsel)
410 return 0; 418 return 0;
411 if (!(data & 0xffffffff00200000ull)) { 419 if (!(data & pmu->reserved_bits)) {
412 reprogram_gp_counter(pmc, data); 420 reprogram_gp_counter(pmc, data);
413 return 0; 421 return 0;
414 } 422 }
@@ -450,6 +458,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
450 pmu->counter_bitmask[KVM_PMC_GP] = 0; 458 pmu->counter_bitmask[KVM_PMC_GP] = 0;
451 pmu->counter_bitmask[KVM_PMC_FIXED] = 0; 459 pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
452 pmu->version = 0; 460 pmu->version = 0;
461 pmu->reserved_bits = 0xffffffff00200000ull;
453 462
454 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); 463 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
455 if (!entry) 464 if (!entry)
@@ -478,6 +487,12 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
478 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | 487 pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
479 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); 488 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
480 pmu->global_ctrl_mask = ~pmu->global_ctrl; 489 pmu->global_ctrl_mask = ~pmu->global_ctrl;
490
491 entry = kvm_find_cpuid_entry(vcpu, 7, 0);
492 if (entry &&
493 (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
494 (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
495 pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
481} 496}
482 497
483void kvm_pmu_init(struct kvm_vcpu *vcpu) 498void kvm_pmu_init(struct kvm_vcpu *vcpu)
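The `^=` in kvm_pmu_cpuid_update() above acts as a clear operation: HSW_IN_TX (bit 32) and HSW_IN_TX_CHECKPOINTED (bit 33) both start out set in the 0xffffffff00200000ull mask, so XOR-ing them out lets a TSX-capable guest program them in an event selector. A standalone illustration (bit positions assumed to match perf_event.h):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t hsw_in_tx    = 1ull << 32;	/* assumed: HSW_IN_TX */
	const uint64_t hsw_in_tx_cp = 1ull << 33;	/* assumed: HSW_IN_TX_CHECKPOINTED */
	uint64_t reserved_bits = 0xffffffff00200000ull;

	reserved_bits ^= hsw_in_tx | hsw_in_tx_cp;	/* both bits were set, so this clears them */

	/* An eventsel carrying HSW_IN_TX no longer trips the reserved-bits check. */
	assert(!(0x100000051ull & reserved_bits));
	return 0;
}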
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 064d0be67ecc..1f1da43ff2a2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
373 * we must keep them pinned while L2 runs. 373 * we must keep them pinned while L2 runs.
374 */ 374 */
375 struct page *apic_access_page; 375 struct page *apic_access_page;
376 u64 msr_ia32_feature_control;
376}; 377};
377 378
378#define POSTED_INTR_ON 0 379#define POSTED_INTR_ON 0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
711 kvm_release_page_clean(page); 712 kvm_release_page_clean(page);
712} 713}
713 714
715static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
714static u64 construct_eptp(unsigned long root_hpa); 716static u64 construct_eptp(unsigned long root_hpa);
715static void kvm_cpu_vmxon(u64 addr); 717static void kvm_cpu_vmxon(u64 addr);
716static void kvm_cpu_vmxoff(void); 718static void kvm_cpu_vmxoff(void);
717static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
718static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 719static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
719static void vmx_set_segment(struct kvm_vcpu *vcpu, 720static void vmx_set_segment(struct kvm_vcpu *vcpu,
720 struct kvm_segment *var, int seg); 721 struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1039 (vmcs12->secondary_vm_exec_control & bit); 1040 (vmcs12->secondary_vm_exec_control & bit);
1040} 1041}
1041 1042
1042static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, 1043static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1043 struct kvm_vcpu *vcpu)
1044{ 1044{
1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; 1045 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1046} 1046}
1047 1047
1048static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1049{
1050 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1051}
1052
1048static inline bool is_exception(u32 intr_info) 1053static inline bool is_exception(u32 intr_info)
1049{ 1054{
1050 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1055 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2155static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2160static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2156static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2161static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2157static u32 nested_vmx_misc_low, nested_vmx_misc_high; 2162static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2163static u32 nested_vmx_ept_caps;
2158static __init void nested_vmx_setup_ctls_msrs(void) 2164static __init void nested_vmx_setup_ctls_msrs(void)
2159{ 2165{
2160 /* 2166 /*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2190 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and 2196 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
2191 * 17 must be 1. 2197 * 17 must be 1.
2192 */ 2198 */
2199 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2200 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
2193 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2201 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2194 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2202 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2203 nested_vmx_exit_ctls_high &=
2195#ifdef CONFIG_X86_64 2204#ifdef CONFIG_X86_64
2196 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; 2205 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2197#else
2198 nested_vmx_exit_ctls_high = 0;
2199#endif 2206#endif
2200 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2207 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2208 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2209 VM_EXIT_LOAD_IA32_EFER);
2201 2210
2202 /* entry controls */ 2211 /* entry controls */
2203 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2212 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2205 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ 2214 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
2206 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2215 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2207 nested_vmx_entry_ctls_high &= 2216 nested_vmx_entry_ctls_high &=
2208 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; 2217#ifdef CONFIG_X86_64
2209 nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2218 VM_ENTRY_IA32E_MODE |
2219#endif
2220 VM_ENTRY_LOAD_IA32_PAT;
2221 nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
2222 VM_ENTRY_LOAD_IA32_EFER);
2210 2223
2211 /* cpu-based controls */ 2224 /* cpu-based controls */
2212 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2225 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2254 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2242 SECONDARY_EXEC_WBINVD_EXITING; 2255 SECONDARY_EXEC_WBINVD_EXITING;
2243 2256
2257 if (enable_ept) {
2258 /* nested EPT: emulate EPT also to L1 */
2259 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
2260 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2261 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2262 nested_vmx_ept_caps &= vmx_capability.ept;
2263 /*
2264 * Since invept is completely emulated we support both global
 2265 * and context invalidation independent of what the host cpu
 2266 * supports.
2267 */
2268 nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2269 VMX_EPT_EXTENT_CONTEXT_BIT;
2270 } else
2271 nested_vmx_ept_caps = 0;
2272
2244 /* miscellaneous data */ 2273 /* miscellaneous data */
2245 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); 2274 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2246 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | 2275 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2282 2311
2283 switch (msr_index) { 2312 switch (msr_index) {
2284 case MSR_IA32_FEATURE_CONTROL: 2313 case MSR_IA32_FEATURE_CONTROL:
2285 *pdata = 0; 2314 if (nested_vmx_allowed(vcpu)) {
2286 break; 2315 *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2316 break;
2317 }
2318 return 0;
2287 case MSR_IA32_VMX_BASIC: 2319 case MSR_IA32_VMX_BASIC:
2288 /* 2320 /*
2289 * This MSR reports some information about VMX support. We 2321 * This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2346 nested_vmx_secondary_ctls_high); 2378 nested_vmx_secondary_ctls_high);
2347 break; 2379 break;
2348 case MSR_IA32_VMX_EPT_VPID_CAP: 2380 case MSR_IA32_VMX_EPT_VPID_CAP:
2349 /* Currently, no nested ept or nested vpid */ 2381 /* Currently, no nested vpid support */
2350 *pdata = 0; 2382 *pdata = nested_vmx_ept_caps;
2351 break; 2383 break;
2352 default: 2384 default:
2353 return 0; 2385 return 0;
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2356 return 1; 2388 return 1;
2357} 2389}
2358 2390
2359static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2391static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2360{ 2392{
2393 u32 msr_index = msr_info->index;
2394 u64 data = msr_info->data;
2395 bool host_initialized = msr_info->host_initiated;
2396
2361 if (!nested_vmx_allowed(vcpu)) 2397 if (!nested_vmx_allowed(vcpu))
2362 return 0; 2398 return 0;
2363 2399
2364 if (msr_index == MSR_IA32_FEATURE_CONTROL) 2400 if (msr_index == MSR_IA32_FEATURE_CONTROL) {
2365 /* TODO: the right thing. */ 2401 if (!host_initialized &&
2402 to_vmx(vcpu)->nested.msr_ia32_feature_control
2403 & FEATURE_CONTROL_LOCKED)
2404 return 0;
2405 to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
2366 return 1; 2406 return 1;
2407 }
2408
2367 /* 2409 /*
2368 * No need to treat VMX capability MSRs specially: If we don't handle 2410 * No need to treat VMX capability MSRs specially: If we don't handle
2369 * them, handle_wrmsr will #GP(0), which is correct (they are readonly) 2411 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2494 return 1; 2536 return 1;
2495 /* Otherwise falls through */ 2537 /* Otherwise falls through */
2496 default: 2538 default:
2497 if (vmx_set_vmx_msr(vcpu, msr_index, data)) 2539 if (vmx_set_vmx_msr(vcpu, msr_info))
2498 break; 2540 break;
2499 msr = find_msr_entry(vmx, msr_index); 2541 msr = find_msr_entry(vmx, msr_index);
2500 if (msr) { 2542 if (msr) {
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
5302 5344
5303 /* It is a write fault? */ 5345 /* It is a write fault? */
5304 error_code = exit_qualification & (1U << 1); 5346 error_code = exit_qualification & (1U << 1);
5347 /* It is a fetch fault? */
5348 error_code |= (exit_qualification & (1U << 2)) << 2;
5305 /* ept page table is present? */ 5349 /* ept page table is present? */
5306 error_code |= (exit_qualification >> 3) & 0x1; 5350 error_code |= (exit_qualification >> 3) & 0x1;
5307 5351
5352 vcpu->arch.exit_qualification = exit_qualification;
5353
5308 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5354 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5309} 5355}
5310 5356
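For reference, the bit shuffling in handle_ept_violation() maps EPT exit-qualification bits onto page-fault error-code bits: bit 1 (write) lands on error-code bit 1, bit 2 (fetch) is shifted up to bit 4, and bit 3 (entry readable) becomes the present bit. A small check with made-up values:

#include <assert.h>

int main(void)
{
	unsigned long exit_qualification = 0x184;	/* fetch access, bits 7-8 set, entry not present */
	unsigned long error_code;

	error_code  = exit_qualification & (1U << 1);		/* write   -> bit 1 */
	error_code |= (exit_qualification & (1U << 2)) << 2;	/* fetch   -> bit 4 */
	error_code |= (exit_qualification >> 3) & 0x1;		/* present -> bit 0 */

	assert(error_code == (1U << 4));	/* an instruction fetch from a non-present entry */
	return 0;
}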
@@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5438 5484
5439 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); 5485 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5440 5486
5441 if (err == EMULATE_DO_MMIO) { 5487 if (err == EMULATE_USER_EXIT) {
5488 ++vcpu->stat.mmio_exits;
5442 ret = 0; 5489 ret = 0;
5443 goto out; 5490 goto out;
5444 } 5491 }
@@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5567 free_loaded_vmcs(&vmx->vmcs01); 5614 free_loaded_vmcs(&vmx->vmcs01);
5568} 5615}
5569 5616
5617/*
5618 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5619 * set the success or error code of an emulated VMX instruction, as specified
5620 * by Vol 2B, VMX Instruction Reference, "Conventions".
5621 */
5622static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5623{
5624 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5625 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5626 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5627}
5628
5629static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5630{
5631 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5632 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5633 X86_EFLAGS_SF | X86_EFLAGS_OF))
5634 | X86_EFLAGS_CF);
5635}
5636
5570static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 5637static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5571 u32 vm_instruction_error); 5638 u32 vm_instruction_error)
5639{
5640 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5641 /*
5642 * failValid writes the error number to the current VMCS, which
 5643 * can't be done if there isn't a current VMCS.
5644 */
5645 nested_vmx_failInvalid(vcpu);
5646 return;
5647 }
5648 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5649 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5650 X86_EFLAGS_SF | X86_EFLAGS_OF))
5651 | X86_EFLAGS_ZF);
5652 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5653 /*
5654 * We don't need to force a shadow sync because
5655 * VM_INSTRUCTION_ERROR is not shadowed
5656 */
5657}
5572 5658
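The helpers moved above implement the architectural convention an L1 guest observes in RFLAGS: CF=1 means VMfailInvalid, ZF=1 means VMfailValid (error number in the current VMCS), both clear means VMsucceed. A hypothetical guest-side check, shown only to illustrate that convention (the wrapper name is an assumption, and it is only meaningful at CPL0 with CR4.VMXE set):

#include <stdint.h>

/* Returns 0 on VMsucceed, -1 on VMfailInvalid, -2 on VMfailValid. */
static inline int vmxon_checked(uint64_t vmxon_region_pa)
{
	uint8_t cf, zf;

	asm volatile("vmxon %[pa]\n\t"
		     "setc %[cf]\n\t"
		     "setz %[zf]"
		     : [cf] "=q" (cf), [zf] "=q" (zf)
		     : [pa] "m" (vmxon_region_pa)
		     : "cc", "memory");
	if (cf)
		return -1;	/* VMfailInvalid */
	if (zf)
		return -2;	/* VMfailValid: read VM_INSTRUCTION_ERROR from the VMCS */
	return 0;		/* VMsucceed */
}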
5573/* 5659/*
5574 * Emulate the VMXON instruction. 5660 * Emulate the VMXON instruction.
@@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5583 struct kvm_segment cs; 5669 struct kvm_segment cs;
5584 struct vcpu_vmx *vmx = to_vmx(vcpu); 5670 struct vcpu_vmx *vmx = to_vmx(vcpu);
5585 struct vmcs *shadow_vmcs; 5671 struct vmcs *shadow_vmcs;
5672 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
5673 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
5586 5674
5587 /* The Intel VMX Instruction Reference lists a bunch of bits that 5675 /* The Intel VMX Instruction Reference lists a bunch of bits that
5588 * are prerequisite to running VMXON, most notably cr4.VMXE must be 5676 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5611 skip_emulated_instruction(vcpu); 5699 skip_emulated_instruction(vcpu);
5612 return 1; 5700 return 1;
5613 } 5701 }
5702
5703 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
5704 != VMXON_NEEDED_FEATURES) {
5705 kvm_inject_gp(vcpu, 0);
5706 return 1;
5707 }
5708
5614 if (enable_shadow_vmcs) { 5709 if (enable_shadow_vmcs) {
5615 shadow_vmcs = alloc_vmcs(); 5710 shadow_vmcs = alloc_vmcs();
5616 if (!shadow_vmcs) 5711 if (!shadow_vmcs)
@@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5628 vmx->nested.vmxon = true; 5723 vmx->nested.vmxon = true;
5629 5724
5630 skip_emulated_instruction(vcpu); 5725 skip_emulated_instruction(vcpu);
5726 nested_vmx_succeed(vcpu);
5631 return 1; 5727 return 1;
5632} 5728}
5633 5729
@@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
5712 return 1; 5808 return 1;
5713 free_nested(to_vmx(vcpu)); 5809 free_nested(to_vmx(vcpu));
5714 skip_emulated_instruction(vcpu); 5810 skip_emulated_instruction(vcpu);
5811 nested_vmx_succeed(vcpu);
5715 return 1; 5812 return 1;
5716} 5813}
5717 5814
@@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
5768 return 0; 5865 return 0;
5769} 5866}
5770 5867
5771/*
5772 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
5773 * set the success or error code of an emulated VMX instruction, as specified
5774 * by Vol 2B, VMX Instruction Reference, "Conventions".
5775 */
5776static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
5777{
5778 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
5779 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5780 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
5781}
5782
5783static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
5784{
5785 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5786 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
5787 X86_EFLAGS_SF | X86_EFLAGS_OF))
5788 | X86_EFLAGS_CF);
5789}
5790
5791static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5792 u32 vm_instruction_error)
5793{
5794 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
5795 /*
5796 * failValid writes the error number to the current VMCS, which
5797 * can't be done there isn't a current VMCS.
5798 */
5799 nested_vmx_failInvalid(vcpu);
5800 return;
5801 }
5802 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
5803 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
5804 X86_EFLAGS_SF | X86_EFLAGS_OF))
5805 | X86_EFLAGS_ZF);
5806 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5807 /*
5808 * We don't need to force a shadow sync because
5809 * VM_INSTRUCTION_ERROR is not shadowed
5810 */
5811}
5812
5813/* Emulate the VMCLEAR instruction */ 5868/* Emulate the VMCLEAR instruction */
5814static int handle_vmclear(struct kvm_vcpu *vcpu) 5869static int handle_vmclear(struct kvm_vcpu *vcpu)
5815{ 5870{
@@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
5972 unsigned long field; 6027 unsigned long field;
5973 u64 field_value; 6028 u64 field_value;
5974 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6029 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5975 unsigned long *fields = (unsigned long *)shadow_read_write_fields; 6030 const unsigned long *fields = shadow_read_write_fields;
5976 int num_fields = max_shadow_read_write_fields; 6031 const int num_fields = max_shadow_read_write_fields;
5977 6032
5978 vmcs_load(shadow_vmcs); 6033 vmcs_load(shadow_vmcs);
5979 6034
@@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
6002 6057
6003static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 6058static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6004{ 6059{
6005 unsigned long *fields[] = { 6060 const unsigned long *fields[] = {
6006 (unsigned long *)shadow_read_write_fields, 6061 shadow_read_write_fields,
6007 (unsigned long *)shadow_read_only_fields 6062 shadow_read_only_fields
6008 }; 6063 };
6009 int num_lists = ARRAY_SIZE(fields); 6064 const int max_fields[] = {
6010 int max_fields[] = {
6011 max_shadow_read_write_fields, 6065 max_shadow_read_write_fields,
6012 max_shadow_read_only_fields 6066 max_shadow_read_only_fields
6013 }; 6067 };
@@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6018 6072
6019 vmcs_load(shadow_vmcs); 6073 vmcs_load(shadow_vmcs);
6020 6074
6021 for (q = 0; q < num_lists; q++) { 6075 for (q = 0; q < ARRAY_SIZE(fields); q++) {
6022 for (i = 0; i < max_fields[q]; i++) { 6076 for (i = 0; i < max_fields[q]; i++) {
6023 field = fields[q][i]; 6077 field = fields[q][i];
6024 vmcs12_read_any(&vmx->vcpu, field, &field_value); 6078 vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
6248 return 1; 6302 return 1;
6249} 6303}
6250 6304
6305/* Emulate the INVEPT instruction */
6306static int handle_invept(struct kvm_vcpu *vcpu)
6307{
6308 u32 vmx_instruction_info, types;
6309 unsigned long type;
6310 gva_t gva;
6311 struct x86_exception e;
6312 struct {
6313 u64 eptp, gpa;
6314 } operand;
6315 u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
6316
6317 if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
6318 !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
6319 kvm_queue_exception(vcpu, UD_VECTOR);
6320 return 1;
6321 }
6322
6323 if (!nested_vmx_check_permission(vcpu))
6324 return 1;
6325
6326 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
6327 kvm_queue_exception(vcpu, UD_VECTOR);
6328 return 1;
6329 }
6330
6331 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6332 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
6333
6334 types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
6335
6336 if (!(types & (1UL << type))) {
6337 nested_vmx_failValid(vcpu,
6338 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
6339 return 1;
6340 }
6341
6342 /* According to the Intel VMX instruction reference, the memory
6343 * operand is read even if it isn't needed (e.g., for type==global)
6344 */
6345 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6346 vmx_instruction_info, &gva))
6347 return 1;
6348 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
6349 sizeof(operand), &e)) {
6350 kvm_inject_page_fault(vcpu, &e);
6351 return 1;
6352 }
6353
6354 switch (type) {
6355 case VMX_EPT_EXTENT_CONTEXT:
6356 if ((operand.eptp & eptp_mask) !=
6357 (nested_ept_get_cr3(vcpu) & eptp_mask))
6358 break;
6359 case VMX_EPT_EXTENT_GLOBAL:
6360 kvm_mmu_sync_roots(vcpu);
6361 kvm_mmu_flush_tlb(vcpu);
6362 nested_vmx_succeed(vcpu);
6363 break;
6364 default:
6365 BUG_ON(1);
6366 break;
6367 }
6368
6369 skip_emulated_instruction(vcpu);
6370 return 1;
6371}
6372
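handle_invept() above reads a 128-bit descriptor (the eptp followed by a reserved quadword) from guest memory. A hypothetical L1-side wrapper matching that layout looks roughly like this (illustrative only; it must run at CPL0 inside the guest, and the names are assumptions):

#include <stdint.h>

#define INVEPT_CONTEXT 1	/* single-context invalidation */
#define INVEPT_GLOBAL  2	/* global invalidation */

static inline void invept_sketch(unsigned long type, uint64_t eptp)
{
	struct {
		uint64_t eptp;
		uint64_t reserved;	/* must be zero */
	} operand = { eptp, 0 };

	/* AT&T operand order: memory descriptor first, invalidation type second. */
	asm volatile("invept %0, %1" : : "m" (operand), "r" (type) : "cc", "memory");
}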
6251/* 6373/*
6252 * The exit handlers return 1 if the exit was handled fully and guest execution 6374 * The exit handlers return 1 if the exit was handled fully and guest execution
6253 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6375 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6292 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6414 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6293 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, 6415 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
6294 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, 6416 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
6417 [EXIT_REASON_INVEPT] = handle_invept,
6295}; 6418};
6296 6419
6297static const int kvm_vmx_max_exit_handlers = 6420static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6518 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 6641 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
6519 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 6642 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
6520 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6643 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
6644 case EXIT_REASON_INVEPT:
6521 /* 6645 /*
6522 * VMX instructions trap unconditionally. This allows L1 to 6646 * VMX instructions trap unconditionally. This allows L1 to
6523 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6647 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6550 return nested_cpu_has2(vmcs12, 6674 return nested_cpu_has2(vmcs12,
6551 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 6675 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
6552 case EXIT_REASON_EPT_VIOLATION: 6676 case EXIT_REASON_EPT_VIOLATION:
6677 /*
6678 * L0 always deals with the EPT violation. If nested EPT is
6679 * used, and the nested mmu code discovers that the address is
6680 * missing in the guest EPT table (EPT12), the EPT violation
6681 * will be injected with nested_ept_inject_page_fault()
6682 */
6683 return 0;
6553 case EXIT_REASON_EPT_MISCONFIG: 6684 case EXIT_REASON_EPT_MISCONFIG:
6685 /*
6686 * L2 never uses directly L1's EPT, but rather L0's own EPT
6687 * table (shadow on EPT) or a merged EPT table that L0 built
6688 * (EPT on EPT). So any problems with the structure of the
6689 * table is L0's fault.
6690 */
6554 return 0; 6691 return 0;
6555 case EXIT_REASON_PREEMPTION_TIMER: 6692 case EXIT_REASON_PREEMPTION_TIMER:
6556 return vmcs12->pin_based_vm_exec_control & 6693 return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6638 6775
6639 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6776 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
6640 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6777 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
6641 get_vmcs12(vcpu), vcpu)))) { 6778 get_vmcs12(vcpu))))) {
6642 if (vmx_interrupt_allowed(vcpu)) { 6779 if (vmx_interrupt_allowed(vcpu)) {
6643 vmx->soft_vnmi_blocked = 0; 6780 vmx->soft_vnmi_blocked = 0;
6644 } else if (vmx->vnmi_blocked_time > 1000000000LL && 6781 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7326 entry->ecx |= bit(X86_FEATURE_VMX); 7463 entry->ecx |= bit(X86_FEATURE_VMX);
7327} 7464}
7328 7465
7466static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
7467 struct x86_exception *fault)
7468{
7469 struct vmcs12 *vmcs12;
7470 nested_vmx_vmexit(vcpu);
7471 vmcs12 = get_vmcs12(vcpu);
7472
7473 if (fault->error_code & PFERR_RSVD_MASK)
7474 vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
7475 else
7476 vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
7477 vmcs12->exit_qualification = vcpu->arch.exit_qualification;
7478 vmcs12->guest_physical_address = fault->address;
7479}
7480
7481/* Callbacks for nested_ept_init_mmu_context: */
7482
7483static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
7484{
7485 /* return the page table to be shadowed - in our case, EPT12 */
7486 return get_vmcs12(vcpu)->ept_pointer;
7487}
7488
7489static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
7490{
7491 int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
7492 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
7493
7494 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
7495 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
7496 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
7497
7498 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
7499
7500 return r;
7501}
7502
7503static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
7504{
7505 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
7506}
7507
7329/* 7508/*
7330 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 7509 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
7331 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 7510 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7388 vmcs12->guest_interruptibility_info); 7567 vmcs12->guest_interruptibility_info);
7389 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 7568 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
7390 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 7569 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
7391 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); 7570 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
7392 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 7571 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
7393 vmcs12->guest_pending_dbg_exceptions); 7572 vmcs12->guest_pending_dbg_exceptions);
7394 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 7573 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7508 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 7687 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
7509 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 7688 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
7510 7689
7511 /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ 7690 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
7512 vmcs_write32(VM_EXIT_CONTROLS, 7691 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
7513 vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); 7692 * bits are further modified by vmx_set_efer() below.
7514 vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | 7693 */
7694 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
7695
7696 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
7697 * emulated by vmx_set_efer(), below.
7698 */
7699 vmcs_write32(VM_ENTRY_CONTROLS,
7700 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
7701 ~VM_ENTRY_IA32E_MODE) |
7515 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 7702 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
7516 7703
7517 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) 7704 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
7518 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 7705 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
7519 else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 7706 vcpu->arch.pat = vmcs12->guest_ia32_pat;
7707 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
7520 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 7708 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
7521 7709
7522 7710
@@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7538 vmx_flush_tlb(vcpu); 7726 vmx_flush_tlb(vcpu);
7539 } 7727 }
7540 7728
7729 if (nested_cpu_has_ept(vmcs12)) {
7730 kvm_mmu_unload(vcpu);
7731 nested_ept_init_mmu_context(vcpu);
7732 }
7733
7541 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 7734 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
7542 vcpu->arch.efer = vmcs12->guest_ia32_efer; 7735 vcpu->arch.efer = vmcs12->guest_ia32_efer;
7543 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7736 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7565 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 7758 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
7566 kvm_mmu_reset_context(vcpu); 7759 kvm_mmu_reset_context(vcpu);
7567 7760
7761 /*
 7762 * L1 may access L2's PDPTRs, so save them to construct vmcs12
7763 */
7764 if (enable_ept) {
7765 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
7766 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
7767 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
7768 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
7769 }
7770
7568 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 7771 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
7569 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 7772 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
7570} 7773}
@@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7887 vmcs12->guest_pending_dbg_exceptions = 8090 vmcs12->guest_pending_dbg_exceptions =
7888 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 8091 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
7889 8092
8093 /*
8094 * In some cases (usually, nested EPT), L2 is allowed to change its
8095 * own CR3 without exiting. If it has changed it, we must keep it.
8096 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
8097 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
8098 *
8099 * Additionally, restore L2's PDPTR to vmcs12.
8100 */
8101 if (enable_ept) {
8102 vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
8103 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
8104 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
8105 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
8106 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
8107 }
8108
7890 vmcs12->vm_entry_controls = 8109 vmcs12->vm_entry_controls =
7891 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 8110 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
7892 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); 8111 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7948static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 8167static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7949 struct vmcs12 *vmcs12) 8168 struct vmcs12 *vmcs12)
7950{ 8169{
8170 struct kvm_segment seg;
8171
7951 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 8172 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
7952 vcpu->arch.efer = vmcs12->host_ia32_efer; 8173 vcpu->arch.efer = vmcs12->host_ia32_efer;
7953 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 8174 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7982 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 8203 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
7983 kvm_set_cr4(vcpu, vmcs12->host_cr4); 8204 kvm_set_cr4(vcpu, vmcs12->host_cr4);
7984 8205
7985 /* shadow page tables on either EPT or shadow page tables */ 8206 if (nested_cpu_has_ept(vmcs12))
8207 nested_ept_uninit_mmu_context(vcpu);
8208
7986 kvm_set_cr3(vcpu, vmcs12->host_cr3); 8209 kvm_set_cr3(vcpu, vmcs12->host_cr3);
7987 kvm_mmu_reset_context(vcpu); 8210 kvm_mmu_reset_context(vcpu);
7988 8211
@@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
8001 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 8224 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
8002 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 8225 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
8003 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 8226 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
8004 vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); 8227
8005 vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); 8228 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
8006 vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
8007 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
8008 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
8009 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
8010 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
8011 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
8012 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
8013 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
8014
8015 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
8016 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 8229 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
8230 vcpu->arch.pat = vmcs12->host_ia32_pat;
8231 }
8017 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 8232 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
8018 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 8233 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
8019 vmcs12->host_ia32_perf_global_ctrl); 8234 vmcs12->host_ia32_perf_global_ctrl);
8020 8235
8236 /* Set L1 segment info according to Intel SDM
8237 27.5.2 Loading Host Segment and Descriptor-Table Registers */
8238 seg = (struct kvm_segment) {
8239 .base = 0,
8240 .limit = 0xFFFFFFFF,
8241 .selector = vmcs12->host_cs_selector,
8242 .type = 11,
8243 .present = 1,
8244 .s = 1,
8245 .g = 1
8246 };
8247 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
8248 seg.l = 1;
8249 else
8250 seg.db = 1;
8251 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
8252 seg = (struct kvm_segment) {
8253 .base = 0,
8254 .limit = 0xFFFFFFFF,
8255 .type = 3,
8256 .present = 1,
8257 .s = 1,
8258 .db = 1,
8259 .g = 1
8260 };
8261 seg.selector = vmcs12->host_ds_selector;
8262 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
8263 seg.selector = vmcs12->host_es_selector;
8264 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
8265 seg.selector = vmcs12->host_ss_selector;
8266 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
8267 seg.selector = vmcs12->host_fs_selector;
8268 seg.base = vmcs12->host_fs_base;
8269 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
8270 seg.selector = vmcs12->host_gs_selector;
8271 seg.base = vmcs12->host_gs_base;
8272 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
8273 seg = (struct kvm_segment) {
8274 .base = vmcs12->host_tr_base,
8275 .limit = 0x67,
8276 .selector = vmcs12->host_tr_selector,
8277 .type = 11,
8278 .present = 1
8279 };
8280 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
8281
8021 kvm_set_dr(vcpu, 7, 0x400); 8282 kvm_set_dr(vcpu, 7, 0x400);
8022 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 8283 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
8023} 8284}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d21bce505315..e5ca72a5cdb6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
682 */ 682 */
683 } 683 }
684 684
685 /*
686 * Does the new cr3 value map to physical memory? (Note, we
687 * catch an invalid cr3 even in real-mode, because it would
688 * cause trouble later on when we turn on paging anyway.)
689 *
690 * A real CPU would silently accept an invalid cr3 and would
691 * attempt to use it - with largely undefined (and often hard
692 * to debug) behavior on the guest side.
693 */
694 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
695 return 1;
696 vcpu->arch.cr3 = cr3; 685 vcpu->arch.cr3 = cr3;
697 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 686 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
698 vcpu->arch.mmu.new_cr3(vcpu); 687 vcpu->arch.mmu.new_cr3(vcpu);
@@ -850,7 +839,8 @@ static u32 msrs_to_save[] = {
850#ifdef CONFIG_X86_64 839#ifdef CONFIG_X86_64
851 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 840 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
852#endif 841#endif
853 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 842 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
843 MSR_IA32_FEATURE_CONTROL
854}; 844};
855 845
856static unsigned num_msrs_to_save; 846static unsigned num_msrs_to_save;
@@ -1457,6 +1447,29 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1457#endif 1447#endif
1458} 1448}
1459 1449
1450static void kvm_gen_update_masterclock(struct kvm *kvm)
1451{
1452#ifdef CONFIG_X86_64
1453 int i;
1454 struct kvm_vcpu *vcpu;
1455 struct kvm_arch *ka = &kvm->arch;
1456
1457 spin_lock(&ka->pvclock_gtod_sync_lock);
1458 kvm_make_mclock_inprogress_request(kvm);
1459 /* no guest entries from this point */
1460 pvclock_update_vm_gtod_copy(kvm);
1461
1462 kvm_for_each_vcpu(i, vcpu, kvm)
1463 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1464
1465 /* guest entries allowed */
1466 kvm_for_each_vcpu(i, vcpu, kvm)
1467 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
1468
1469 spin_unlock(&ka->pvclock_gtod_sync_lock);
1470#endif
1471}
1472
1460static int kvm_guest_time_update(struct kvm_vcpu *v) 1473static int kvm_guest_time_update(struct kvm_vcpu *v)
1461{ 1474{
1462 unsigned long flags, this_tsc_khz; 1475 unsigned long flags, this_tsc_khz;
@@ -3806,6 +3819,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
3806 delta = user_ns.clock - now_ns; 3819 delta = user_ns.clock - now_ns;
3807 local_irq_enable(); 3820 local_irq_enable();
3808 kvm->arch.kvmclock_offset = delta; 3821 kvm->arch.kvmclock_offset = delta;
3822 kvm_gen_update_masterclock(kvm);
3809 break; 3823 break;
3810 } 3824 }
3811 case KVM_GET_CLOCK: { 3825 case KVM_GET_CLOCK: {
@@ -4955,6 +4969,97 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4955static int complete_emulated_mmio(struct kvm_vcpu *vcpu); 4969static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4956static int complete_emulated_pio(struct kvm_vcpu *vcpu); 4970static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4957 4971
4972static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
4973 unsigned long *db)
4974{
4975 u32 dr6 = 0;
4976 int i;
4977 u32 enable, rwlen;
4978
4979 enable = dr7;
4980 rwlen = dr7 >> 16;
4981 for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
4982 if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
4983 dr6 |= (1 << i);
4984 return dr6;
4985}
4986
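kvm_vcpu_check_hw_bp() above walks DR7 two enable bits and four R/W-LEN bits at a time. A self-contained re-run of that loop for a single, locally enabled instruction breakpoint in DR0 (illustrative values only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t dr7 = 0x00000401;	/* bit 10 is always 1; L0 enables breakpoint 0 */
	unsigned long db[4] = { 0x401000, 0, 0, 0 };
	unsigned long addr = 0x401000;	/* instruction address being checked */
	uint32_t enable = dr7, rwlen = dr7 >> 16, dr6 = 0;

	for (int i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
		if ((enable & 3) && (rwlen & 15) == 0 /* type 0: execute */ && db[i] == addr)
			dr6 |= 1u << i;

	assert(dr6 == 1);	/* DR6.B0 would be reported for this hit */
	return 0;
}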
4987static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
4988{
4989 struct kvm_run *kvm_run = vcpu->run;
4990
4991 /*
4992 * Use the "raw" value to see if TF was passed to the processor.
4993 * Note that the new value of the flags has not been saved yet.
4994 *
4995 * This is correct even for TF set by the guest, because "the
4996 * processor will not generate this exception after the instruction
4997 * that sets the TF flag".
4998 */
4999 unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5000
5001 if (unlikely(rflags & X86_EFLAGS_TF)) {
5002 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5003 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
5004 kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
5005 kvm_run->debug.arch.exception = DB_VECTOR;
5006 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5007 *r = EMULATE_USER_EXIT;
5008 } else {
5009 vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
5010 /*
5011 * "Certain debug exceptions may clear bit 0-3. The
5012 * remaining contents of the DR6 register are never
5013 * cleared by the processor".
5014 */
5015 vcpu->arch.dr6 &= ~15;
5016 vcpu->arch.dr6 |= DR6_BS;
5017 kvm_queue_exception(vcpu, DB_VECTOR);
5018 }
5019 }
5020}
5021
5022static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5023{
5024 struct kvm_run *kvm_run = vcpu->run;
5025 unsigned long eip = vcpu->arch.emulate_ctxt.eip;
5026 u32 dr6 = 0;
5027
5028 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
5029 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
5030 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5031 vcpu->arch.guest_debug_dr7,
5032 vcpu->arch.eff_db);
5033
5034 if (dr6 != 0) {
5035 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5036 kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
5037 get_segment_base(vcpu, VCPU_SREG_CS);
5038
5039 kvm_run->debug.arch.exception = DB_VECTOR;
5040 kvm_run->exit_reason = KVM_EXIT_DEBUG;
5041 *r = EMULATE_USER_EXIT;
5042 return true;
5043 }
5044 }
5045
5046 if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
5047 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5048 vcpu->arch.dr7,
5049 vcpu->arch.db);
5050
5051 if (dr6 != 0) {
5052 vcpu->arch.dr6 &= ~15;
5053 vcpu->arch.dr6 |= dr6;
5054 kvm_queue_exception(vcpu, DB_VECTOR);
5055 *r = EMULATE_DONE;
5056 return true;
5057 }
5058 }
5059
5060 return false;
5061}
5062
4958int x86_emulate_instruction(struct kvm_vcpu *vcpu, 5063int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4959 unsigned long cr2, 5064 unsigned long cr2,
4960 int emulation_type, 5065 int emulation_type,
@@ -4975,6 +5080,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4975 5080
4976 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 5081 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4977 init_emulate_ctxt(vcpu); 5082 init_emulate_ctxt(vcpu);
5083
5084 /*
5085 * We will reenter on the same instruction since
5086 * we do not set complete_userspace_io. This does not
 5087 * handle watchpoints yet; those would be handled in
5088 * the emulate_ops.
5089 */
5090 if (kvm_vcpu_check_breakpoint(vcpu, &r))
5091 return r;
5092
4978 ctxt->interruptibility = 0; 5093 ctxt->interruptibility = 0;
4979 ctxt->have_exception = false; 5094 ctxt->have_exception = false;
4980 ctxt->perm_ok = false; 5095 ctxt->perm_ok = false;
@@ -5031,17 +5146,18 @@ restart:
5031 inject_emulated_exception(vcpu); 5146 inject_emulated_exception(vcpu);
5032 r = EMULATE_DONE; 5147 r = EMULATE_DONE;
5033 } else if (vcpu->arch.pio.count) { 5148 } else if (vcpu->arch.pio.count) {
5034 if (!vcpu->arch.pio.in) 5149 if (!vcpu->arch.pio.in) {
5150 /* FIXME: return into emulator if single-stepping. */
5035 vcpu->arch.pio.count = 0; 5151 vcpu->arch.pio.count = 0;
5036 else { 5152 } else {
5037 writeback = false; 5153 writeback = false;
5038 vcpu->arch.complete_userspace_io = complete_emulated_pio; 5154 vcpu->arch.complete_userspace_io = complete_emulated_pio;
5039 } 5155 }
5040 r = EMULATE_DO_MMIO; 5156 r = EMULATE_USER_EXIT;
5041 } else if (vcpu->mmio_needed) { 5157 } else if (vcpu->mmio_needed) {
5042 if (!vcpu->mmio_is_write) 5158 if (!vcpu->mmio_is_write)
5043 writeback = false; 5159 writeback = false;
5044 r = EMULATE_DO_MMIO; 5160 r = EMULATE_USER_EXIT;
5045 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 5161 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5046 } else if (r == EMULATION_RESTART) 5162 } else if (r == EMULATION_RESTART)
5047 goto restart; 5163 goto restart;
@@ -5050,10 +5166,12 @@ restart:
5050 5166
5051 if (writeback) { 5167 if (writeback) {
5052 toggle_interruptibility(vcpu, ctxt->interruptibility); 5168 toggle_interruptibility(vcpu, ctxt->interruptibility);
5053 kvm_set_rflags(vcpu, ctxt->eflags);
5054 kvm_make_request(KVM_REQ_EVENT, vcpu); 5169 kvm_make_request(KVM_REQ_EVENT, vcpu);
5055 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5170 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5056 kvm_rip_write(vcpu, ctxt->eip); 5171 kvm_rip_write(vcpu, ctxt->eip);
5172 if (r == EMULATE_DONE)
5173 kvm_vcpu_check_singlestep(vcpu, &r);
5174 kvm_set_rflags(vcpu, ctxt->eflags);
5057 } else 5175 } else
5058 vcpu->arch.emulate_regs_need_sync_to_vcpu = true; 5176 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
5059 5177
@@ -5347,7 +5465,7 @@ static struct notifier_block pvclock_gtod_notifier = {
5347int kvm_arch_init(void *opaque) 5465int kvm_arch_init(void *opaque)
5348{ 5466{
5349 int r; 5467 int r;
5350 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 5468 struct kvm_x86_ops *ops = opaque;
5351 5469
5352 if (kvm_x86_ops) { 5470 if (kvm_x86_ops) {
5353 printk(KERN_ERR "kvm: already loaded the other module\n"); 5471 printk(KERN_ERR "kvm: already loaded the other module\n");
@@ -5495,6 +5613,23 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5495 return 1; 5613 return 1;
5496} 5614}
5497 5615
5616/*
5617 * kvm_pv_kick_cpu_op: Kick a vcpu.
5618 *
5619 * @apicid - apicid of vcpu to be kicked.
5620 */
5621static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5622{
5623 struct kvm_lapic_irq lapic_irq;
5624
5625 lapic_irq.shorthand = 0;
5626 lapic_irq.dest_mode = 0;
5627 lapic_irq.dest_id = apicid;
5628
5629 lapic_irq.delivery_mode = APIC_DM_REMRD;
5630 kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
5631}
5632
5498int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 5633int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5499{ 5634{
5500 unsigned long nr, a0, a1, a2, a3, ret; 5635 unsigned long nr, a0, a1, a2, a3, ret;
@@ -5528,6 +5663,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5528 case KVM_HC_VAPIC_POLL_IRQ: 5663 case KVM_HC_VAPIC_POLL_IRQ:
5529 ret = 0; 5664 ret = 0;
5530 break; 5665 break;
5666 case KVM_HC_KICK_CPU:
5667 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
5668 ret = 0;
5669 break;
5531 default: 5670 default:
5532 ret = -KVM_ENOSYS; 5671 ret = -KVM_ENOSYS;
5533 break; 5672 break;
@@ -5689,29 +5828,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
5689 kvm_make_request(KVM_REQ_EVENT, vcpu); 5828 kvm_make_request(KVM_REQ_EVENT, vcpu);
5690} 5829}
5691 5830
5692static void kvm_gen_update_masterclock(struct kvm *kvm)
5693{
5694#ifdef CONFIG_X86_64
5695 int i;
5696 struct kvm_vcpu *vcpu;
5697 struct kvm_arch *ka = &kvm->arch;
5698
5699 spin_lock(&ka->pvclock_gtod_sync_lock);
5700 kvm_make_mclock_inprogress_request(kvm);
5701 /* no guest entries from this point */
5702 pvclock_update_vm_gtod_copy(kvm);
5703
5704 kvm_for_each_vcpu(i, vcpu, kvm)
5705 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5706
5707 /* guest entries allowed */
5708 kvm_for_each_vcpu(i, vcpu, kvm)
5709 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5710
5711 spin_unlock(&ka->pvclock_gtod_sync_lock);
5712#endif
5713}
5714
5715static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 5831static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
5716{ 5832{
5717 u64 eoi_exit_bitmap[4]; 5833 u64 eoi_exit_bitmap[4];
@@ -5950,6 +6066,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5950 kvm_apic_accept_events(vcpu); 6066 kvm_apic_accept_events(vcpu);
5951 switch(vcpu->arch.mp_state) { 6067 switch(vcpu->arch.mp_state) {
5952 case KVM_MP_STATE_HALTED: 6068 case KVM_MP_STATE_HALTED:
6069 vcpu->arch.pv.pv_unhalted = false;
5953 vcpu->arch.mp_state = 6070 vcpu->arch.mp_state =
5954 KVM_MP_STATE_RUNNABLE; 6071 KVM_MP_STATE_RUNNABLE;
5955 case KVM_MP_STATE_RUNNABLE: 6072 case KVM_MP_STATE_RUNNABLE:
@@ -6061,6 +6178,8 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6061 6178
6062 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 6179 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
6063 vcpu->mmio_needed = 0; 6180 vcpu->mmio_needed = 0;
6181
6182 /* FIXME: return into emulator if single-stepping. */
6064 if (vcpu->mmio_is_write) 6183 if (vcpu->mmio_is_write)
6065 return 1; 6184 return 1;
6066 vcpu->mmio_read_completed = 1; 6185 vcpu->mmio_read_completed = 1;
@@ -6249,7 +6368,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6249 struct kvm_mp_state *mp_state) 6368 struct kvm_mp_state *mp_state)
6250{ 6369{
6251 kvm_apic_accept_events(vcpu); 6370 kvm_apic_accept_events(vcpu);
6252 mp_state->mp_state = vcpu->arch.mp_state; 6371 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
6372 vcpu->arch.pv.pv_unhalted)
6373 mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
6374 else
6375 mp_state->mp_state = vcpu->arch.mp_state;
6376
6253 return 0; 6377 return 0;
6254} 6378}
6255 6379
@@ -6770,6 +6894,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6770 BUG_ON(vcpu->kvm == NULL); 6894 BUG_ON(vcpu->kvm == NULL);
6771 kvm = vcpu->kvm; 6895 kvm = vcpu->kvm;
6772 6896
6897 vcpu->arch.pv.pv_unhalted = false;
6773 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 6898 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6774 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6899 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6775 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6900 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -7019,6 +7144,15 @@ out_free:
7019 return -ENOMEM; 7144 return -ENOMEM;
7020} 7145}
7021 7146
7147void kvm_arch_memslots_updated(struct kvm *kvm)
7148{
7149 /*
7150 * memslots->generation has been incremented.
7151 * mmio generation may have reached its maximum value.
7152 */
7153 kvm_mmu_invalidate_mmio_sptes(kvm);
7154}
7155
7022int kvm_arch_prepare_memory_region(struct kvm *kvm, 7156int kvm_arch_prepare_memory_region(struct kvm *kvm,
7023 struct kvm_memory_slot *memslot, 7157 struct kvm_memory_slot *memslot,
7024 struct kvm_userspace_memory_region *mem, 7158 struct kvm_userspace_memory_region *mem,
@@ -7079,11 +7213,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7079 */ 7213 */
7080 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) 7214 if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7081 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 7215 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
7082 /*
7083 * If memory slot is created, or moved, we need to clear all
7084 * mmio sptes.
7085 */
7086 kvm_mmu_invalidate_mmio_sptes(kvm);
7087} 7216}
7088 7217
7089void kvm_arch_flush_shadow_all(struct kvm *kvm) 7218void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7103,6 +7232,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7103 !vcpu->arch.apf.halted) 7232 !vcpu->arch.apf.halted)
7104 || !list_empty_careful(&vcpu->async_pf.done) 7233 || !list_empty_careful(&vcpu->async_pf.done)
7105 || kvm_apic_has_events(vcpu) 7234 || kvm_apic_has_events(vcpu)
7235 || vcpu->arch.pv.pv_unhalted
7106 || atomic_read(&vcpu->arch.nmi_queued) || 7236 || atomic_read(&vcpu->arch.nmi_queued) ||
7107 (kvm_arch_interrupt_allowed(vcpu) && 7237 (kvm_arch_interrupt_allowed(vcpu) &&
7108 kvm_cpu_has_interrupt(vcpu)); 7238 kvm_cpu_has_interrupt(vcpu));
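
The KVM_HC_KICK_CPU handler added above is the host half of pv-ticketlocks: a vCPU that halted while waiting for its ticket is woken by a hypercall from the vCPU releasing the lock, and pv_unhalted keeps it runnable until it actually resumes. A minimal guest-side sketch follows; it assumes the documented x86 KVM hypercall convention (number in RAX, a0 in RBX, a1 in RCX, issued via vmcall on Intel or vmmcall on AMD) and the KVM_HC_KICK_CPU number from include/uapi/linux/kvm_para.h. The helper names are illustrative and not part of this series (the real guest code goes in through the tip tree).

/* Illustrative guest-side sketch only.  Assumes the x86 KVM hypercall
 * ABI: nr in RAX, a0 in RBX, a1 in RCX; the kernel's real helper picks
 * vmcall or vmmcall depending on the CPU vendor.
 */
#include <uapi/linux/kvm_para.h>        /* KVM_HC_KICK_CPU */

static inline long kvm_hypercall2_sketch(unsigned int nr, unsigned long p1,
                                         unsigned long p2)
{
        long ret;

        asm volatile("vmcall"
                     : "=a"(ret)
                     : "a"(nr), "b"(p1), "c"(p2)
                     : "memory");
        return ret;
}

/* Wake the vCPU with the given APIC ID; a0 (flags) is currently unused
 * and a1 is the APIC ID, matching kvm_pv_kick_cpu_op(kvm, a0, a1) above. */
static void kick_halted_vcpu(int apicid)
{
        kvm_hypercall2_sketch(KVM_HC_KICK_CPU, 0, apicid);
}
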
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index c74436e687bf..72074d528400 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
-	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;
 
 
 	/*
-	 * When looping to get a consistent (time-info, tsc) pair, we
-	 * also need to deal with the possibility we can switch vcpus,
-	 * so make sure we always re-fetch time-info for the current vcpu.
+	 * Note: hypervisor must guarantee that:
+	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+	 * 2. that per-CPU pvclock time info is updated if the
+	 *    underlying CPU changes.
+	 * 3. that version is increased whenever underlying CPU
+	 *    changes.
+	 *
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 		pvti = get_pvti(cpu);
 
-		migrate_count = pvti->migrate_count;
-
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
 		/*
@@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode)
 		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
 	} while (unlikely(cpu != cpu1 ||
 			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version ||
-			  pvti->migrate_count != migrate_count));
+			  pvti->pvti.version != version));
 
 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
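
The simplified vread_pvclock() loop above can drop migrate_count because the hypervisor now bumps the pvclock version whenever the backing CPU changes, so checking the version alone is sufficient. For readers unfamiliar with that protocol, here is a generic seqcount-style sketch of the version check the loop relies on; the structure and names are illustrative, not the real pvclock_vcpu_time_info layout.

/* Sketch of the producer/consumer protocol: the producer makes 'version'
 * odd before updating the payload and even afterwards, so the reader
 * retries whenever it sees an odd or changed version.
 */
struct sample {
        volatile unsigned int version;
        volatile unsigned long long value;
};

static unsigned long long read_sample(const struct sample *s)
{
        unsigned int v;
        unsigned long long val;

        do {
                v = s->version;                 /* odd: update in progress */
                __sync_synchronize();           /* read payload after version */
                val = s->value;
                __sync_synchronize();           /* re-check version after payload */
        } while ((v & 1) || v != s->version);

        return val;
}
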
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 5daa2599ed48..e373671652b0 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -200,11 +200,9 @@ config DMA_SHARED_BUFFER
 	  APIs extension; the file's descriptor can then be passed on to other
 	  driver.
 
-config CMA
-	bool "Contiguous Memory Allocator"
-	depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK
-	select MIGRATION
-	select MEMORY_ISOLATION
+config DMA_CMA
+	bool "DMA Contiguous Memory Allocator"
+	depends on HAVE_DMA_CONTIGUOUS && CMA
 	help
 	  This enables the Contiguous Memory Allocator which allows drivers
 	  to allocate big physically-contiguous blocks of memory for use with
@@ -213,17 +211,7 @@ config CMA
 	  For more information see <include/linux/dma-contiguous.h>.
 	  If unsure, say "n".
 
-if CMA
-
-config CMA_DEBUG
-	bool "CMA debug messages (DEVELOPMENT)"
-	depends on DEBUG_KERNEL
-	help
-	  Turns on debug messages in CMA.  This produces KERN_DEBUG
-	  messages for every CMA call as well as various messages while
-	  processing calls such as dma_alloc_from_contiguous().
-	  This option does not affect warning and error messages.
-
+if DMA_CMA
 comment "Default contiguous memory area size:"
 
 config CMA_SIZE_MBYTES
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 48029aa477d9..94e8a80e87f8 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -6,7 +6,7 @@ obj-y			:= core.o bus.o dd.o syscore.o \
 			   attribute_container.o transport_class.o \
 			   topology.o
 obj-$(CONFIG_DEVTMPFS)	+= devtmpfs.o
-obj-$(CONFIG_CMA) += dma-contiguous.o
+obj-$(CONFIG_DMA_CMA) += dma-contiguous.o
 obj-y			+= power/
 obj-$(CONFIG_HAS_DMA)	+= dma-mapping.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 343744e4809c..7e2d15837b02 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -26,7 +26,7 @@
 #include <linux/types.h>
 #include <linux/irqchip/arm-gic.h>
 
-#define VGIC_NR_IRQS		128
+#define VGIC_NR_IRQS		256
 #define VGIC_NR_SGIS		16
 #define VGIC_NR_PPIS		16
 #define VGIC_NR_PRIVATE_IRQS	(VGIC_NR_SGIS + VGIC_NR_PPIS)
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 01b5c84be828..00141d3325fe 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -57,7 +57,7 @@ struct cma;
 struct page;
 struct device;
 
-#ifdef CONFIG_CMA
+#ifdef CONFIG_DMA_CMA
 
 /*
  * There is always at least global CMA area and a few optional device
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a63d83ebd151..ca645a01d37a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -85,6 +85,12 @@ static inline bool is_noslot_pfn(pfn_t pfn)
 	return pfn == KVM_PFN_NOSLOT;
 }
 
+/*
+ * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390)
+ * provide own defines and kvm_is_error_hva
+ */
+#ifndef KVM_HVA_ERR_BAD
+
 #define KVM_HVA_ERR_BAD		(PAGE_OFFSET)
 #define KVM_HVA_ERR_RO_BAD	(PAGE_OFFSET + PAGE_SIZE)
 
@@ -93,6 +99,8 @@ static inline bool kvm_is_error_hva(unsigned long addr)
 	return addr >= PAGE_OFFSET;
 }
 
+#endif
+
 #define KVM_ERR_PTR_BAD_PAGE	(ERR_PTR(-ENOENT))
 
 static inline bool is_error_page(struct page *page)
@@ -160,8 +168,12 @@ enum kvm_bus {
 
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val);
+int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, const void *val, long cookie);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
 		    void *val);
+int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			   int len, void *val, long cookie);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -499,6 +511,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
+void kvm_arch_memslots_updated(struct kvm *kvm);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
 				   struct kvm_userspace_memory_region *mem,
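
kvm_arch_memslots_updated() above is a new hook that every architecture now supplies; x86 uses it (earlier in this diff) to invalidate MMIO sptes once memslots->generation has been bumped. As a hedged sketch, an architecture with no MMIO spte caching would only need the no-op form below, assuming nothing beyond the declaration shown here:

/* Minimal sketch for an architecture that needs no action when the
 * memslot generation changes; only the prototype above is assumed. */
void kvm_arch_memslots_updated(struct kvm *kvm)
{
}
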
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f79ced719435..ce1e1c0aaa33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -107,14 +107,6 @@ extern unsigned long this_cpu_load(void);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
-/* Notifier for when a task gets migrated to a new CPU */
-struct task_migration_notifier {
-	struct task_struct *task;
-	int from_cpu;
-	int to_cpu;
-};
-extern void register_task_migration_notifier(struct notifier_block *n);
-
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index acccd08be6c7..99c25338ede8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_ARM_EL1_32BIT 93
+#define KVM_CAP_SPAPR_MULTITCE 94
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 725aa067ad63..5ac63c9a995a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -978,13 +978,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq->skip_clock_update = 1;
 }
 
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-	atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1015,18 +1008,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
-		struct task_migration_notifier tmn;
-
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
-
-		tmn.task = p;
-		tmn.from_cpu = task_cpu(p);
-		tmn.to_cpu = new_cpu;
-
-		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}
 
 	__set_task_cpu(p, new_cpu);
diff --git a/mm/Kconfig b/mm/Kconfig
index 8028dcc6615c..6cdd27043303 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -478,6 +478,30 @@ config FRONTSWAP
 
 	  If unsure, say Y to enable frontswap.
 
+config CMA
+	bool "Contiguous Memory Allocator"
+	depends on HAVE_MEMBLOCK
+	select MIGRATION
+	select MEMORY_ISOLATION
+	help
+	  This enables the Contiguous Memory Allocator which allows other
+	  subsystems to allocate big physically-contiguous blocks of memory.
+	  CMA reserves a region of memory and allows only movable pages to
+	  be allocated from it. This way, the kernel can use the memory for
+	  pagecache and when a subsystem requests for contiguous area, the
+	  allocated pages are migrated away to serve the contiguous request.
+
+	  If unsure, say "n".
+
+config CMA_DEBUG
+	bool "CMA debug messages (DEVELOPMENT)"
+	depends on DEBUG_KERNEL && CMA
+	help
+	  Turns on debug messages in CMA.  This produces KERN_DEBUG
+	  messages for every CMA call as well as various messages while
+	  processing calls such as dma_alloc_from_contiguous().
+	  This option does not affect warning and error messages.
+
 config ZBUD
 	tristate
 	default n
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 17c5ac7d10ed..685fc72fc751 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -149,7 +149,7 @@ static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
 {
 	offset >>= 2;
 	BUG_ON(offset > (VGIC_NR_IRQS / 4));
-	if (offset < 4)
+	if (offset < 8)
 		return x->percpu[cpuid] + offset;
 	else
 		return x->shared + offset - 8;
@@ -432,19 +432,13 @@ static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
 static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int i, c;
-	unsigned long *bmap;
+	int i;
 	u32 val = 0;
 
 	irq -= VGIC_NR_PRIVATE_IRQS;
 
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
-		for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
-			if (test_bit(irq + i, bmap))
-				val |= 1 << (c + i * 8);
-	}
+	for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
+		val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
 
 	return val;
 }
@@ -547,8 +541,12 @@ static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
 			    struct kvm_exit_mmio *mmio, phys_addr_t offset)
 {
 	u32 val;
-	u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
-				       vcpu->vcpu_id, offset >> 1);
+	u32 *reg;
+
+	offset >>= 1;
+	reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
+				  vcpu->vcpu_id, offset);
+
 	if (offset & 2)
 		val = *reg >> 16;
 	else
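
The rewritten vgic_get_target_reg() above reads SPI targets straight from dist->irq_spi_cpu[] instead of scanning per-vCPU bitmaps. A standalone sketch of the GICD_ITARGETSRn packing it produces follows, where each 32-bit register covers four interrupts with one CPU-target bitmask byte per interrupt; the array and helper names here are illustrative, not kernel code.

/* Sketch: pack four per-interrupt CPU targets into one GICD_ITARGETSRn
 * value, one byte per interrupt, bit 'cpu' set within each byte.
 * 'targets' stands in for dist->irq_spi_cpu[irq..irq+3].
 */
#include <stdio.h>

static unsigned int pack_itargetsr(const unsigned char targets[4])
{
        unsigned int val = 0;
        int i;

        for (i = 0; i < 4; i++)
                val |= 1U << (targets[i] + i * 8);

        return val;
}

int main(void)
{
        unsigned char targets[4] = { 0, 1, 0, 3 };      /* CPUs 0, 1, 0, 3 */

        /* prints 0x08010201: byte i holds 1 << targets[i] */
        printf("0x%08x\n", pack_itargetsr(targets));
        return 0;
}
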
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4ace4e..bf040c4e02b3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -102,28 +102,8 @@ static bool largepages_enabled = true;
 
 bool kvm_is_mmio_pfn(pfn_t pfn)
 {
-	if (pfn_valid(pfn)) {
-		int reserved;
-		struct page *tail = pfn_to_page(pfn);
-		struct page *head = compound_trans_head(tail);
-		reserved = PageReserved(head);
-		if (head != tail) {
-			/*
-			 * "head" is not a dangling pointer
-			 * (compound_trans_head takes care of that)
-			 * but the hugepage may have been splitted
-			 * from under us (and we may not hold a
-			 * reference count on the head page so it can
-			 * be reused before we run PageReferenced), so
-			 * we've to check PageTail before returning
-			 * what we just read.
-			 */
-			smp_rmb();
-			if (PageTail(tail))
-				return reserved;
-		}
-		return PageReserved(tail);
-	}
+	if (pfn_valid(pfn))
+		return PageReserved(pfn_to_page(pfn));
 
 	return true;
 }
@@ -731,7 +711,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	update_memslots(slots, new, kvm->memslots->generation);
 	rcu_assign_pointer(kvm->memslots, slots);
 	synchronize_srcu_expedited(&kvm->srcu);
-	return old_memslots;
+
+	kvm_arch_memslots_updated(kvm);
+
+	return old_memslots;
 }
 
 /*
@@ -1893,7 +1876,7 @@ static struct file_operations kvm_vcpu_fops = {
  */
 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 {
-	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
+	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
 /*
@@ -2302,7 +2285,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 		return ret;
 	}
 
-	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
+	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
 		ops->destroy(dev);
 		return ret;
@@ -2586,7 +2569,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 		return r;
 	}
 #endif
-	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC);
 	if (r < 0)
 		kvm_put_kvm(kvm);
 
@@ -2812,11 +2795,9 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 	kfree(bus);
 }
 
-static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
+				 const struct kvm_io_range *r2)
 {
-	const struct kvm_io_range *r1 = p1;
-	const struct kvm_io_range *r2 = p2;
-
 	if (r1->addr < r2->addr)
 		return -1;
 	if (r1->addr + r1->len > r2->addr + r2->len)
@@ -2824,6 +2805,11 @@ static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
 	return 0;
 }
 
+static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+	return kvm_io_bus_cmp(p1, p2);
+}
+
 static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
 				 gpa_t addr, int len)
 {
@@ -2857,17 +2843,54 @@ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
 
 	off = range - bus->range;
 
-	while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
 		off--;
 
 	return off;
 }
 
+static int __kvm_io_bus_write(struct kvm_io_bus *bus,
+			      struct kvm_io_range *range, const void *val)
+{
+	int idx;
+
+	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
+	if (idx < 0)
+		return -EOPNOTSUPP;
+
+	while (idx < bus->dev_count &&
+		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_write(bus->range[idx].dev, range->addr,
+					range->len, val))
+			return idx;
+		idx++;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int idx;
+	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+	int r;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	r = __kvm_io_bus_write(bus, &range, val);
+	return r < 0 ? r : 0;
+}
+
+/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
+int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			    int len, const void *val, long cookie)
+{
 	struct kvm_io_bus *bus;
 	struct kvm_io_range range;
 
@@ -2877,14 +2900,35 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	};
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	idx = kvm_io_bus_get_first_dev(bus, addr, len);
+
+	/* First try the device referenced by cookie. */
+	if ((cookie >= 0) && (cookie < bus->dev_count) &&
+	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+		if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len,
+					val))
+			return cookie;
+
+	/*
+	 * cookie contained garbage; fall back to search and return the
+	 * correct cookie value.
+	 */
+	return __kvm_io_bus_write(bus, &range, val);
+}
+
+static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
+			     void *val)
+{
+	int idx;
+
+	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
 	if (idx < 0)
 		return -EOPNOTSUPP;
 
 	while (idx < bus->dev_count &&
-		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
-			return 0;
+		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+		if (!kvm_iodevice_read(bus->range[idx].dev, range->addr,
+				       range->len, val))
+			return idx;
 		idx++;
 	}
 
@@ -2895,9 +2939,9 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
-	int idx;
 	struct kvm_io_bus *bus;
 	struct kvm_io_range range;
+	int r;
 
 	range = (struct kvm_io_range) {
 		.addr = addr,
@@ -2905,18 +2949,36 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	};
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	idx = kvm_io_bus_get_first_dev(bus, addr, len);
-	if (idx < 0)
-		return -EOPNOTSUPP;
-
-	while (idx < bus->dev_count &&
-		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
-			return 0;
-		idx++;
-	}
-
-	return -EOPNOTSUPP;
+	r = __kvm_io_bus_read(bus, &range, val);
+	return r < 0 ? r : 0;
+}
+
+/* kvm_io_bus_read_cookie - called under kvm->slots_lock */
+int kvm_io_bus_read_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+			   int len, void *val, long cookie)
+{
+	struct kvm_io_bus *bus;
+	struct kvm_io_range range;
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+	};
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+
+	/* First try the device referenced by cookie. */
+	if ((cookie >= 0) && (cookie < bus->dev_count) &&
+	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+		if (!kvm_iodevice_read(bus->range[cookie].dev, addr, len,
+				       val))
+			return cookie;
+
+	/*
+	 * cookie contained garbage; fall back to search and return the
+	 * correct cookie value.
+	 */
+	return __kvm_io_bus_read(bus, &range, val);
 }
 
 /* Caller must hold slots_lock. */
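
The cookie variants added above let a hot path skip the binary search over the io bus: a successful call returns the index of the device that handled the access, the caller passes that value back next time, and a stale cookie simply falls through to the normal search. A hedged caller-side sketch follows, with cached_cookie and fast_mmio_write as illustrative names (kernel context assumed, using only the kvm_io_bus_write_cookie() signature and KVM_MMIO_BUS enum shown in this diff).

#include <linux/kvm_host.h>

/* Illustrative only: cache the cookie returned by the last successful
 * write and hand it back on the next access to the same device. */
static long cached_cookie = -1;         /* -1: nothing cached yet */

static int fast_mmio_write(struct kvm *kvm, gpa_t addr,
                           const void *val, int len)
{
        long ret;

        /* Returns the matching device index (the new cookie) on success,
         * or -EOPNOTSUPP if no device on KVM_MMIO_BUS claimed the access. */
        ret = kvm_io_bus_write_cookie(kvm, KVM_MMIO_BUS, addr, len,
                                      val, cached_cookie);
        if (ret < 0)
                return ret;

        cached_cookie = ret;
        return 0;
}
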