-rw-r--r--  Documentation/virtual/kvm/api.txt | 140
-rw-r--r--  MAINTAINERS | 5
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 7
-rw-r--r--  arch/powerpc/include/asm/Kbuild | 1
-rw-r--r--  arch/powerpc/include/asm/epapr_hcalls.h | 83
-rw-r--r--  arch/powerpc/include/asm/fsl_hcalls.h | 36
-rw-r--r--  arch/powerpc/include/asm/kvm_asm.h | 1
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 12
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 33
-rw-r--r--  arch/powerpc/include/asm/kvm_booke_hv_asm.h | 29
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 68
-rw-r--r--  arch/powerpc/include/asm/kvm_para.h | 15
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 87
-rw-r--r--  arch/powerpc/include/asm/mmu-book3e.h | 2
-rw-r--r--  arch/powerpc/include/asm/mmu-hash64.h | 10
-rw-r--r--  arch/powerpc/include/asm/reg.h | 1
-rw-r--r--  arch/powerpc/include/asm/reg_booke.h | 7
-rw-r--r--  arch/powerpc/include/asm/smp.h | 8
-rw-r--r--  arch/powerpc/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/powerpc/include/uapi/asm/epapr_hcalls.h | 98
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 86
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm_para.h | 7
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 4
-rw-r--r--  arch/powerpc/kernel/epapr_hcalls.S | 28
-rw-r--r--  arch/powerpc/kernel/epapr_paravirt.c | 11
-rw-r--r--  arch/powerpc/kernel/kvm.c | 2
-rw-r--r--  arch/powerpc/kernel/ppc_ksyms.c | 5
-rw-r--r--  arch/powerpc/kernel/smp.c | 46
-rw-r--r--  arch/powerpc/kvm/44x.c | 1
-rw-r--r--  arch/powerpc/kvm/44x_emulate.c | 112
-rw-r--r--  arch/powerpc/kvm/Kconfig | 4
-rw-r--r--  arch/powerpc/kvm/Makefile | 5
-rw-r--r--  arch/powerpc/kvm/book3s.c | 125
-rw-r--r--  arch/powerpc/kvm/book3s_32_mmu_host.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_host.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 474
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 16
-rw-r--r--  arch/powerpc/kvm/book3s_exports.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 655
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_hv_ras.c | 144
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 143
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 142
-rw-r--r--  arch/powerpc/kvm/book3s_mmu_hpte.c | 5
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 294
-rw-r--r--  arch/powerpc/kvm/book3s_rmhandlers.S | 18
-rw-r--r--  arch/powerpc/kvm/booke.c | 346
-rw-r--r--  arch/powerpc/kvm/booke.h | 1
-rw-r--r--  arch/powerpc/kvm/booke_emulate.c | 36
-rw-r--r--  arch/powerpc/kvm/bookehv_interrupts.S | 145
-rw-r--r--  arch/powerpc/kvm/e500.h | 11
-rw-r--r--  arch/powerpc/kvm/e500_emulate.c | 14
-rw-r--r--  arch/powerpc/kvm/e500_tlb.c | 132
-rw-r--r--  arch/powerpc/kvm/emulate.c | 221
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 187
-rw-r--r--  arch/powerpc/kvm/trace.h | 200
-rw-r--r--  arch/powerpc/platforms/Kconfig | 1
-rw-r--r--  arch/powerpc/sysdev/fsl_msi.c | 9
-rw-r--r--  arch/powerpc/sysdev/fsl_soc.c | 2
-rw-r--r--  arch/s390/kvm/interrupt.c | 19
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 7
-rw-r--r--  arch/x86/include/asm/clocksource.h | 1
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 1
-rw-r--r--  arch/x86/include/asm/fixmap.h | 5
-rw-r--r--  arch/x86/include/asm/kexec.h | 3
-rw-r--r--  arch/x86/include/asm/kvm_guest.h | 6
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 24
-rw-r--r--  arch/x86/include/asm/msr-index.h | 1
-rw-r--r--  arch/x86/include/asm/pvclock.h | 47
-rw-r--r--  arch/x86/include/asm/vmx.h | 3
-rw-r--r--  arch/x86/include/asm/vsyscall.h | 20
-rw-r--r--  arch/x86/kernel/crash.c | 32
-rw-r--r--  arch/x86/kernel/kvm.c | 20
-rw-r--r--  arch/x86/kernel/kvmclock.c | 88
-rw-r--r--  arch/x86/kernel/pvclock.c | 143
-rw-r--r--  arch/x86/kvm/cpuid.c | 3
-rw-r--r--  arch/x86/kvm/cpuid.h | 8
-rw-r--r--  arch/x86/kvm/emulate.c | 5
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  arch/x86/kvm/mmu.c | 65
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 115
-rw-r--r--  arch/x86/kvm/svm.c | 48
-rw-r--r--  arch/x86/kvm/trace.h | 63
-rw-r--r--  arch/x86/kvm/vmx.c | 203
-rw-r--r--  arch/x86/kvm/x86.c | 548
-rw-r--r--  arch/x86/kvm/x86.h | 2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 81
-rw-r--r--  arch/x86/vdso/vgetcpu.c | 11
-rw-r--r--  drivers/tty/Kconfig | 1
-rw-r--r--  drivers/virt/Kconfig | 1
-rw-r--r--  include/linux/kvm_host.h | 53
-rw-r--r--  include/linux/pvclock_gtod.h | 9
-rw-r--r--  include/linux/sched.h | 8
-rw-r--r--  include/uapi/linux/kvm.h | 21
-rw-r--r--  kernel/sched/core.c | 15
-rw-r--r--  kernel/time/timekeeping.c | 50
-rw-r--r--  virt/kvm/assigned-dev.c | 36
-rw-r--r--  virt/kvm/eventfd.c | 8
-rw-r--r--  virt/kvm/iommu.c | 10
-rw-r--r--  virt/kvm/irq_comm.c | 83
-rw-r--r--  virt/kvm/kvm_main.c | 57
101 files changed, 4926 insertions, 1289 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f6ec3a92e621..a4df5535996b 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1194,12 +1194,15 @@ struct kvm_ppc_pvinfo {
1194This ioctl fetches PV specific information that need to be passed to the guest 1194This ioctl fetches PV specific information that need to be passed to the guest
1195using the device tree or other means from vm context. 1195using the device tree or other means from vm context.
1196 1196
1197For now the only implemented piece of information distributed here is an array 1197The hcall array defines 4 instructions that make up a hypercall.
1198of 4 instructions that make up a hypercall.
1199 1198
1200If any additional field gets added to this structure later on, a bit for that 1199If any additional field gets added to this structure later on, a bit for that
1201additional piece of information will be set in the flags bitmap. 1200additional piece of information will be set in the flags bitmap.
1202 1201
1202The flags bitmap is defined as:
1203
1204 /* the host supports the ePAPR idle hcall
1205 #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0)
1203 1206
12044.48 KVM_ASSIGN_PCI_DEVICE 12074.48 KVM_ASSIGN_PCI_DEVICE
1205 1208
@@ -1731,7 +1734,46 @@ registers, find a list below:
1731 Arch | Register | Width (bits) 1734 Arch | Register | Width (bits)
1732 | | 1735 | |
1733 PPC | KVM_REG_PPC_HIOR | 64 1736 PPC | KVM_REG_PPC_HIOR | 64
1734 1737 PPC | KVM_REG_PPC_IAC1 | 64
1738 PPC | KVM_REG_PPC_IAC2 | 64
1739 PPC | KVM_REG_PPC_IAC3 | 64
1740 PPC | KVM_REG_PPC_IAC4 | 64
1741 PPC | KVM_REG_PPC_DAC1 | 64
1742 PPC | KVM_REG_PPC_DAC2 | 64
1743 PPC | KVM_REG_PPC_DABR | 64
1744 PPC | KVM_REG_PPC_DSCR | 64
1745 PPC | KVM_REG_PPC_PURR | 64
1746 PPC | KVM_REG_PPC_SPURR | 64
1747 PPC | KVM_REG_PPC_DAR | 64
1748 PPC | KVM_REG_PPC_DSISR | 32
1749 PPC | KVM_REG_PPC_AMR | 64
1750 PPC | KVM_REG_PPC_UAMOR | 64
1751 PPC | KVM_REG_PPC_MMCR0 | 64
1752 PPC | KVM_REG_PPC_MMCR1 | 64
1753 PPC | KVM_REG_PPC_MMCRA | 64
1754 PPC | KVM_REG_PPC_PMC1 | 32
1755 PPC | KVM_REG_PPC_PMC2 | 32
1756 PPC | KVM_REG_PPC_PMC3 | 32
1757 PPC | KVM_REG_PPC_PMC4 | 32
1758 PPC | KVM_REG_PPC_PMC5 | 32
1759 PPC | KVM_REG_PPC_PMC6 | 32
1760 PPC | KVM_REG_PPC_PMC7 | 32
1761 PPC | KVM_REG_PPC_PMC8 | 32
1762 PPC | KVM_REG_PPC_FPR0 | 64
1763 ...
1764 PPC | KVM_REG_PPC_FPR31 | 64
1765 PPC | KVM_REG_PPC_VR0 | 128
1766 ...
1767 PPC | KVM_REG_PPC_VR31 | 128
1768 PPC | KVM_REG_PPC_VSR0 | 128
1769 ...
1770 PPC | KVM_REG_PPC_VSR31 | 128
1771 PPC | KVM_REG_PPC_FPSCR | 64
1772 PPC | KVM_REG_PPC_VSCR | 32
1773 PPC | KVM_REG_PPC_VPA_ADDR | 64
1774 PPC | KVM_REG_PPC_VPA_SLB | 128
1775 PPC | KVM_REG_PPC_VPA_DTL | 128
1776 PPC | KVM_REG_PPC_EPCR | 32
1735 1777
17364.69 KVM_GET_ONE_REG 17784.69 KVM_GET_ONE_REG
1737 1779
@@ -1747,7 +1789,7 @@ kvm_one_reg struct passed in. On success, the register value can be found
1747at the memory location pointed to by "addr". 1789at the memory location pointed to by "addr".
1748 1790
1749The list of registers accessible using this interface is identical to the 1791The list of registers accessible using this interface is identical to the
1750list in 4.64. 1792list in 4.68.
1751 1793
1752 1794
17534.70 KVM_KVMCLOCK_CTRL 17954.70 KVM_KVMCLOCK_CTRL
@@ -1997,6 +2039,93 @@ return the hash table order in the parameter. (If the guest is using
1997the virtualized real-mode area (VRMA) facility, the kernel will 2039the virtualized real-mode area (VRMA) facility, the kernel will
1998re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) 2040re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
1999 2041
20424.77 KVM_S390_INTERRUPT
2043
2044Capability: basic
2045Architectures: s390
2046Type: vm ioctl, vcpu ioctl
2047Parameters: struct kvm_s390_interrupt (in)
2048Returns: 0 on success, -1 on error
2049
2050Allows to inject an interrupt to the guest. Interrupts can be floating
2051(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type.
2052
2053Interrupt parameters are passed via kvm_s390_interrupt:
2054
2055struct kvm_s390_interrupt {
2056 __u32 type;
2057 __u32 parm;
2058 __u64 parm64;
2059};
2060
2061type can be one of the following:
2062
2063KVM_S390_SIGP_STOP (vcpu) - sigp restart
2064KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm
2065KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm
2066KVM_S390_RESTART (vcpu) - restart
2067KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt
2068 parameters in parm and parm64
2069KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
2070KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm
2071KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
2072
2073Note that the vcpu ioctl is asynchronous to vcpu execution.
2074
20754.78 KVM_PPC_GET_HTAB_FD
2076
2077Capability: KVM_CAP_PPC_HTAB_FD
2078Architectures: powerpc
2079Type: vm ioctl
2080Parameters: Pointer to struct kvm_get_htab_fd (in)
2081Returns: file descriptor number (>= 0) on success, -1 on error
2082
2083This returns a file descriptor that can be used either to read out the
2084entries in the guest's hashed page table (HPT), or to write entries to
2085initialize the HPT. The returned fd can only be written to if the
2086KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and
2087can only be read if that bit is clear. The argument struct looks like
2088this:
2089
2090/* For KVM_PPC_GET_HTAB_FD */
2091struct kvm_get_htab_fd {
2092 __u64 flags;
2093 __u64 start_index;
2094 __u64 reserved[2];
2095};
2096
2097/* Values for kvm_get_htab_fd.flags */
2098#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1)
2099#define KVM_GET_HTAB_WRITE ((__u64)0x2)
2100
2101The `start_index' field gives the index in the HPT of the entry at
2102which to start reading. It is ignored when writing.
2103
2104Reads on the fd will initially supply information about all
2105"interesting" HPT entries. Interesting entries are those with the
2106bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise
2107all entries. When the end of the HPT is reached, the read() will
2108return. If read() is called again on the fd, it will start again from
2109the beginning of the HPT, but will only return HPT entries that have
2110changed since they were last read.
2111
2112Data read or written is structured as a header (8 bytes) followed by a
2113series of valid HPT entries (16 bytes) each. The header indicates how
2114many valid HPT entries there are and how many invalid entries follow
2115the valid entries. The invalid entries are not represented explicitly
2116in the stream. The header format is:
2117
2118struct kvm_get_htab_header {
2119 __u32 index;
2120 __u16 n_valid;
2121 __u16 n_invalid;
2122};
2123
2124Writes to the fd create HPT entries starting at the index given in the
2125header; first `n_valid' valid entries with contents from the data
2126written, then `n_invalid' invalid entries, invalidating any previously
2127valid entries found.
2128
2000 2129
20015. The kvm_run structure 21305. The kvm_run structure
2002------------------------ 2131------------------------
@@ -2109,7 +2238,8 @@ executed a memory-mapped I/O instruction which could not be satisfied
2109by kvm. The 'data' member contains the written data if 'is_write' is 2238by kvm. The 'data' member contains the written data if 'is_write' is
2110true, and should be filled by application code otherwise. 2239true, and should be filled by application code otherwise.
2111 2240
2112NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding 2241NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR
2242 and KVM_EXIT_PAPR the corresponding
2113operations are complete (and guest state is consistent) only after userspace 2243operations are complete (and guest state is consistent) only after userspace
2114has re-entered the kernel with KVM_RUN. The kernel side will first finish 2244has re-entered the kernel with KVM_RUN. The kernel side will first finish
2115incomplete operations and then check for pending signals. Userspace 2245incomplete operations and then check for pending signals. Userspace
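
The KVM_PPC_GET_HTAB_FD interface documented above can be driven from userspace roughly as follows. This is a sketch only: it assumes an already-open VM file descriptor vmfd and the structures quoted in the patch (from linux/kvm.h), and error handling is reduced to the bare minimum.

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: list the bolted HPT entries of a guest through the fd returned
 * by KVM_PPC_GET_HTAB_FD (illustrative, not part of the patch). */
static int dump_bolted_hptes(int vmfd)
{
	struct kvm_get_htab_fd ghf = {
		.flags = KVM_GET_HTAB_BOLTED_ONLY,
		.start_index = 0,
	};
	char buf[4096], *p;
	ssize_t len;
	int fd;

	fd = ioctl(vmfd, KVM_PPC_GET_HTAB_FD, &ghf);
	if (fd < 0)
		return -1;

	/* One read() shown here; a real reader keeps reading until the whole
	 * HPT has been traversed.  Each record is a kvm_get_htab_header
	 * followed by n_valid 16-byte HPTEs; the n_invalid entries that
	 * follow them carry no data in the stream. */
	len = read(fd, buf, sizeof(buf));
	for (p = buf; len > 0 && p + sizeof(struct kvm_get_htab_header) <= buf + len; ) {
		struct kvm_get_htab_header *hdr = (struct kvm_get_htab_header *)p;

		printf("index %u: %u valid, %u invalid\n",
		       hdr->index, hdr->n_valid, hdr->n_invalid);
		p += sizeof(*hdr) + hdr->n_valid * 16;
	}

	close(fd);
	return 0;
}
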
diff --git a/MAINTAINERS b/MAINTAINERS
index b0b880da6e5c..42f07ea5bbc9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4314,10 +4314,10 @@ F: include/linux/kvm*
4314F: virt/kvm/ 4314F: virt/kvm/
4315 4315
4316KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V 4316KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
4317M: Joerg Roedel <joerg.roedel@amd.com> 4317M: Joerg Roedel <joro@8bytes.org>
4318L: kvm@vger.kernel.org 4318L: kvm@vger.kernel.org
4319W: http://kvm.qumranet.com 4319W: http://kvm.qumranet.com
4320S: Supported 4320S: Maintained
4321F: arch/x86/include/asm/svm.h 4321F: arch/x86/include/asm/svm.h
4322F: arch/x86/kvm/svm.c 4322F: arch/x86/kvm/svm.c
4323 4323
@@ -4325,6 +4325,7 @@ KERNEL VIRTUAL MACHINE (KVM) FOR POWERPC
4325M: Alexander Graf <agraf@suse.de> 4325M: Alexander Graf <agraf@suse.de>
4326L: kvm-ppc@vger.kernel.org 4326L: kvm-ppc@vger.kernel.org
4327W: http://kvm.qumranet.com 4327W: http://kvm.qumranet.com
4328T: git git://github.com/agraf/linux-2.6.git
4328S: Supported 4329S: Supported
4329F: arch/powerpc/include/asm/kvm* 4330F: arch/powerpc/include/asm/kvm*
4330F: arch/powerpc/kvm/ 4331F: arch/powerpc/kvm/
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0a88cb5d316d..bd1c51555038 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1330,6 +1330,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
1330 return 0; 1330 return 0;
1331} 1331}
1332 1332
1333int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
1334{
1335 return 0;
1336}
1337
1333int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 1338int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
1334{ 1339{
1335 return -EINVAL; 1340 return -EINVAL;
@@ -1362,11 +1367,9 @@ static void kvm_release_vm_pages(struct kvm *kvm)
1362 struct kvm_memslots *slots; 1367 struct kvm_memslots *slots;
1363 struct kvm_memory_slot *memslot; 1368 struct kvm_memory_slot *memslot;
1364 int j; 1369 int j;
1365 unsigned long base_gfn;
1366 1370
1367 slots = kvm_memslots(kvm); 1371 slots = kvm_memslots(kvm);
1368 kvm_for_each_memslot(memslot, slots) { 1372 kvm_for_each_memslot(memslot, slots) {
1369 base_gfn = memslot->base_gfn;
1370 for (j = 0; j < memslot->npages; j++) { 1373 for (j = 0; j < memslot->npages; j++) {
1371 if (memslot->rmap[j]) 1374 if (memslot->rmap[j])
1372 put_page((struct page *)memslot->rmap[j]); 1375 put_page((struct page *)memslot->rmap[j]);
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 2d62b484b3fc..650757c300db 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -1,5 +1,4 @@
1 1
2
3generic-y += clkdev.h 2generic-y += clkdev.h
4generic-y += rwsem.h 3generic-y += rwsem.h
5generic-y += trace_clock.h 4generic-y += trace_clock.h
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index bf2c06c33871..d3d634274d2c 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -50,64 +50,13 @@
50#ifndef _EPAPR_HCALLS_H 50#ifndef _EPAPR_HCALLS_H
51#define _EPAPR_HCALLS_H 51#define _EPAPR_HCALLS_H
52 52
53#include <uapi/asm/epapr_hcalls.h>
54
55#ifndef __ASSEMBLY__
53#include <linux/types.h> 56#include <linux/types.h>
54#include <linux/errno.h> 57#include <linux/errno.h>
55#include <asm/byteorder.h> 58#include <asm/byteorder.h>
56 59
57#define EV_BYTE_CHANNEL_SEND 1
58#define EV_BYTE_CHANNEL_RECEIVE 2
59#define EV_BYTE_CHANNEL_POLL 3
60#define EV_INT_SET_CONFIG 4
61#define EV_INT_GET_CONFIG 5
62#define EV_INT_SET_MASK 6
63#define EV_INT_GET_MASK 7
64#define EV_INT_IACK 9
65#define EV_INT_EOI 10
66#define EV_INT_SEND_IPI 11
67#define EV_INT_SET_TASK_PRIORITY 12
68#define EV_INT_GET_TASK_PRIORITY 13
69#define EV_DOORBELL_SEND 14
70#define EV_MSGSND 15
71#define EV_IDLE 16
72
73/* vendor ID: epapr */
74#define EV_LOCAL_VENDOR_ID 0 /* for private use */
75#define EV_EPAPR_VENDOR_ID 1
76#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */
77#define EV_IBM_VENDOR_ID 3 /* IBM */
78#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */
79#define EV_ENEA_VENDOR_ID 5 /* Enea */
80#define EV_WR_VENDOR_ID 6 /* Wind River Systems */
81#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */
82#define EV_KVM_VENDOR_ID 42 /* KVM */
83
84/* The max number of bytes that a byte channel can send or receive per call */
85#define EV_BYTE_CHANNEL_MAX_BYTES 16
86
87
88#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
89#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
90
91/* epapr error codes */
92#define EV_EPERM 1 /* Operation not permitted */
93#define EV_ENOENT 2 /* Entry Not Found */
94#define EV_EIO 3 /* I/O error occured */
95#define EV_EAGAIN 4 /* The operation had insufficient
96 * resources to complete and should be
97 * retried
98 */
99#define EV_ENOMEM 5 /* There was insufficient memory to
100 * complete the operation */
101#define EV_EFAULT 6 /* Bad guest address */
102#define EV_ENODEV 7 /* No such device */
103#define EV_EINVAL 8 /* An argument supplied to the hcall
104 was out of range or invalid */
105#define EV_INTERNAL 9 /* An internal error occured */
106#define EV_CONFIG 10 /* A configuration error was detected */
107#define EV_INVALID_STATE 11 /* The object is in an invalid state */
108#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
109#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */
110
111/* 60/*
112 * Hypercall register clobber list 61 * Hypercall register clobber list
113 * 62 *
@@ -193,7 +142,7 @@ static inline unsigned int ev_int_set_config(unsigned int interrupt,
193 r5 = priority; 142 r5 = priority;
194 r6 = destination; 143 r6 = destination;
195 144
196 __asm__ __volatile__ ("sc 1" 145 asm volatile("bl epapr_hypercall_start"
197 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6) 146 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
198 : : EV_HCALL_CLOBBERS4 147 : : EV_HCALL_CLOBBERS4
199 ); 148 );
@@ -222,7 +171,7 @@ static inline unsigned int ev_int_get_config(unsigned int interrupt,
222 r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG); 171 r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG);
223 r3 = interrupt; 172 r3 = interrupt;
224 173
225 __asm__ __volatile__ ("sc 1" 174 asm volatile("bl epapr_hypercall_start"
226 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6) 175 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6)
227 : : EV_HCALL_CLOBBERS4 176 : : EV_HCALL_CLOBBERS4
228 ); 177 );
@@ -252,7 +201,7 @@ static inline unsigned int ev_int_set_mask(unsigned int interrupt,
252 r3 = interrupt; 201 r3 = interrupt;
253 r4 = mask; 202 r4 = mask;
254 203
255 __asm__ __volatile__ ("sc 1" 204 asm volatile("bl epapr_hypercall_start"
256 : "+r" (r11), "+r" (r3), "+r" (r4) 205 : "+r" (r11), "+r" (r3), "+r" (r4)
257 : : EV_HCALL_CLOBBERS2 206 : : EV_HCALL_CLOBBERS2
258 ); 207 );
@@ -277,7 +226,7 @@ static inline unsigned int ev_int_get_mask(unsigned int interrupt,
277 r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK); 226 r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK);
278 r3 = interrupt; 227 r3 = interrupt;
279 228
280 __asm__ __volatile__ ("sc 1" 229 asm volatile("bl epapr_hypercall_start"
281 : "+r" (r11), "+r" (r3), "=r" (r4) 230 : "+r" (r11), "+r" (r3), "=r" (r4)
282 : : EV_HCALL_CLOBBERS2 231 : : EV_HCALL_CLOBBERS2
283 ); 232 );
@@ -305,7 +254,7 @@ static inline unsigned int ev_int_eoi(unsigned int interrupt)
305 r11 = EV_HCALL_TOKEN(EV_INT_EOI); 254 r11 = EV_HCALL_TOKEN(EV_INT_EOI);
306 r3 = interrupt; 255 r3 = interrupt;
307 256
308 __asm__ __volatile__ ("sc 1" 257 asm volatile("bl epapr_hypercall_start"
309 : "+r" (r11), "+r" (r3) 258 : "+r" (r11), "+r" (r3)
310 : : EV_HCALL_CLOBBERS1 259 : : EV_HCALL_CLOBBERS1
311 ); 260 );
@@ -344,7 +293,7 @@ static inline unsigned int ev_byte_channel_send(unsigned int handle,
344 r7 = be32_to_cpu(p[2]); 293 r7 = be32_to_cpu(p[2]);
345 r8 = be32_to_cpu(p[3]); 294 r8 = be32_to_cpu(p[3]);
346 295
347 __asm__ __volatile__ ("sc 1" 296 asm volatile("bl epapr_hypercall_start"
348 : "+r" (r11), "+r" (r3), 297 : "+r" (r11), "+r" (r3),
349 "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8) 298 "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8)
350 : : EV_HCALL_CLOBBERS6 299 : : EV_HCALL_CLOBBERS6
@@ -383,7 +332,7 @@ static inline unsigned int ev_byte_channel_receive(unsigned int handle,
383 r3 = handle; 332 r3 = handle;
384 r4 = *count; 333 r4 = *count;
385 334
386 __asm__ __volatile__ ("sc 1" 335 asm volatile("bl epapr_hypercall_start"
387 : "+r" (r11), "+r" (r3), "+r" (r4), 336 : "+r" (r11), "+r" (r3), "+r" (r4),
388 "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8) 337 "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8)
389 : : EV_HCALL_CLOBBERS6 338 : : EV_HCALL_CLOBBERS6
@@ -421,7 +370,7 @@ static inline unsigned int ev_byte_channel_poll(unsigned int handle,
421 r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL); 370 r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL);
422 r3 = handle; 371 r3 = handle;
423 372
424 __asm__ __volatile__ ("sc 1" 373 asm volatile("bl epapr_hypercall_start"
425 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5) 374 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5)
426 : : EV_HCALL_CLOBBERS3 375 : : EV_HCALL_CLOBBERS3
427 ); 376 );
@@ -454,7 +403,7 @@ static inline unsigned int ev_int_iack(unsigned int handle,
454 r11 = EV_HCALL_TOKEN(EV_INT_IACK); 403 r11 = EV_HCALL_TOKEN(EV_INT_IACK);
455 r3 = handle; 404 r3 = handle;
456 405
457 __asm__ __volatile__ ("sc 1" 406 asm volatile("bl epapr_hypercall_start"
458 : "+r" (r11), "+r" (r3), "=r" (r4) 407 : "+r" (r11), "+r" (r3), "=r" (r4)
459 : : EV_HCALL_CLOBBERS2 408 : : EV_HCALL_CLOBBERS2
460 ); 409 );
@@ -478,7 +427,7 @@ static inline unsigned int ev_doorbell_send(unsigned int handle)
478 r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND); 427 r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND);
479 r3 = handle; 428 r3 = handle;
480 429
481 __asm__ __volatile__ ("sc 1" 430 asm volatile("bl epapr_hypercall_start"
482 : "+r" (r11), "+r" (r3) 431 : "+r" (r11), "+r" (r3)
483 : : EV_HCALL_CLOBBERS1 432 : : EV_HCALL_CLOBBERS1
484 ); 433 );
@@ -498,12 +447,12 @@ static inline unsigned int ev_idle(void)
498 447
499 r11 = EV_HCALL_TOKEN(EV_IDLE); 448 r11 = EV_HCALL_TOKEN(EV_IDLE);
500 449
501 __asm__ __volatile__ ("sc 1" 450 asm volatile("bl epapr_hypercall_start"
502 : "+r" (r11), "=r" (r3) 451 : "+r" (r11), "=r" (r3)
503 : : EV_HCALL_CLOBBERS1 452 : : EV_HCALL_CLOBBERS1
504 ); 453 );
505 454
506 return r3; 455 return r3;
507} 456}
508 457#endif /* !__ASSEMBLY__ */
509#endif 458#endif /* _EPAPR_HCALLS_H */
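
The recurring change in this header replaces the hard-coded "sc 1" trap with a call to epapr_hypercall_start, a stub that the kernel is expected to patch at boot with the hypercall sequence the hypervisor advertises in the device tree (see the epapr_paravirt.c entry in this series), so the same wrappers also work when no fixed trap instruction can be assumed at compile time. A possible caller of one of these wrappers is sketched below; the helper name and its errno mapping are invented for illustration.

#include <linux/errno.h>
#include <linux/string.h>
#include <asm/epapr_hcalls.h>

/* Hypothetical caller: push up to one byte-channel buffer's worth of data
 * through ev_byte_channel_send() and map the ePAPR status to an errno. */
static int example_byte_channel_write(unsigned int handle,
				      const char *msg, unsigned int len)
{
	char buf[EV_BYTE_CHANNEL_MAX_BYTES];
	unsigned int ret;

	if (len > sizeof(buf))
		len = sizeof(buf);
	memset(buf, 0, sizeof(buf));
	memcpy(buf, msg, len);

	/* 'len' is in/out: on return it holds the number of bytes accepted. */
	ret = ev_byte_channel_send(handle, &len, buf);
	if (ret == EV_EAGAIN)
		return -EAGAIN;		/* channel full, try again later */

	return ret ? -EIO : (int)len;
}
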
diff --git a/arch/powerpc/include/asm/fsl_hcalls.h b/arch/powerpc/include/asm/fsl_hcalls.h
index 922d9b5fe3d5..3abb58394da4 100644
--- a/arch/powerpc/include/asm/fsl_hcalls.h
+++ b/arch/powerpc/include/asm/fsl_hcalls.h
@@ -96,7 +96,7 @@ static inline unsigned int fh_send_nmi(unsigned int vcpu_mask)
96 r11 = FH_HCALL_TOKEN(FH_SEND_NMI); 96 r11 = FH_HCALL_TOKEN(FH_SEND_NMI);
97 r3 = vcpu_mask; 97 r3 = vcpu_mask;
98 98
99 __asm__ __volatile__ ("sc 1" 99 asm volatile("bl epapr_hypercall_start"
100 : "+r" (r11), "+r" (r3) 100 : "+r" (r11), "+r" (r3)
101 : : EV_HCALL_CLOBBERS1 101 : : EV_HCALL_CLOBBERS1
102 ); 102 );
@@ -151,7 +151,7 @@ static inline unsigned int fh_partition_get_dtprop(int handle,
151 r9 = (uint32_t)propvalue_addr; 151 r9 = (uint32_t)propvalue_addr;
152 r10 = *propvalue_len; 152 r10 = *propvalue_len;
153 153
154 __asm__ __volatile__ ("sc 1" 154 asm volatile("bl epapr_hypercall_start"
155 : "+r" (r11), 155 : "+r" (r11),
156 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), 156 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7),
157 "+r" (r8), "+r" (r9), "+r" (r10) 157 "+r" (r8), "+r" (r9), "+r" (r10)
@@ -205,7 +205,7 @@ static inline unsigned int fh_partition_set_dtprop(int handle,
205 r9 = (uint32_t)propvalue_addr; 205 r9 = (uint32_t)propvalue_addr;
206 r10 = propvalue_len; 206 r10 = propvalue_len;
207 207
208 __asm__ __volatile__ ("sc 1" 208 asm volatile("bl epapr_hypercall_start"
209 : "+r" (r11), 209 : "+r" (r11),
210 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), 210 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7),
211 "+r" (r8), "+r" (r9), "+r" (r10) 211 "+r" (r8), "+r" (r9), "+r" (r10)
@@ -229,7 +229,7 @@ static inline unsigned int fh_partition_restart(unsigned int partition)
229 r11 = FH_HCALL_TOKEN(FH_PARTITION_RESTART); 229 r11 = FH_HCALL_TOKEN(FH_PARTITION_RESTART);
230 r3 = partition; 230 r3 = partition;
231 231
232 __asm__ __volatile__ ("sc 1" 232 asm volatile("bl epapr_hypercall_start"
233 : "+r" (r11), "+r" (r3) 233 : "+r" (r11), "+r" (r3)
234 : : EV_HCALL_CLOBBERS1 234 : : EV_HCALL_CLOBBERS1
235 ); 235 );
@@ -262,7 +262,7 @@ static inline unsigned int fh_partition_get_status(unsigned int partition,
262 r11 = FH_HCALL_TOKEN(FH_PARTITION_GET_STATUS); 262 r11 = FH_HCALL_TOKEN(FH_PARTITION_GET_STATUS);
263 r3 = partition; 263 r3 = partition;
264 264
265 __asm__ __volatile__ ("sc 1" 265 asm volatile("bl epapr_hypercall_start"
266 : "+r" (r11), "+r" (r3), "=r" (r4) 266 : "+r" (r11), "+r" (r3), "=r" (r4)
267 : : EV_HCALL_CLOBBERS2 267 : : EV_HCALL_CLOBBERS2
268 ); 268 );
@@ -295,7 +295,7 @@ static inline unsigned int fh_partition_start(unsigned int partition,
295 r4 = entry_point; 295 r4 = entry_point;
296 r5 = load; 296 r5 = load;
297 297
298 __asm__ __volatile__ ("sc 1" 298 asm volatile("bl epapr_hypercall_start"
299 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5) 299 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5)
300 : : EV_HCALL_CLOBBERS3 300 : : EV_HCALL_CLOBBERS3
301 ); 301 );
@@ -317,7 +317,7 @@ static inline unsigned int fh_partition_stop(unsigned int partition)
317 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP); 317 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP);
318 r3 = partition; 318 r3 = partition;
319 319
320 __asm__ __volatile__ ("sc 1" 320 asm volatile("bl epapr_hypercall_start"
321 : "+r" (r11), "+r" (r3) 321 : "+r" (r11), "+r" (r3)
322 : : EV_HCALL_CLOBBERS1 322 : : EV_HCALL_CLOBBERS1
323 ); 323 );
@@ -376,7 +376,7 @@ static inline unsigned int fh_partition_memcpy(unsigned int source,
376#endif 376#endif
377 r7 = count; 377 r7 = count;
378 378
379 __asm__ __volatile__ ("sc 1" 379 asm volatile("bl epapr_hypercall_start"
380 : "+r" (r11), 380 : "+r" (r11),
381 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7) 381 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7)
382 : : EV_HCALL_CLOBBERS5 382 : : EV_HCALL_CLOBBERS5
@@ -399,7 +399,7 @@ static inline unsigned int fh_dma_enable(unsigned int liodn)
399 r11 = FH_HCALL_TOKEN(FH_DMA_ENABLE); 399 r11 = FH_HCALL_TOKEN(FH_DMA_ENABLE);
400 r3 = liodn; 400 r3 = liodn;
401 401
402 __asm__ __volatile__ ("sc 1" 402 asm volatile("bl epapr_hypercall_start"
403 : "+r" (r11), "+r" (r3) 403 : "+r" (r11), "+r" (r3)
404 : : EV_HCALL_CLOBBERS1 404 : : EV_HCALL_CLOBBERS1
405 ); 405 );
@@ -421,7 +421,7 @@ static inline unsigned int fh_dma_disable(unsigned int liodn)
421 r11 = FH_HCALL_TOKEN(FH_DMA_DISABLE); 421 r11 = FH_HCALL_TOKEN(FH_DMA_DISABLE);
422 r3 = liodn; 422 r3 = liodn;
423 423
424 __asm__ __volatile__ ("sc 1" 424 asm volatile("bl epapr_hypercall_start"
425 : "+r" (r11), "+r" (r3) 425 : "+r" (r11), "+r" (r3)
426 : : EV_HCALL_CLOBBERS1 426 : : EV_HCALL_CLOBBERS1
427 ); 427 );
@@ -447,7 +447,7 @@ static inline unsigned int fh_vmpic_get_msir(unsigned int interrupt,
447 r11 = FH_HCALL_TOKEN(FH_VMPIC_GET_MSIR); 447 r11 = FH_HCALL_TOKEN(FH_VMPIC_GET_MSIR);
448 r3 = interrupt; 448 r3 = interrupt;
449 449
450 __asm__ __volatile__ ("sc 1" 450 asm volatile("bl epapr_hypercall_start"
451 : "+r" (r11), "+r" (r3), "=r" (r4) 451 : "+r" (r11), "+r" (r3), "=r" (r4)
452 : : EV_HCALL_CLOBBERS2 452 : : EV_HCALL_CLOBBERS2
453 ); 453 );
@@ -469,7 +469,7 @@ static inline unsigned int fh_system_reset(void)
469 469
470 r11 = FH_HCALL_TOKEN(FH_SYSTEM_RESET); 470 r11 = FH_HCALL_TOKEN(FH_SYSTEM_RESET);
471 471
472 __asm__ __volatile__ ("sc 1" 472 asm volatile("bl epapr_hypercall_start"
473 : "+r" (r11), "=r" (r3) 473 : "+r" (r11), "=r" (r3)
474 : : EV_HCALL_CLOBBERS1 474 : : EV_HCALL_CLOBBERS1
475 ); 475 );
@@ -506,7 +506,7 @@ static inline unsigned int fh_err_get_info(int queue, uint32_t *bufsize,
506 r6 = addr_lo; 506 r6 = addr_lo;
507 r7 = peek; 507 r7 = peek;
508 508
509 __asm__ __volatile__ ("sc 1" 509 asm volatile("bl epapr_hypercall_start"
510 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), 510 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6),
511 "+r" (r7) 511 "+r" (r7)
512 : : EV_HCALL_CLOBBERS5 512 : : EV_HCALL_CLOBBERS5
@@ -542,7 +542,7 @@ static inline unsigned int fh_get_core_state(unsigned int handle,
542 r3 = handle; 542 r3 = handle;
543 r4 = vcpu; 543 r4 = vcpu;
544 544
545 __asm__ __volatile__ ("sc 1" 545 asm volatile("bl epapr_hypercall_start"
546 : "+r" (r11), "+r" (r3), "+r" (r4) 546 : "+r" (r11), "+r" (r3), "+r" (r4)
547 : : EV_HCALL_CLOBBERS2 547 : : EV_HCALL_CLOBBERS2
548 ); 548 );
@@ -572,7 +572,7 @@ static inline unsigned int fh_enter_nap(unsigned int handle, unsigned int vcpu)
572 r3 = handle; 572 r3 = handle;
573 r4 = vcpu; 573 r4 = vcpu;
574 574
575 __asm__ __volatile__ ("sc 1" 575 asm volatile("bl epapr_hypercall_start"
576 : "+r" (r11), "+r" (r3), "+r" (r4) 576 : "+r" (r11), "+r" (r3), "+r" (r4)
577 : : EV_HCALL_CLOBBERS2 577 : : EV_HCALL_CLOBBERS2
578 ); 578 );
@@ -597,7 +597,7 @@ static inline unsigned int fh_exit_nap(unsigned int handle, unsigned int vcpu)
597 r3 = handle; 597 r3 = handle;
598 r4 = vcpu; 598 r4 = vcpu;
599 599
600 __asm__ __volatile__ ("sc 1" 600 asm volatile("bl epapr_hypercall_start"
601 : "+r" (r11), "+r" (r3), "+r" (r4) 601 : "+r" (r11), "+r" (r3), "+r" (r4)
602 : : EV_HCALL_CLOBBERS2 602 : : EV_HCALL_CLOBBERS2
603 ); 603 );
@@ -618,7 +618,7 @@ static inline unsigned int fh_claim_device(unsigned int handle)
618 r11 = FH_HCALL_TOKEN(FH_CLAIM_DEVICE); 618 r11 = FH_HCALL_TOKEN(FH_CLAIM_DEVICE);
619 r3 = handle; 619 r3 = handle;
620 620
621 __asm__ __volatile__ ("sc 1" 621 asm volatile("bl epapr_hypercall_start"
622 : "+r" (r11), "+r" (r3) 622 : "+r" (r11), "+r" (r3)
623 : : EV_HCALL_CLOBBERS1 623 : : EV_HCALL_CLOBBERS1
624 ); 624 );
@@ -645,7 +645,7 @@ static inline unsigned int fh_partition_stop_dma(unsigned int handle)
645 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP_DMA); 645 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP_DMA);
646 r3 = handle; 646 r3 = handle;
647 647
648 __asm__ __volatile__ ("sc 1" 648 asm volatile("bl epapr_hypercall_start"
649 : "+r" (r11), "+r" (r3) 649 : "+r" (r11), "+r" (r3)
650 : : EV_HCALL_CLOBBERS1 650 : : EV_HCALL_CLOBBERS1
651 ); 651 );
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 76fdcfef0889..aabcdba8f6b0 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -118,6 +118,7 @@
118 118
119#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ 119#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */
120#define RESUME_FLAG_HOST (1<<1) /* Resume host? */ 120#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
121#define RESUME_FLAG_ARCH1 (1<<2)
121 122
122#define RESUME_GUEST 0 123#define RESUME_GUEST 0
123#define RESUME_GUEST_NV RESUME_FLAG_NV 124#define RESUME_GUEST_NV RESUME_FLAG_NV
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 7aefdb3e1ce4..5a56e1c5f851 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -81,6 +81,8 @@ struct kvmppc_vcpu_book3s {
81 u64 sdr1; 81 u64 sdr1;
82 u64 hior; 82 u64 hior;
83 u64 msr_mask; 83 u64 msr_mask;
84 u64 purr_offset;
85 u64 spurr_offset;
84#ifdef CONFIG_PPC_BOOK3S_32 86#ifdef CONFIG_PPC_BOOK3S_32
85 u32 vsid_pool[VSID_POOL_SIZE]; 87 u32 vsid_pool[VSID_POOL_SIZE];
86 u32 vsid_next; 88 u32 vsid_next;
@@ -157,10 +159,14 @@ extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
157extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); 159extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
158extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 160extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
159 long pte_index, unsigned long pteh, unsigned long ptel); 161 long pte_index, unsigned long pteh, unsigned long ptel);
160extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 162extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
161 long pte_index, unsigned long pteh, unsigned long ptel); 163 long pte_index, unsigned long pteh, unsigned long ptel,
164 pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
165extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
166 unsigned long pte_index, unsigned long avpn,
167 unsigned long *hpret);
162extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, 168extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
163 struct kvm_memory_slot *memslot); 169 struct kvm_memory_slot *memslot, unsigned long *map);
164 170
165extern void kvmppc_entry_trampoline(void); 171extern void kvmppc_entry_trampoline(void);
166extern void kvmppc_hv_entry_trampoline(void); 172extern void kvmppc_hv_entry_trampoline(void);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 0dd1d86d3e31..38bec1dc9928 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -50,6 +50,15 @@ extern int kvm_hpt_order; /* order of preallocated HPTs */
50#define HPTE_V_HVLOCK 0x40UL 50#define HPTE_V_HVLOCK 0x40UL
51#define HPTE_V_ABSENT 0x20UL 51#define HPTE_V_ABSENT 0x20UL
52 52
53/*
54 * We use this bit in the guest_rpte field of the revmap entry
55 * to indicate a modified HPTE.
56 */
57#define HPTE_GR_MODIFIED (1ul << 62)
58
59/* These bits are reserved in the guest view of the HPTE */
60#define HPTE_GR_RESERVED HPTE_GR_MODIFIED
61
53static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) 62static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
54{ 63{
55 unsigned long tmp, old; 64 unsigned long tmp, old;
@@ -60,7 +69,7 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
60 " ori %0,%0,%4\n" 69 " ori %0,%0,%4\n"
61 " stdcx. %0,0,%2\n" 70 " stdcx. %0,0,%2\n"
62 " beq+ 2f\n" 71 " beq+ 2f\n"
63 " li %1,%3\n" 72 " mr %1,%3\n"
64 "2: isync" 73 "2: isync"
65 : "=&r" (tmp), "=&r" (old) 74 : "=&r" (tmp), "=&r" (old)
66 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) 75 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
@@ -237,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
237 return !(memslot->base_gfn & mask) && !(memslot->npages & mask); 246 return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
238} 247}
239 248
249/*
250 * This works for 4k, 64k and 16M pages on POWER7,
251 * and 4k and 16M pages on PPC970.
252 */
253static inline unsigned long slb_pgsize_encoding(unsigned long psize)
254{
255 unsigned long senc = 0;
256
257 if (psize > 0x1000) {
258 senc = SLB_VSID_L;
259 if (psize == 0x10000)
260 senc |= SLB_VSID_LP_01;
261 }
262 return senc;
263}
264
265static inline int is_vrma_hpte(unsigned long hpte_v)
266{
267 return (hpte_v & ~0xffffffUL) ==
268 (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
269}
270
240#endif /* __ASM_KVM_BOOK3S_64_H__ */ 271#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
index 30a600fa1b6a..3a79f5325712 100644
--- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h
+++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
@@ -17,6 +17,7 @@
17 * there are no exceptions for which we fall through directly to 17 * there are no exceptions for which we fall through directly to
18 * the normal host handler. 18 * the normal host handler.
19 * 19 *
20 * 32-bit host
20 * Expected inputs (normal exceptions): 21 * Expected inputs (normal exceptions):
21 * SCRATCH0 = saved r10 22 * SCRATCH0 = saved r10
22 * r10 = thread struct 23 * r10 = thread struct
@@ -33,14 +34,38 @@
33 * *(r8 + GPR9) = saved r9 34 * *(r8 + GPR9) = saved r9
34 * *(r8 + GPR10) = saved r10 (r10 not yet clobbered) 35 * *(r8 + GPR10) = saved r10 (r10 not yet clobbered)
35 * *(r8 + GPR11) = saved r11 36 * *(r8 + GPR11) = saved r11
37 *
38 * 64-bit host
39 * Expected inputs (GEN/GDBELL/DBG/MC exception types):
40 * r10 = saved CR
41 * r13 = PACA_POINTER
42 * *(r13 + PACA_EX##type + EX_R10) = saved r10
43 * *(r13 + PACA_EX##type + EX_R11) = saved r11
44 * SPRN_SPRG_##type##_SCRATCH = saved r13
45 *
46 * Expected inputs (CRIT exception type):
47 * r10 = saved CR
48 * r13 = PACA_POINTER
49 * *(r13 + PACA_EX##type + EX_R10) = saved r10
50 * *(r13 + PACA_EX##type + EX_R11) = saved r11
51 * *(r13 + PACA_EX##type + EX_R13) = saved r13
52 *
53 * Expected inputs (TLB exception type):
54 * r10 = saved CR
55 * r13 = PACA_POINTER
56 * *(r13 + PACA_EX##type + EX_TLB_R10) = saved r10
57 * *(r13 + PACA_EX##type + EX_TLB_R11) = saved r11
58 * SPRN_SPRG_GEN_SCRATCH = saved r13
59 *
60 * Only the bolted version of TLB miss exception handlers is supported now.
36 */ 61 */
37.macro DO_KVM intno srr1 62.macro DO_KVM intno srr1
38#ifdef CONFIG_KVM_BOOKE_HV 63#ifdef CONFIG_KVM_BOOKE_HV
39BEGIN_FTR_SECTION 64BEGIN_FTR_SECTION
40 mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ 65 mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */
41 bf 3, kvmppc_resume_\intno\()_\srr1 66 bf 3, 1975f
42 b kvmppc_handler_\intno\()_\srr1 67 b kvmppc_handler_\intno\()_\srr1
43kvmppc_resume_\intno\()_\srr1: 681975:
44END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) 69END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
45#endif 70#endif
46.endm 71.endm
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 28e8f5e5c63e..ca9bf459db6a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,7 +46,7 @@
46#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 46#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
47#endif 47#endif
48 48
49#ifdef CONFIG_KVM_BOOK3S_64_HV 49#if !defined(CONFIG_KVM_440)
50#include <linux/mmu_notifier.h> 50#include <linux/mmu_notifier.h>
51 51
52#define KVM_ARCH_WANT_MMU_NOTIFIER 52#define KVM_ARCH_WANT_MMU_NOTIFIER
@@ -204,7 +204,7 @@ struct revmap_entry {
204}; 204};
205 205
206/* 206/*
207 * We use the top bit of each memslot->rmap entry as a lock bit, 207 * We use the top bit of each memslot->arch.rmap entry as a lock bit,
208 * and bit 32 as a present flag. The bottom 32 bits are the 208 * and bit 32 as a present flag. The bottom 32 bits are the
209 * index in the guest HPT of a HPTE that points to the page. 209 * index in the guest HPT of a HPTE that points to the page.
210 */ 210 */
@@ -215,14 +215,17 @@ struct revmap_entry {
215#define KVMPPC_RMAP_PRESENT 0x100000000ul 215#define KVMPPC_RMAP_PRESENT 0x100000000ul
216#define KVMPPC_RMAP_INDEX 0xfffffffful 216#define KVMPPC_RMAP_INDEX 0xfffffffful
217 217
218/* Low-order bits in kvm->arch.slot_phys[][] */ 218/* Low-order bits in memslot->arch.slot_phys[] */
219#define KVMPPC_PAGE_ORDER_MASK 0x1f 219#define KVMPPC_PAGE_ORDER_MASK 0x1f
220#define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */ 220#define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */
221#define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ 221#define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */
222#define KVMPPC_GOT_PAGE 0x80 222#define KVMPPC_GOT_PAGE 0x80
223 223
224struct kvm_arch_memory_slot { 224struct kvm_arch_memory_slot {
225#ifdef CONFIG_KVM_BOOK3S_64_HV
225 unsigned long *rmap; 226 unsigned long *rmap;
227 unsigned long *slot_phys;
228#endif /* CONFIG_KVM_BOOK3S_64_HV */
226}; 229};
227 230
228struct kvm_arch { 231struct kvm_arch {
@@ -243,12 +246,12 @@ struct kvm_arch {
243 int using_mmu_notifiers; 246 int using_mmu_notifiers;
244 u32 hpt_order; 247 u32 hpt_order;
245 atomic_t vcpus_running; 248 atomic_t vcpus_running;
249 u32 online_vcores;
246 unsigned long hpt_npte; 250 unsigned long hpt_npte;
247 unsigned long hpt_mask; 251 unsigned long hpt_mask;
252 atomic_t hpte_mod_interest;
248 spinlock_t slot_phys_lock; 253 spinlock_t slot_phys_lock;
249 unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; 254 cpumask_t need_tlb_flush;
250 int slot_npages[KVM_MEM_SLOTS_NUM];
251 unsigned short last_vcpu[NR_CPUS];
252 struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; 255 struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
253 struct kvmppc_linear_info *hpt_li; 256 struct kvmppc_linear_info *hpt_li;
254#endif /* CONFIG_KVM_BOOK3S_64_HV */ 257#endif /* CONFIG_KVM_BOOK3S_64_HV */
@@ -273,6 +276,7 @@ struct kvmppc_vcore {
273 int nap_count; 276 int nap_count;
274 int napping_threads; 277 int napping_threads;
275 u16 pcpu; 278 u16 pcpu;
279 u16 last_cpu;
276 u8 vcore_state; 280 u8 vcore_state;
277 u8 in_guest; 281 u8 in_guest;
278 struct list_head runnable_threads; 282 struct list_head runnable_threads;
@@ -288,9 +292,10 @@ struct kvmppc_vcore {
288 292
289/* Values for vcore_state */ 293/* Values for vcore_state */
290#define VCORE_INACTIVE 0 294#define VCORE_INACTIVE 0
291#define VCORE_RUNNING 1 295#define VCORE_SLEEPING 1
292#define VCORE_EXITING 2 296#define VCORE_STARTING 2
293#define VCORE_SLEEPING 3 297#define VCORE_RUNNING 3
298#define VCORE_EXITING 4
294 299
295/* 300/*
296 * Struct used to manage memory for a virtual processor area 301 * Struct used to manage memory for a virtual processor area
@@ -346,6 +351,27 @@ struct kvmppc_slb {
346 bool class : 1; 351 bool class : 1;
347}; 352};
348 353
354# ifdef CONFIG_PPC_FSL_BOOK3E
355#define KVMPPC_BOOKE_IAC_NUM 2
356#define KVMPPC_BOOKE_DAC_NUM 2
357# else
358#define KVMPPC_BOOKE_IAC_NUM 4
359#define KVMPPC_BOOKE_DAC_NUM 2
360# endif
361#define KVMPPC_BOOKE_MAX_IAC 4
362#define KVMPPC_BOOKE_MAX_DAC 2
363
364struct kvmppc_booke_debug_reg {
365 u32 dbcr0;
366 u32 dbcr1;
367 u32 dbcr2;
368#ifdef CONFIG_KVM_E500MC
369 u32 dbcr4;
370#endif
371 u64 iac[KVMPPC_BOOKE_MAX_IAC];
372 u64 dac[KVMPPC_BOOKE_MAX_DAC];
373};
374
349struct kvm_vcpu_arch { 375struct kvm_vcpu_arch {
350 ulong host_stack; 376 ulong host_stack;
351 u32 host_pid; 377 u32 host_pid;
@@ -380,13 +406,18 @@ struct kvm_vcpu_arch {
380 u32 host_mas4; 406 u32 host_mas4;
381 u32 host_mas6; 407 u32 host_mas6;
382 u32 shadow_epcr; 408 u32 shadow_epcr;
383 u32 epcr;
384 u32 shadow_msrp; 409 u32 shadow_msrp;
385 u32 eplc; 410 u32 eplc;
386 u32 epsc; 411 u32 epsc;
387 u32 oldpir; 412 u32 oldpir;
388#endif 413#endif
389 414
415#if defined(CONFIG_BOOKE)
416#if defined(CONFIG_KVM_BOOKE_HV) || defined(CONFIG_64BIT)
417 u32 epcr;
418#endif
419#endif
420
390#ifdef CONFIG_PPC_BOOK3S 421#ifdef CONFIG_PPC_BOOK3S
391 /* For Gekko paired singles */ 422 /* For Gekko paired singles */
392 u32 qpr[32]; 423 u32 qpr[32];
@@ -440,8 +471,6 @@ struct kvm_vcpu_arch {
440 471
441 u32 ccr0; 472 u32 ccr0;
442 u32 ccr1; 473 u32 ccr1;
443 u32 dbcr0;
444 u32 dbcr1;
445 u32 dbsr; 474 u32 dbsr;
446 475
447 u64 mmcr[3]; 476 u64 mmcr[3];
@@ -471,9 +500,12 @@ struct kvm_vcpu_arch {
471 ulong fault_esr; 500 ulong fault_esr;
472 ulong queued_dear; 501 ulong queued_dear;
473 ulong queued_esr; 502 ulong queued_esr;
503 spinlock_t wdt_lock;
504 struct timer_list wdt_timer;
474 u32 tlbcfg[4]; 505 u32 tlbcfg[4];
475 u32 mmucfg; 506 u32 mmucfg;
476 u32 epr; 507 u32 epr;
508 struct kvmppc_booke_debug_reg dbg_reg;
477#endif 509#endif
478 gpa_t paddr_accessed; 510 gpa_t paddr_accessed;
479 gva_t vaddr_accessed; 511 gva_t vaddr_accessed;
@@ -486,6 +518,7 @@ struct kvm_vcpu_arch {
486 u8 osi_needed; 518 u8 osi_needed;
487 u8 osi_enabled; 519 u8 osi_enabled;
488 u8 papr_enabled; 520 u8 papr_enabled;
521 u8 watchdog_enabled;
489 u8 sane; 522 u8 sane;
490 u8 cpu_type; 523 u8 cpu_type;
491 u8 hcall_needed; 524 u8 hcall_needed;
@@ -497,7 +530,6 @@ struct kvm_vcpu_arch {
497 u64 dec_jiffies; 530 u64 dec_jiffies;
498 u64 dec_expires; 531 u64 dec_expires;
499 unsigned long pending_exceptions; 532 unsigned long pending_exceptions;
500 u16 last_cpu;
501 u8 ceded; 533 u8 ceded;
502 u8 prodded; 534 u8 prodded;
503 u32 last_inst; 535 u32 last_inst;
@@ -534,13 +566,17 @@ struct kvm_vcpu_arch {
534 unsigned long dtl_index; 566 unsigned long dtl_index;
535 u64 stolen_logged; 567 u64 stolen_logged;
536 struct kvmppc_vpa slb_shadow; 568 struct kvmppc_vpa slb_shadow;
569
570 spinlock_t tbacct_lock;
571 u64 busy_stolen;
572 u64 busy_preempt;
537#endif 573#endif
538}; 574};
539 575
540/* Values for vcpu->arch.state */ 576/* Values for vcpu->arch.state */
541#define KVMPPC_VCPU_STOPPED 0 577#define KVMPPC_VCPU_NOTREADY 0
542#define KVMPPC_VCPU_BUSY_IN_HOST 1 578#define KVMPPC_VCPU_RUNNABLE 1
543#define KVMPPC_VCPU_RUNNABLE 2 579#define KVMPPC_VCPU_BUSY_IN_HOST 2
544 580
545/* Values for vcpu->arch.io_gpr */ 581/* Values for vcpu->arch.io_gpr */
546#define KVM_MMIO_REG_MASK 0x001f 582#define KVM_MMIO_REG_MASK 0x001f
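
The kvm_host.h changes above size the new per-vcpu debug arrays to the architectural maximum (KVMPPC_BOOKE_MAX_IAC/KVMPPC_BOOKE_MAX_DAC), while KVMPPC_BOOKE_IAC_NUM reflects what the running core actually implements. A hypothetical helper, just to show how the new fields hang together (BookE configurations only; not part of the patch):

/* Hypothetical helper: reset the per-vcpu debug state introduced by
 * struct kvmppc_booke_debug_reg above. */
static void kvmppc_clear_dbg_regs(struct kvm_vcpu *vcpu)
{
	struct kvmppc_booke_debug_reg *dbg = &vcpu->arch.dbg_reg;
	int i;

	dbg->dbcr0 = 0;
	dbg->dbcr1 = 0;
	dbg->dbcr2 = 0;
	for (i = 0; i < KVMPPC_BOOKE_MAX_IAC; i++)
		dbg->iac[i] = 0;
	for (i = 0; i < KVMPPC_BOOKE_MAX_DAC; i++)
		dbg->dac[i] = 0;
}
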
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 9365860fb7f6..2b119654b4c1 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -21,7 +21,6 @@
21 21
22#include <uapi/asm/kvm_para.h> 22#include <uapi/asm/kvm_para.h>
23 23
24
25#ifdef CONFIG_KVM_GUEST 24#ifdef CONFIG_KVM_GUEST
26 25
27#include <linux/of.h> 26#include <linux/of.h>
@@ -55,7 +54,7 @@ static unsigned long kvm_hypercall(unsigned long *in,
55 unsigned long *out, 54 unsigned long *out,
56 unsigned long nr) 55 unsigned long nr)
57{ 56{
58 return HC_EV_UNIMPLEMENTED; 57 return EV_UNIMPLEMENTED;
59} 58}
60 59
61#endif 60#endif
@@ -66,7 +65,7 @@ static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
66 unsigned long out[8]; 65 unsigned long out[8];
67 unsigned long r; 66 unsigned long r;
68 67
69 r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 68 r = kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
70 *r2 = out[0]; 69 *r2 = out[0];
71 70
72 return r; 71 return r;
@@ -77,7 +76,7 @@ static inline long kvm_hypercall0(unsigned int nr)
77 unsigned long in[8]; 76 unsigned long in[8];
78 unsigned long out[8]; 77 unsigned long out[8];
79 78
80 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 79 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
81} 80}
82 81
83static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) 82static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
@@ -86,7 +85,7 @@ static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
86 unsigned long out[8]; 85 unsigned long out[8];
87 86
88 in[0] = p1; 87 in[0] = p1;
89 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 88 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
90} 89}
91 90
92static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, 91static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
@@ -97,7 +96,7 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
97 96
98 in[0] = p1; 97 in[0] = p1;
99 in[1] = p2; 98 in[1] = p2;
100 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 99 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
101} 100}
102 101
103static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, 102static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
@@ -109,7 +108,7 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
109 in[0] = p1; 108 in[0] = p1;
110 in[1] = p2; 109 in[1] = p2;
111 in[2] = p3; 110 in[2] = p3;
112 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 111 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
113} 112}
114 113
115static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, 114static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
@@ -123,7 +122,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
123 in[1] = p2; 122 in[1] = p2;
124 in[2] = p3; 123 in[2] = p3;
125 in[3] = p4; 124 in[3] = p4;
126 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 125 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
127} 126}
128 127
129 128
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e006f0bdea95..572aa7530619 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -28,6 +28,7 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/kvm_types.h> 29#include <linux/kvm_types.h>
30#include <linux/kvm_host.h> 30#include <linux/kvm_host.h>
31#include <linux/bug.h>
31#ifdef CONFIG_PPC_BOOK3S 32#ifdef CONFIG_PPC_BOOK3S
32#include <asm/kvm_book3s.h> 33#include <asm/kvm_book3s.h>
33#else 34#else
@@ -68,6 +69,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
68extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); 69extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
69extern void kvmppc_decrementer_func(unsigned long data); 70extern void kvmppc_decrementer_func(unsigned long data);
70extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); 71extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
72extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
73extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu);
71 74
72/* Core-specific hooks */ 75/* Core-specific hooks */
73 76
@@ -104,6 +107,7 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
104 struct kvm_interrupt *irq); 107 struct kvm_interrupt *irq);
105extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 108extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
106 struct kvm_interrupt *irq); 109 struct kvm_interrupt *irq);
110extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
107 111
108extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 112extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
109 unsigned int op, int *advance); 113 unsigned int op, int *advance);
@@ -111,6 +115,7 @@ extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn,
111 ulong val); 115 ulong val);
112extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, 116extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn,
113 ulong *val); 117 ulong *val);
118extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu);
114 119
115extern int kvmppc_booke_init(void); 120extern int kvmppc_booke_init(void);
116extern void kvmppc_booke_exit(void); 121extern void kvmppc_booke_exit(void);
@@ -139,16 +144,28 @@ extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
139extern void kvm_release_hpt(struct kvmppc_linear_info *li); 144extern void kvm_release_hpt(struct kvmppc_linear_info *li);
140extern int kvmppc_core_init_vm(struct kvm *kvm); 145extern int kvmppc_core_init_vm(struct kvm *kvm);
141extern void kvmppc_core_destroy_vm(struct kvm *kvm); 146extern void kvmppc_core_destroy_vm(struct kvm *kvm);
147extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
148 struct kvm_memory_slot *dont);
149extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
150 unsigned long npages);
142extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, 151extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
152 struct kvm_memory_slot *memslot,
143 struct kvm_userspace_memory_region *mem); 153 struct kvm_userspace_memory_region *mem);
144extern void kvmppc_core_commit_memory_region(struct kvm *kvm, 154extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
145 struct kvm_userspace_memory_region *mem); 155 struct kvm_userspace_memory_region *mem,
156 struct kvm_memory_slot old);
146extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, 157extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
147 struct kvm_ppc_smmu_info *info); 158 struct kvm_ppc_smmu_info *info);
159extern void kvmppc_core_flush_memslot(struct kvm *kvm,
160 struct kvm_memory_slot *memslot);
148 161
149extern int kvmppc_bookehv_init(void); 162extern int kvmppc_bookehv_init(void);
150extern void kvmppc_bookehv_exit(void); 163extern void kvmppc_bookehv_exit(void);
151 164
165extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
166
167extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
168
152/* 169/*
153 * Cuts out inst bits with ordering according to spec. 170 * Cuts out inst bits with ordering according to spec.
154 * That means the leftmost bit is zero. All given bits are included. 171 * That means the leftmost bit is zero. All given bits are included.
@@ -182,6 +199,41 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
182 return r; 199 return r;
183} 200}
184 201
202union kvmppc_one_reg {
203 u32 wval;
204 u64 dval;
205 vector128 vval;
206 u64 vsxval[2];
207 struct {
208 u64 addr;
209 u64 length;
210 } vpaval;
211};
212
213#define one_reg_size(id) \
214 (1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
215
216#define get_reg_val(id, reg) ({ \
217 union kvmppc_one_reg __u; \
218 switch (one_reg_size(id)) { \
219 case 4: __u.wval = (reg); break; \
220 case 8: __u.dval = (reg); break; \
221 default: BUG(); \
222 } \
223 __u; \
224})
225
226
227#define set_reg_val(id, val) ({ \
228 u64 __v; \
229 switch (one_reg_size(id)) { \
230 case 4: __v = (val).wval; break; \
231 case 8: __v = (val).dval; break; \
232 default: BUG(); \
233 } \
234 __v; \
235})
236
185void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 237void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
186int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 238int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
187 239
@@ -190,6 +242,8 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
190 242
191int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); 243int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
192int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); 244int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
245int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
246int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
193 247
194void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); 248void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
195 249
@@ -230,5 +284,36 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
230 } 284 }
231} 285}
232 286
287/* Please call after prepare_to_enter. This function puts the lazy ee state
288 back to normal mode, without actually enabling interrupts. */
289static inline void kvmppc_lazy_ee_enable(void)
290{
291#ifdef CONFIG_PPC64
292 /* Only need to enable IRQs by hard enabling them after this */
293 local_paca->irq_happened = 0;
294 local_paca->soft_enabled = 1;
295#endif
296}
297
298static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
299{
300 ulong ea;
301 ulong msr_64bit = 0;
302
303 ea = kvmppc_get_gpr(vcpu, rb);
304 if (ra)
305 ea += kvmppc_get_gpr(vcpu, ra);
306
307#if defined(CONFIG_PPC_BOOK3E_64)
308 msr_64bit = MSR_CM;
309#elif defined(CONFIG_PPC_BOOK3S_64)
310 msr_64bit = MSR_SF;
311#endif
312
313 if (!(vcpu->arch.shared->msr & msr_64bit))
314 ea = (uint32_t)ea;
315
316 return ea;
317}
233 318
234#endif /* __POWERPC_KVM_PPC_H__ */ 319#endif /* __POWERPC_KVM_PPC_H__ */
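
The union kvmppc_one_reg and the get_reg_val()/set_reg_val() helpers added above let a ONE_REG accessor hand back 32-bit and 64-bit registers through the same union, keyed off the size encoded in the register ID. A hypothetical fragment follows; the function name is invented, and it assumes a configuration where vcpu->arch.epcr exists (see the kvm_host.h hunk above).

#include <linux/errno.h>
#include <linux/kvm_host.h>
#include <asm/kvm_ppc.h>

/* Hypothetical ONE_REG accessor fragment using the new helpers. */
static int example_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
			       union kvmppc_one_reg *val)
{
	int r = 0;

	switch (id) {
	case KVM_REG_PPC_EPCR:
		/* KVM_REG_PPC_EPCR is a 32-bit ID, so get_reg_val() fills
		 * the wval member of the union. */
		*val = get_reg_val(id, vcpu->arch.epcr);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}
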
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index eeabcdbc30f7..99d43e0c1e4a 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -59,7 +59,7 @@
59#define MAS1_TSIZE_SHIFT 7 59#define MAS1_TSIZE_SHIFT 7
60#define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK) 60#define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK)
61 61
62#define MAS2_EPN 0xFFFFF000 62#define MAS2_EPN (~0xFFFUL)
63#define MAS2_X0 0x00000040 63#define MAS2_X0 0x00000040
64#define MAS2_X1 0x00000020 64#define MAS2_X1 0x00000020
65#define MAS2_W 0x00000010 65#define MAS2_W 0x00000010
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 9673f73eb8db..2fdb47a19efd 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -121,6 +121,16 @@ extern char initial_stab[];
121#define PP_RXRX 3 /* Supervisor read, User read */ 121#define PP_RXRX 3 /* Supervisor read, User read */
122#define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */ 122#define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */
123 123
124/* Fields for tlbiel instruction in architecture 2.06 */
125#define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */
126#define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */
127#define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */
128#define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */
129#define TLBIEL_INVAL_SET_MASK 0xfff000 /* set number to inval. */
130#define TLBIEL_INVAL_SET_SHIFT 12
131
132#define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */
133
124#ifndef __ASSEMBLY__ 134#ifndef __ASSEMBLY__
125 135
126struct hash_pte { 136struct hash_pte {
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d24c14163966..97d37278ea2d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -518,6 +518,7 @@
518#define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */ 518#define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */
519#define SRR1_WS_DEEP 0x00010000 /* All resources maintained */ 519#define SRR1_WS_DEEP 0x00010000 /* All resources maintained */
520#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ 520#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */
521#define SRR1_PROGILL 0x00080000 /* Illegal instruction */
521#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */ 522#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */
522#define SRR1_PROGTRAP 0x00020000 /* Trap */ 523#define SRR1_PROGTRAP 0x00020000 /* Trap */
523#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */ 524#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4982c5..e07e6af5e1ff 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@
539#define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ 539#define TCR_FIE 0x00800000 /* FIT Interrupt Enable */
540#define TCR_ARE 0x00400000 /* Auto Reload Enable */ 540#define TCR_ARE 0x00400000 /* Auto Reload Enable */
541 541
542#ifdef CONFIG_E500
543#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \
544 (((tcr) & 0x1E0000) >> 15))
545#else
546#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30)
547#endif
548
542/* Bit definitions for the TSR. */ 549/* Bit definitions for the TSR. */
543#define TSR_ENW 0x80000000 /* Enable Next Watchdog */ 550#define TSR_ENW 0x80000000 /* Enable Next Watchdog */
544#define TSR_WIS 0x40000000 /* WDT Interrupt Status */ 551#define TSR_WIS 0x40000000 /* WDT Interrupt Status */
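The E500 variant of the new TCR_GET_WP() macro above merges the 2-bit WP field (mask 0xC0000000) with the 4-bit WPEXT extension field (mask 0x001E0000) into a single 6-bit watchdog period selector. A small standalone check of the bit arithmetic, using an arbitrary TCR value that is not taken from the patch:

#include <stdio.h>
#include <stdint.h>

#define TCR_GET_WP(tcr)	((((tcr) & 0xC0000000) >> 30) | \
			 (((tcr) & 0x1E0000) >> 15))	/* E500 form, as added above */

int main(void)
{
	uint32_t tcr = 0x40000000 | 0x00140000;	/* hypothetical: WP = 0b01, WPEXT = 0b1010 */

	/* 0x40000000 >> 30 = 0b01; 0x00140000 >> 15 = 0b101000; OR gives 0b101001 = 41 */
	printf("watchdog period selector = %u\n", TCR_GET_WP(tcr));
	return 0;
}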
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index e807e9d8e3f7..5a4e437c238d 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -67,6 +67,14 @@ void generic_mach_cpu_die(void);
67void generic_set_cpu_dead(unsigned int cpu); 67void generic_set_cpu_dead(unsigned int cpu);
68void generic_set_cpu_up(unsigned int cpu); 68void generic_set_cpu_up(unsigned int cpu);
69int generic_check_cpu_restart(unsigned int cpu); 69int generic_check_cpu_restart(unsigned int cpu);
70
71extern void inhibit_secondary_onlining(void);
72extern void uninhibit_secondary_onlining(void);
73
74#else /* HOTPLUG_CPU */
75static inline void inhibit_secondary_onlining(void) {}
76static inline void uninhibit_secondary_onlining(void) {}
77
70#endif 78#endif
71 79
72#ifdef CONFIG_PPC64 80#ifdef CONFIG_PPC64
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index a33c3c03bb2e..f7bca6370745 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -7,6 +7,7 @@ header-y += bootx.h
7header-y += byteorder.h 7header-y += byteorder.h
8header-y += cputable.h 8header-y += cputable.h
9header-y += elf.h 9header-y += elf.h
10header-y += epapr_hcalls.h
10header-y += errno.h 11header-y += errno.h
11header-y += fcntl.h 12header-y += fcntl.h
12header-y += ioctl.h 13header-y += ioctl.h
diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
new file mode 100644
index 000000000000..7f9c74b46704
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
@@ -0,0 +1,98 @@
1/*
2 * ePAPR hcall interface
3 *
4 * Copyright 2008-2011 Freescale Semiconductor, Inc.
5 *
6 * Author: Timur Tabi <timur@freescale.com>
7 *
8 * This file is provided under a dual BSD/GPL license. When using or
9 * redistributing this file, you may do so under either license.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions are met:
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * * Neither the name of Freescale Semiconductor nor the
19 * names of its contributors may be used to endorse or promote products
20 * derived from this software without specific prior written permission.
21 *
22 *
23 * ALTERNATIVELY, this software may be distributed under the terms of the
24 * GNU General Public License ("GPL") as published by the Free Software
25 * Foundation, either version 2 of that License or (at your option) any
26 * later version.
27 *
28 * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
29 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
30 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
32 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
33 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#ifndef _UAPI_ASM_POWERPC_EPAPR_HCALLS_H
41#define _UAPI_ASM_POWERPC_EPAPR_HCALLS_H
42
43#define EV_BYTE_CHANNEL_SEND 1
44#define EV_BYTE_CHANNEL_RECEIVE 2
45#define EV_BYTE_CHANNEL_POLL 3
46#define EV_INT_SET_CONFIG 4
47#define EV_INT_GET_CONFIG 5
48#define EV_INT_SET_MASK 6
49#define EV_INT_GET_MASK 7
50#define EV_INT_IACK 9
51#define EV_INT_EOI 10
52#define EV_INT_SEND_IPI 11
53#define EV_INT_SET_TASK_PRIORITY 12
54#define EV_INT_GET_TASK_PRIORITY 13
55#define EV_DOORBELL_SEND 14
56#define EV_MSGSND 15
57#define EV_IDLE 16
58
59/* vendor ID: epapr */
60#define EV_LOCAL_VENDOR_ID 0 /* for private use */
61#define EV_EPAPR_VENDOR_ID 1
62#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */
63#define EV_IBM_VENDOR_ID 3 /* IBM */
64#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */
65#define EV_ENEA_VENDOR_ID 5 /* Enea */
66#define EV_WR_VENDOR_ID 6 /* Wind River Systems */
67#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */
68#define EV_KVM_VENDOR_ID 42 /* KVM */
69
70/* The max number of bytes that a byte channel can send or receive per call */
71#define EV_BYTE_CHANNEL_MAX_BYTES 16
72
73
74#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
75#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
76
77/* epapr return codes */
78#define EV_SUCCESS 0
79#define EV_EPERM 1 /* Operation not permitted */
80#define EV_ENOENT 2 /* Entry Not Found */
81#define EV_EIO 3 /* I/O error occurred */
82#define EV_EAGAIN 4 /* The operation had insufficient
83 * resources to complete and should be
84 * retried
85 */
86#define EV_ENOMEM 5 /* There was insufficient memory to
87 * complete the operation */
88#define EV_EFAULT 6 /* Bad guest address */
89#define EV_ENODEV 7 /* No such device */
90#define EV_EINVAL 8 /* An argument supplied to the hcall
91 was out of range or invalid */
92#define EV_INTERNAL 9 /* An internal error occurred */
93#define EV_CONFIG 10 /* A configuration error was detected */
94#define EV_INVALID_STATE 11 /* The object is in an invalid state */
95#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
96#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */
97
98#endif /* _UAPI_ASM_POWERPC_EPAPR_HCALLS_H */
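As a quick illustration of the encoding defined above, _EV_HCALL_TOKEN() packs the vendor ID into the upper 16 bits of the token and the hcall number into the lower 16 bits, so EV_HCALL_TOKEN(EV_IDLE) evaluates to (1 << 16) | 16 = 0x10010, and the KVM_HCALL_TOKEN() macro introduced in the kvm_para.h hunk below (vendor 42) reproduces the old HC_VENDOR_KVM prefix of 42 << 16. A standalone sanity check, not part of the patch:

#include <stdio.h>

#define EV_EPAPR_VENDOR_ID	1
#define EV_KVM_VENDOR_ID	42
#define EV_IDLE			16

#define _EV_HCALL_TOKEN(id, num)	(((id) << 16) | (num))
#define EV_HCALL_TOKEN(num)		_EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, num)
#define KVM_HCALL_TOKEN(num)		_EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num)

int main(void)
{
	printf("EV_HCALL_TOKEN(EV_IDLE) = 0x%x\n", EV_HCALL_TOKEN(EV_IDLE));	/* 0x10010 */
	printf("KVM vendor prefix       = 0x%x\n", KVM_HCALL_TOKEN(0));		/* 0x2a0000 */
	return 0;
}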
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 1bea4d8ea6f4..2fba8a66fb10 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -221,6 +221,12 @@ struct kvm_sregs {
221 221
222 __u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */ 222 __u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */
223 __u32 dbcr[3]; 223 __u32 dbcr[3];
224 /*
225 * The iac/dac registers are 64 bits wide, but this API
226 * exposes only the lower 32 bits on 64-bit processors.
227 * The ONE_REG interface is added for the full 64-bit
228 * iac/dac registers.
229 */
224 __u32 iac[4]; 230 __u32 iac[4];
225 __u32 dac[2]; 231 __u32 dac[2];
226 __u32 dvc[2]; 232 __u32 dvc[2];
@@ -325,6 +331,86 @@ struct kvm_book3e_206_tlb_params {
325 __u32 reserved[8]; 331 __u32 reserved[8];
326}; 332};
327 333
334/* For KVM_PPC_GET_HTAB_FD */
335struct kvm_get_htab_fd {
336 __u64 flags;
337 __u64 start_index;
338 __u64 reserved[2];
339};
340
341/* Values for kvm_get_htab_fd.flags */
342#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1)
343#define KVM_GET_HTAB_WRITE ((__u64)0x2)
344
345/*
346 * Data read on the file descriptor is formatted as a series of
347 * records, each consisting of a header followed by a series of
348 * `n_valid' HPTEs (16 bytes each), which are all valid. Following
349 * those valid HPTEs there are `n_invalid' invalid HPTEs, which
350 * are not represented explicitly in the stream. The same format
351 * is used for writing.
352 */
353struct kvm_get_htab_header {
354 __u32 index;
355 __u16 n_valid;
356 __u16 n_invalid;
357};
358
328#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) 359#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
360#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
361#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
362#define KVM_REG_PPC_IAC3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x4)
363#define KVM_REG_PPC_IAC4 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5)
364#define KVM_REG_PPC_DAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6)
365#define KVM_REG_PPC_DAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7)
366#define KVM_REG_PPC_DABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8)
367#define KVM_REG_PPC_DSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9)
368#define KVM_REG_PPC_PURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa)
369#define KVM_REG_PPC_SPURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb)
370#define KVM_REG_PPC_DAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc)
371#define KVM_REG_PPC_DSISR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd)
372#define KVM_REG_PPC_AMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe)
373#define KVM_REG_PPC_UAMOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf)
374
375#define KVM_REG_PPC_MMCR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
376#define KVM_REG_PPC_MMCR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
377#define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
378
379#define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18)
380#define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19)
381#define KVM_REG_PPC_PMC3 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a)
382#define KVM_REG_PPC_PMC4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b)
383#define KVM_REG_PPC_PMC5 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c)
384#define KVM_REG_PPC_PMC6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d)
385#define KVM_REG_PPC_PMC7 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e)
386#define KVM_REG_PPC_PMC8 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f)
387
388/* 32 floating-point registers */
389#define KVM_REG_PPC_FPR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x20)
390#define KVM_REG_PPC_FPR(n) (KVM_REG_PPC_FPR0 + (n))
391#define KVM_REG_PPC_FPR31 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3f)
392
393/* 32 VMX/Altivec vector registers */
394#define KVM_REG_PPC_VR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x40)
395#define KVM_REG_PPC_VR(n) (KVM_REG_PPC_VR0 + (n))
396#define KVM_REG_PPC_VR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x5f)
397
398/* 32 double-width FP registers for VSX */
399/* High-order halves overlap with FP regs */
400#define KVM_REG_PPC_VSR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x60)
401#define KVM_REG_PPC_VSR(n) (KVM_REG_PPC_VSR0 + (n))
402#define KVM_REG_PPC_VSR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x7f)
403
404/* FP and vector status/control registers */
405#define KVM_REG_PPC_FPSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80)
406#define KVM_REG_PPC_VSCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81)
407
408/* Virtual processor areas */
409/* For SLB & DTL, address in high (first) half, length in low half */
410#define KVM_REG_PPC_VPA_ADDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x82)
411#define KVM_REG_PPC_VPA_SLB (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x83)
412#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
413
414#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
329 415
330#endif /* __LINUX_KVM_POWERPC_H */ 416#endif /* __LINUX_KVM_POWERPC_H */
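To make the KVM_PPC_GET_HTAB_FD record format above concrete, here is a hedged userspace sketch, not part of the patch, that reads the hash table through the returned file descriptor. It assumes the ioctl is issued on the VM file descriptor and that <linux/kvm.h> exports the structures and flags shown above; each record is a kvm_get_htab_header followed by n_valid 16-byte HPTEs, with the n_invalid entries implied rather than stored:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>		/* assumed to provide the structs, flags and ioctl */

/* Walk one buffer of records read from the htab fd. */
static void dump_htab_chunk(const char *buf, ssize_t len)
{
	const char *p = buf;

	while (p + sizeof(struct kvm_get_htab_header) <= buf + len) {
		struct kvm_get_htab_header hdr;

		memcpy(&hdr, p, sizeof(hdr));
		p += sizeof(hdr);
		printf("index %u: %u valid, %u invalid HPTEs\n",
		       hdr.index, hdr.n_valid, hdr.n_invalid);
		p += (size_t)hdr.n_valid * 16;	/* 16 bytes per valid HPTE */
	}
}

int main(void)
{
	int vm_fd = -1;					/* hypothetical: VM fd setup omitted */
	struct kvm_get_htab_fd ghf = {
		.flags = KVM_GET_HTAB_BOLTED_ONLY,	/* snapshot bolted entries only */
		.start_index = 0,
	};
	char buf[65536];
	ssize_t n;
	int htab_fd;

	htab_fd = ioctl(vm_fd, KVM_PPC_GET_HTAB_FD, &ghf);
	if (htab_fd < 0)
		return 1;
	while ((n = read(htab_fd, buf, sizeof(buf))) > 0)
		dump_htab_chunk(buf, n);
	close(htab_fd);
	return 0;
}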
diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h
index 5e04383a1db5..ed0e0254b47f 100644
--- a/arch/powerpc/include/uapi/asm/kvm_para.h
+++ b/arch/powerpc/include/uapi/asm/kvm_para.h
@@ -75,9 +75,10 @@ struct kvm_vcpu_arch_shared {
75}; 75};
76 76
77#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ 77#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */
78#define HC_VENDOR_KVM (42 << 16) 78
79#define HC_EV_SUCCESS 0 79#define KVM_HCALL_TOKEN(num) _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num)
80#define HC_EV_UNIMPLEMENTED 12 80
81#include <uapi/asm/epapr_hcalls.h>
81 82
82#define KVM_FEATURE_MAGIC_PAGE 1 83#define KVM_FEATURE_MAGIC_PAGE 1
83 84
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7523539cfe9f..4e23ba2f3ca7 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -441,8 +441,7 @@ int main(void)
441 DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); 441 DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
442 DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); 442 DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
443 DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); 443 DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
444 DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter)); 444 DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
445 DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
446 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); 445 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
447 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); 446 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
448 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); 447 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
@@ -470,7 +469,6 @@ int main(void)
470 DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); 469 DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
471 DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); 470 DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
472 DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); 471 DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
473 DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
474 DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); 472 DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
475 DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); 473 DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
476 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); 474 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
index 697b390ebfd8..62c0dc237826 100644
--- a/arch/powerpc/kernel/epapr_hcalls.S
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -8,13 +8,41 @@
8 */ 8 */
9 9
10#include <linux/threads.h> 10#include <linux/threads.h>
11#include <asm/epapr_hcalls.h>
11#include <asm/reg.h> 12#include <asm/reg.h>
12#include <asm/page.h> 13#include <asm/page.h>
13#include <asm/cputable.h> 14#include <asm/cputable.h>
14#include <asm/thread_info.h> 15#include <asm/thread_info.h>
15#include <asm/ppc_asm.h> 16#include <asm/ppc_asm.h>
17#include <asm/asm-compat.h>
16#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
17 19
20/* epapr_ev_idle() was derived from e500_idle() */
21_GLOBAL(epapr_ev_idle)
22 CURRENT_THREAD_INFO(r3, r1)
23 PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */
24 ori r4, r4,_TLF_NAPPING /* so when we take an exception */
25 PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */
26
27 wrteei 1
28
29idle_loop:
30 LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
31
32.global epapr_ev_idle_start
33epapr_ev_idle_start:
34 li r3, -1
35 nop
36 nop
37 nop
38
39 /*
40 * Guard against spurious wakeups from a hypervisor --
41 * only an interrupt will cause us to return to LR due to
42 * _TLF_NAPPING.
43 */
44 b idle_loop
45
18/* Hypercall entry point. Will be patched with device tree instructions. */ 46/* Hypercall entry point. Will be patched with device tree instructions. */
19.global epapr_hypercall_start 47.global epapr_hypercall_start
20epapr_hypercall_start: 48epapr_hypercall_start:
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
index 028aeae370b6..f3eab8594d9f 100644
--- a/arch/powerpc/kernel/epapr_paravirt.c
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -21,6 +21,10 @@
21#include <asm/epapr_hcalls.h> 21#include <asm/epapr_hcalls.h>
22#include <asm/cacheflush.h> 22#include <asm/cacheflush.h>
23#include <asm/code-patching.h> 23#include <asm/code-patching.h>
24#include <asm/machdep.h>
25
26extern void epapr_ev_idle(void);
27extern u32 epapr_ev_idle_start[];
24 28
25bool epapr_paravirt_enabled; 29bool epapr_paravirt_enabled;
26 30
@@ -41,8 +45,13 @@ static int __init epapr_paravirt_init(void)
41 if (len % 4 || len > (4 * 4)) 45 if (len % 4 || len > (4 * 4))
42 return -ENODEV; 46 return -ENODEV;
43 47
44 for (i = 0; i < (len / 4); i++) 48 for (i = 0; i < (len / 4); i++) {
45 patch_instruction(epapr_hypercall_start + i, insts[i]); 49 patch_instruction(epapr_hypercall_start + i, insts[i]);
50 patch_instruction(epapr_ev_idle_start + i, insts[i]);
51 }
52
53 if (of_get_property(hyper_node, "has-idle", NULL))
54 ppc_md.power_save = epapr_ev_idle;
46 55
47 epapr_paravirt_enabled = true; 56 epapr_paravirt_enabled = true;
48 57
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 867db1de8949..a61b133c4f99 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -419,7 +419,7 @@ static void kvm_map_magic_page(void *data)
419 in[0] = KVM_MAGIC_PAGE; 419 in[0] = KVM_MAGIC_PAGE;
420 in[1] = KVM_MAGIC_PAGE; 420 in[1] = KVM_MAGIC_PAGE;
421 421
422 kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE); 422 kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
423 423
424 *features = out[0]; 424 *features = out[0];
425} 425}
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 19e4288d8486..78b8766fd79e 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -43,6 +43,7 @@
43#include <asm/dcr.h> 43#include <asm/dcr.h>
44#include <asm/ftrace.h> 44#include <asm/ftrace.h>
45#include <asm/switch_to.h> 45#include <asm/switch_to.h>
46#include <asm/epapr_hcalls.h>
46 47
47#ifdef CONFIG_PPC32 48#ifdef CONFIG_PPC32
48extern void transfer_to_handler(void); 49extern void transfer_to_handler(void);
@@ -191,3 +192,7 @@ EXPORT_SYMBOL(__arch_hweight64);
191#ifdef CONFIG_PPC_BOOK3S_64 192#ifdef CONFIG_PPC_BOOK3S_64
192EXPORT_SYMBOL_GPL(mmu_psize_defs); 193EXPORT_SYMBOL_GPL(mmu_psize_defs);
193#endif 194#endif
195
196#ifdef CONFIG_EPAPR_PARAVIRT
197EXPORT_SYMBOL(epapr_hypercall_start);
198#endif
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2b952b5386fd..e5b133ebd8a5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -427,6 +427,45 @@ int generic_check_cpu_restart(unsigned int cpu)
427{ 427{
428 return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; 428 return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
429} 429}
430
431static atomic_t secondary_inhibit_count;
432
433/*
434 * Don't allow secondary CPU threads to come online
435 */
436void inhibit_secondary_onlining(void)
437{
438 /*
439 * This makes secondary_inhibit_count stable during cpu
440 * online/offline operations.
441 */
442 get_online_cpus();
443
444 atomic_inc(&secondary_inhibit_count);
445 put_online_cpus();
446}
447EXPORT_SYMBOL_GPL(inhibit_secondary_onlining);
448
449/*
450 * Allow secondary CPU threads to come online again
451 */
452void uninhibit_secondary_onlining(void)
453{
454 get_online_cpus();
455 atomic_dec(&secondary_inhibit_count);
456 put_online_cpus();
457}
458EXPORT_SYMBOL_GPL(uninhibit_secondary_onlining);
459
460static int secondaries_inhibited(void)
461{
462 return atomic_read(&secondary_inhibit_count);
463}
464
465#else /* HOTPLUG_CPU */
466
467#define secondaries_inhibited() 0
468
430#endif 469#endif
431 470
432static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) 471static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
@@ -445,6 +484,13 @@ int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
445{ 484{
446 int rc, c; 485 int rc, c;
447 486
487 /*
488 * Don't allow secondary threads to come online if inhibited
489 */
490 if (threads_per_core > 1 && secondaries_inhibited() &&
491 cpu % threads_per_core != 0)
492 return -EBUSY;
493
448 if (smp_ops == NULL || 494 if (smp_ops == NULL ||
449 (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu))) 495 (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu)))
450 return -EINVAL; 496 return -EINVAL;
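The check added to __cpu_up() above only refuses secondary threads (cpu % threads_per_core != 0) while the count raised by inhibit_secondary_onlining() is non-zero; thread 0 of each core can still be onlined. A tiny standalone model of that predicate, with made-up values that are not from the patch:

#include <stdio.h>

static int secondary_inhibit_count = 1;	/* as if inhibit_secondary_onlining() had run */
static int threads_per_core = 4;	/* hypothetical SMT4 core */

static int cpu_up_allowed(int cpu)
{
	/* mirrors the gate added to __cpu_up(); the kernel returns -EBUSY */
	if (threads_per_core > 1 && secondary_inhibit_count &&
	    cpu % threads_per_core != 0)
		return 0;
	return 1;
}

int main(void)
{
	printf("cpu 4 (primary thread of core 1): %d\n", cpu_up_allowed(4));	/* 1, allowed */
	printf("cpu 5 (secondary thread):         %d\n", cpu_up_allowed(5));	/* 0, refused */
	return 0;
}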
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 50e7dbc7356c..3d7fd21c65f9 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -83,6 +83,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
83 vcpu_44x->shadow_refs[i].gtlb_index = -1; 83 vcpu_44x->shadow_refs[i].gtlb_index = -1;
84 84
85 vcpu->arch.cpu_type = KVM_CPU_440; 85 vcpu->arch.cpu_type = KVM_CPU_440;
86 vcpu->arch.pvr = mfspr(SPRN_PVR);
86 87
87 return 0; 88 return 0;
88} 89}
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index c8c61578fdfc..35ec0a8547da 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -27,12 +27,70 @@
27#include "booke.h" 27#include "booke.h"
28#include "44x_tlb.h" 28#include "44x_tlb.h"
29 29
30#define XOP_MFDCRX 259
30#define XOP_MFDCR 323 31#define XOP_MFDCR 323
32#define XOP_MTDCRX 387
31#define XOP_MTDCR 451 33#define XOP_MTDCR 451
32#define XOP_TLBSX 914 34#define XOP_TLBSX 914
33#define XOP_ICCCI 966 35#define XOP_ICCCI 966
34#define XOP_TLBWE 978 36#define XOP_TLBWE 978
35 37
38static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn)
39{
40 /* emulate some accesses in the kernel */
41 switch (dcrn) {
42 case DCRN_CPR0_CONFIG_ADDR:
43 vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
44 return EMULATE_DONE;
45 default:
46 vcpu->run->dcr.dcrn = dcrn;
47 vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs);
48 vcpu->run->dcr.is_write = 1;
49 vcpu->arch.dcr_is_write = 1;
50 vcpu->arch.dcr_needed = 1;
51 kvmppc_account_exit(vcpu, DCR_EXITS);
52 return EMULATE_DO_DCR;
53 }
54}
55
56static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn)
57{
58 /* The guest may access CPR0 registers to determine the timebase
59 * frequency, and it must know the real host frequency because it
60 * can directly access the timebase registers.
61 *
62 * It would be possible to emulate those accesses in userspace,
63 * but userspace can really only figure out the end frequency.
64 * We could decompose that into the factors that compute it, but
65 * that's tricky math, and it's easier to just report the real
66 * CPR0 values.
67 */
68 switch (dcrn) {
69 case DCRN_CPR0_CONFIG_ADDR:
70 kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
71 break;
72 case DCRN_CPR0_CONFIG_DATA:
73 local_irq_disable();
74 mtdcr(DCRN_CPR0_CONFIG_ADDR,
75 vcpu->arch.cpr0_cfgaddr);
76 kvmppc_set_gpr(vcpu, rt,
77 mfdcr(DCRN_CPR0_CONFIG_DATA));
78 local_irq_enable();
79 break;
80 default:
81 vcpu->run->dcr.dcrn = dcrn;
82 vcpu->run->dcr.data = 0;
83 vcpu->run->dcr.is_write = 0;
84 vcpu->arch.dcr_is_write = 0;
85 vcpu->arch.io_gpr = rt;
86 vcpu->arch.dcr_needed = 1;
87 kvmppc_account_exit(vcpu, DCR_EXITS);
88 return EMULATE_DO_DCR;
89 }
90
91 return EMULATE_DONE;
92}
93
36int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 94int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
37 unsigned int inst, int *advance) 95 unsigned int inst, int *advance)
38{ 96{
@@ -50,55 +108,21 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
50 switch (get_xop(inst)) { 108 switch (get_xop(inst)) {
51 109
52 case XOP_MFDCR: 110 case XOP_MFDCR:
53 /* The guest may access CPR0 registers to determine the timebase 111 emulated = emulate_mfdcr(vcpu, rt, dcrn);
54 * frequency, and it must know the real host frequency because it 112 break;
55 * can directly access the timebase registers.
56 *
57 * It would be possible to emulate those accesses in userspace,
58 * but userspace can really only figure out the end frequency.
59 * We could decompose that into the factors that compute it, but
60 * that's tricky math, and it's easier to just report the real
61 * CPR0 values.
62 */
63 switch (dcrn) {
64 case DCRN_CPR0_CONFIG_ADDR:
65 kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
66 break;
67 case DCRN_CPR0_CONFIG_DATA:
68 local_irq_disable();
69 mtdcr(DCRN_CPR0_CONFIG_ADDR,
70 vcpu->arch.cpr0_cfgaddr);
71 kvmppc_set_gpr(vcpu, rt,
72 mfdcr(DCRN_CPR0_CONFIG_DATA));
73 local_irq_enable();
74 break;
75 default:
76 run->dcr.dcrn = dcrn;
77 run->dcr.data = 0;
78 run->dcr.is_write = 0;
79 vcpu->arch.io_gpr = rt;
80 vcpu->arch.dcr_needed = 1;
81 kvmppc_account_exit(vcpu, DCR_EXITS);
82 emulated = EMULATE_DO_DCR;
83 }
84 113
114 case XOP_MFDCRX:
115 emulated = emulate_mfdcr(vcpu, rt,
116 kvmppc_get_gpr(vcpu, ra));
85 break; 117 break;
86 118
87 case XOP_MTDCR: 119 case XOP_MTDCR:
88 /* emulate some access in kernel */ 120 emulated = emulate_mtdcr(vcpu, rs, dcrn);
89 switch (dcrn) { 121 break;
90 case DCRN_CPR0_CONFIG_ADDR:
91 vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
92 break;
93 default:
94 run->dcr.dcrn = dcrn;
95 run->dcr.data = kvmppc_get_gpr(vcpu, rs);
96 run->dcr.is_write = 1;
97 vcpu->arch.dcr_needed = 1;
98 kvmppc_account_exit(vcpu, DCR_EXITS);
99 emulated = EMULATE_DO_DCR;
100 }
101 122
123 case XOP_MTDCRX:
124 emulated = emulate_mtdcr(vcpu, rs,
125 kvmppc_get_gpr(vcpu, ra));
102 break; 126 break;
103 127
104 case XOP_TLBWE: 128 case XOP_TLBWE:
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index f4dacb9c57fa..4730c953f435 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
20 bool 20 bool
21 select PREEMPT_NOTIFIERS 21 select PREEMPT_NOTIFIERS
22 select ANON_INODES 22 select ANON_INODES
23 select HAVE_KVM_EVENTFD
23 24
24config KVM_BOOK3S_HANDLER 25config KVM_BOOK3S_HANDLER
25 bool 26 bool
@@ -36,6 +37,7 @@ config KVM_BOOK3S_64_HANDLER
36config KVM_BOOK3S_PR 37config KVM_BOOK3S_PR
37 bool 38 bool
38 select KVM_MMIO 39 select KVM_MMIO
40 select MMU_NOTIFIER
39 41
40config KVM_BOOK3S_32 42config KVM_BOOK3S_32
41 tristate "KVM support for PowerPC book3s_32 processors" 43 tristate "KVM support for PowerPC book3s_32 processors"
@@ -123,6 +125,7 @@ config KVM_E500V2
123 depends on EXPERIMENTAL && E500 && !PPC_E500MC 125 depends on EXPERIMENTAL && E500 && !PPC_E500MC
124 select KVM 126 select KVM
125 select KVM_MMIO 127 select KVM_MMIO
128 select MMU_NOTIFIER
126 ---help--- 129 ---help---
127 Support running unmodified E500 guest kernels in virtual machines on 130 Support running unmodified E500 guest kernels in virtual machines on
128 E500v2 host processors. 131 E500v2 host processors.
@@ -138,6 +141,7 @@ config KVM_E500MC
138 select KVM 141 select KVM
139 select KVM_MMIO 142 select KVM_MMIO
140 select KVM_BOOKE_HV 143 select KVM_BOOKE_HV
144 select MMU_NOTIFIER
141 ---help--- 145 ---help---
142 Support running unmodified E500MC/E5500 (32-bit) guest kernels in 146 Support running unmodified E500MC/E5500 (32-bit) guest kernels in
143 virtual machines on E500MC/E5500 host processors. 147 virtual machines on E500MC/E5500 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index c2a08636e6d4..1e473d46322c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -6,7 +6,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
6 6
7ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm 7ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
8 8
9common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) 9common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
10 eventfd.o)
10 11
11CFLAGS_44x_tlb.o := -I. 12CFLAGS_44x_tlb.o := -I.
12CFLAGS_e500_tlb.o := -I. 13CFLAGS_e500_tlb.o := -I.
@@ -72,10 +73,12 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
72 book3s_hv_rmhandlers.o \ 73 book3s_hv_rmhandlers.o \
73 book3s_hv_rm_mmu.o \ 74 book3s_hv_rm_mmu.o \
74 book3s_64_vio_hv.o \ 75 book3s_64_vio_hv.o \
76 book3s_hv_ras.o \
75 book3s_hv_builtin.o 77 book3s_hv_builtin.o
76 78
77kvm-book3s_64-module-objs := \ 79kvm-book3s_64-module-objs := \
78 ../../../virt/kvm/kvm_main.o \ 80 ../../../virt/kvm/kvm_main.o \
81 ../../../virt/kvm/eventfd.o \
79 powerpc.o \ 82 powerpc.o \
80 emulate.o \ 83 emulate.o \
81 book3s.o \ 84 book3s.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 3f2a8360c857..a4b645285240 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
411 return 0; 411 return 0;
412} 412}
413 413
414int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
415{
416 return 0;
417}
418
419void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
420{
421}
422
414int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 423int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
415{ 424{
416 int i; 425 int i;
@@ -476,6 +485,122 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
476 return -ENOTSUPP; 485 return -ENOTSUPP;
477} 486}
478 487
488int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
489{
490 int r;
491 union kvmppc_one_reg val;
492 int size;
493 long int i;
494
495 size = one_reg_size(reg->id);
496 if (size > sizeof(val))
497 return -EINVAL;
498
499 r = kvmppc_get_one_reg(vcpu, reg->id, &val);
500
501 if (r == -EINVAL) {
502 r = 0;
503 switch (reg->id) {
504 case KVM_REG_PPC_DAR:
505 val = get_reg_val(reg->id, vcpu->arch.shared->dar);
506 break;
507 case KVM_REG_PPC_DSISR:
508 val = get_reg_val(reg->id, vcpu->arch.shared->dsisr);
509 break;
510 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
511 i = reg->id - KVM_REG_PPC_FPR0;
512 val = get_reg_val(reg->id, vcpu->arch.fpr[i]);
513 break;
514 case KVM_REG_PPC_FPSCR:
515 val = get_reg_val(reg->id, vcpu->arch.fpscr);
516 break;
517#ifdef CONFIG_ALTIVEC
518 case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
519 if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
520 r = -ENXIO;
521 break;
522 }
523 val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0];
524 break;
525 case KVM_REG_PPC_VSCR:
526 if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
527 r = -ENXIO;
528 break;
529 }
530 val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
531 break;
532#endif /* CONFIG_ALTIVEC */
533 default:
534 r = -EINVAL;
535 break;
536 }
537 }
538 if (r)
539 return r;
540
541 if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
542 r = -EFAULT;
543
544 return r;
545}
546
547int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
548{
549 int r;
550 union kvmppc_one_reg val;
551 int size;
552 long int i;
553
554 size = one_reg_size(reg->id);
555 if (size > sizeof(val))
556 return -EINVAL;
557
558 if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
559 return -EFAULT;
560
561 r = kvmppc_set_one_reg(vcpu, reg->id, &val);
562
563 if (r == -EINVAL) {
564 r = 0;
565 switch (reg->id) {
566 case KVM_REG_PPC_DAR:
567 vcpu->arch.shared->dar = set_reg_val(reg->id, val);
568 break;
569 case KVM_REG_PPC_DSISR:
570 vcpu->arch.shared->dsisr = set_reg_val(reg->id, val);
571 break;
572 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
573 i = reg->id - KVM_REG_PPC_FPR0;
574 vcpu->arch.fpr[i] = set_reg_val(reg->id, val);
575 break;
576 case KVM_REG_PPC_FPSCR:
577 vcpu->arch.fpscr = set_reg_val(reg->id, val);
578 break;
579#ifdef CONFIG_ALTIVEC
580 case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
581 if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
582 r = -ENXIO;
583 break;
584 }
585 vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
586 break;
587 case KVM_REG_PPC_VSCR:
588 if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
589 r = -ENXIO;
590 break;
591 }
592 vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
593 break;
594#endif /* CONFIG_ALTIVEC */
595 default:
596 r = -EINVAL;
597 break;
598 }
599 }
600
601 return r;
602}
603
479int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 604int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
480 struct kvm_translation *tr) 605 struct kvm_translation *tr)
481{ 606{
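For reference, the get/set handlers above back the generic KVM_GET_ONE_REG and KVM_SET_ONE_REG vcpu ioctls: userspace passes a struct kvm_one_reg whose id selects the register (for example KVM_REG_PPC_FPR(n) from the uapi header earlier in this patch) and whose addr points at a buffer of the size encoded in the id. A hedged userspace sketch, assuming a powerpc build where <linux/kvm.h> pulls in the KVM_REG_PPC_* IDs added above, with vcpu fd setup omitted:

#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>		/* assumed to provide kvm_one_reg and the PPC reg IDs */

int main(void)
{
	int vcpu_fd = -1;			/* hypothetical vcpu fd */
	uint64_t fpr0 = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_FPR(0),	/* a U64-sized register ID */
		.addr = (uintptr_t)&fpr0,	/* the kernel copies 8 bytes here */
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) == 0)
		printf("FPR0 = 0x%016llx\n", (unsigned long long)fpr0);
	return 0;
}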
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index b0f625a33345..00e619bf608e 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -155,7 +155,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
155 155
156 /* Get host physical address for gpa */ 156 /* Get host physical address for gpa */
157 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); 157 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
158 if (is_error_pfn(hpaddr)) { 158 if (is_error_noslot_pfn(hpaddr)) {
159 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", 159 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
160 orig_pte->eaddr); 160 orig_pte->eaddr);
161 r = -EINVAL; 161 r = -EINVAL;
@@ -254,6 +254,7 @@ next_pteg:
254 254
255 kvmppc_mmu_hpte_cache_map(vcpu, pte); 255 kvmppc_mmu_hpte_cache_map(vcpu, pte);
256 256
257 kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
257out: 258out:
258 return r; 259 return r;
259} 260}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 4d72f9ebc554..ead58e317294 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -93,7 +93,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
93 93
94 /* Get host physical address for gpa */ 94 /* Get host physical address for gpa */
95 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); 95 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
96 if (is_error_pfn(hpaddr)) { 96 if (is_error_noslot_pfn(hpaddr)) {
97 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); 97 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
98 r = -EINVAL; 98 r = -EINVAL;
99 goto out; 99 goto out;
@@ -171,6 +171,7 @@ map_again:
171 171
172 kvmppc_mmu_hpte_cache_map(vcpu, pte); 172 kvmppc_mmu_hpte_cache_map(vcpu, pte);
173 } 173 }
174 kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
174 175
175out: 176out:
176 return r; 177 return r;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d95d11322a15..8cc18abd6dde 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -24,6 +24,9 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
26#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
27#include <linux/srcu.h>
28#include <linux/anon_inodes.h>
29#include <linux/file.h>
27 30
28#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
29#include <asm/kvm_ppc.h> 32#include <asm/kvm_ppc.h>
@@ -40,6 +43,11 @@
40/* Power architecture requires HPT is at least 256kB */ 43/* Power architecture requires HPT is at least 256kB */
41#define PPC_MIN_HPT_ORDER 18 44#define PPC_MIN_HPT_ORDER 18
42 45
46static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
47 long pte_index, unsigned long pteh,
48 unsigned long ptel, unsigned long *pte_idx_ret);
49static void kvmppc_rmap_reset(struct kvm *kvm);
50
43long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) 51long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
44{ 52{
45 unsigned long hpt; 53 unsigned long hpt;
@@ -137,10 +145,11 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
137 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 145 /* Set the entire HPT to 0, i.e. invalid HPTEs */
138 memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); 146 memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
139 /* 147 /*
140 * Set the whole last_vcpu array to an invalid vcpu number. 148 * Reset all the reverse-mapping chains for all memslots
141 * This ensures that each vcpu will flush its TLB on next entry.
142 */ 149 */
143 memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); 150 kvmppc_rmap_reset(kvm);
151 /* Ensure that each vcpu will flush its TLB on next entry. */
152 cpumask_setall(&kvm->arch.need_tlb_flush);
144 *htab_orderp = order; 153 *htab_orderp = order;
145 err = 0; 154 err = 0;
146 } else { 155 } else {
@@ -184,6 +193,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
184 unsigned long addr, hash; 193 unsigned long addr, hash;
185 unsigned long psize; 194 unsigned long psize;
186 unsigned long hp0, hp1; 195 unsigned long hp0, hp1;
196 unsigned long idx_ret;
187 long ret; 197 long ret;
188 struct kvm *kvm = vcpu->kvm; 198 struct kvm *kvm = vcpu->kvm;
189 199
@@ -215,7 +225,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
215 hash = (hash << 3) + 7; 225 hash = (hash << 3) + 7;
216 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 226 hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
217 hp_r = hp1 | addr; 227 hp_r = hp1 | addr;
218 ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); 228 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
229 &idx_ret);
219 if (ret != H_SUCCESS) { 230 if (ret != H_SUCCESS) {
220 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 231 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
221 addr, ret); 232 addr, ret);
@@ -260,7 +271,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
260 271
261/* 272/*
262 * This is called to get a reference to a guest page if there isn't 273 * This is called to get a reference to a guest page if there isn't
263 * one already in the kvm->arch.slot_phys[][] arrays. 274 * one already in the memslot->arch.slot_phys[] array.
264 */ 275 */
265static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, 276static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
266 struct kvm_memory_slot *memslot, 277 struct kvm_memory_slot *memslot,
@@ -275,7 +286,7 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
275 struct vm_area_struct *vma; 286 struct vm_area_struct *vma;
276 unsigned long pfn, i, npages; 287 unsigned long pfn, i, npages;
277 288
278 physp = kvm->arch.slot_phys[memslot->id]; 289 physp = memslot->arch.slot_phys;
279 if (!physp) 290 if (!physp)
280 return -EINVAL; 291 return -EINVAL;
281 if (physp[gfn - memslot->base_gfn]) 292 if (physp[gfn - memslot->base_gfn])
@@ -353,15 +364,10 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
353 return err; 364 return err;
354} 365}
355 366
356/* 367long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
357 * We come here on a H_ENTER call from the guest when we are not 368 long pte_index, unsigned long pteh,
358 * using mmu notifiers and we don't have the requested page pinned 369 unsigned long ptel, unsigned long *pte_idx_ret)
359 * already.
360 */
361long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
362 long pte_index, unsigned long pteh, unsigned long ptel)
363{ 370{
364 struct kvm *kvm = vcpu->kvm;
365 unsigned long psize, gpa, gfn; 371 unsigned long psize, gpa, gfn;
366 struct kvm_memory_slot *memslot; 372 struct kvm_memory_slot *memslot;
367 long ret; 373 long ret;
@@ -389,8 +395,8 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
389 do_insert: 395 do_insert:
390 /* Protect linux PTE lookup from page table destruction */ 396 /* Protect linux PTE lookup from page table destruction */
391 rcu_read_lock_sched(); /* this disables preemption too */ 397 rcu_read_lock_sched(); /* this disables preemption too */
392 vcpu->arch.pgdir = current->mm->pgd; 398 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
393 ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); 399 current->mm->pgd, false, pte_idx_ret);
394 rcu_read_unlock_sched(); 400 rcu_read_unlock_sched();
395 if (ret == H_TOO_HARD) { 401 if (ret == H_TOO_HARD) {
396 /* this can't happen */ 402 /* this can't happen */
@@ -401,6 +407,19 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
401 407
402} 408}
403 409
410/*
411 * We come here on a H_ENTER call from the guest when we are not
412 * using mmu notifiers and we don't have the requested page pinned
413 * already.
414 */
415long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
416 long pte_index, unsigned long pteh,
417 unsigned long ptel)
418{
419 return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index,
420 pteh, ptel, &vcpu->arch.gpr[4]);
421}
422
404static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, 423static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
405 gva_t eaddr) 424 gva_t eaddr)
406{ 425{
@@ -570,7 +589,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
570 struct kvm *kvm = vcpu->kvm; 589 struct kvm *kvm = vcpu->kvm;
571 unsigned long *hptep, hpte[3], r; 590 unsigned long *hptep, hpte[3], r;
572 unsigned long mmu_seq, psize, pte_size; 591 unsigned long mmu_seq, psize, pte_size;
573 unsigned long gfn, hva, pfn; 592 unsigned long gpa, gfn, hva, pfn;
574 struct kvm_memory_slot *memslot; 593 struct kvm_memory_slot *memslot;
575 unsigned long *rmap; 594 unsigned long *rmap;
576 struct revmap_entry *rev; 595 struct revmap_entry *rev;
@@ -608,15 +627,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
608 627
609 /* Translate the logical address and get the page */ 628 /* Translate the logical address and get the page */
610 psize = hpte_page_size(hpte[0], r); 629 psize = hpte_page_size(hpte[0], r);
611 gfn = hpte_rpn(r, psize); 630 gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
631 gfn = gpa >> PAGE_SHIFT;
612 memslot = gfn_to_memslot(kvm, gfn); 632 memslot = gfn_to_memslot(kvm, gfn);
613 633
614 /* No memslot means it's an emulated MMIO region */ 634 /* No memslot means it's an emulated MMIO region */
615 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { 635 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
616 unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
617 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, 636 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
618 dsisr & DSISR_ISSTORE); 637 dsisr & DSISR_ISSTORE);
619 }
620 638
621 if (!kvm->arch.using_mmu_notifiers) 639 if (!kvm->arch.using_mmu_notifiers)
622 return -EFAULT; /* should never get here */ 640 return -EFAULT; /* should never get here */
@@ -710,7 +728,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
710 728
711 /* Check if we might have been invalidated; let the guest retry if so */ 729 /* Check if we might have been invalidated; let the guest retry if so */
712 ret = RESUME_GUEST; 730 ret = RESUME_GUEST;
713 if (mmu_notifier_retry(vcpu, mmu_seq)) { 731 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
714 unlock_rmap(rmap); 732 unlock_rmap(rmap);
715 goto out_unlock; 733 goto out_unlock;
716 } 734 }
@@ -756,6 +774,25 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
756 goto out_put; 774 goto out_put;
757} 775}
758 776
777static void kvmppc_rmap_reset(struct kvm *kvm)
778{
779 struct kvm_memslots *slots;
780 struct kvm_memory_slot *memslot;
781 int srcu_idx;
782
783 srcu_idx = srcu_read_lock(&kvm->srcu);
784 slots = kvm->memslots;
785 kvm_for_each_memslot(memslot, slots) {
786 /*
787 * This assumes it is acceptable to lose reference and
788 * change bits across a reset.
789 */
790 memset(memslot->arch.rmap, 0,
791 memslot->npages * sizeof(*memslot->arch.rmap));
792 }
793 srcu_read_unlock(&kvm->srcu, srcu_idx);
794}
795
759static int kvm_handle_hva_range(struct kvm *kvm, 796static int kvm_handle_hva_range(struct kvm *kvm,
760 unsigned long start, 797 unsigned long start,
761 unsigned long end, 798 unsigned long end,
@@ -850,7 +887,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
850 psize = hpte_page_size(hptep[0], ptel); 887 psize = hpte_page_size(hptep[0], ptel);
851 if ((hptep[0] & HPTE_V_VALID) && 888 if ((hptep[0] & HPTE_V_VALID) &&
852 hpte_rpn(ptel, psize) == gfn) { 889 hpte_rpn(ptel, psize) == gfn) {
853 hptep[0] |= HPTE_V_ABSENT; 890 if (kvm->arch.using_mmu_notifiers)
891 hptep[0] |= HPTE_V_ABSENT;
854 kvmppc_invalidate_hpte(kvm, hptep, i); 892 kvmppc_invalidate_hpte(kvm, hptep, i);
855 /* Harvest R and C */ 893 /* Harvest R and C */
856 rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); 894 rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
@@ -877,6 +915,28 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
877 return 0; 915 return 0;
878} 916}
879 917
918void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
919{
920 unsigned long *rmapp;
921 unsigned long gfn;
922 unsigned long n;
923
924 rmapp = memslot->arch.rmap;
925 gfn = memslot->base_gfn;
926 for (n = memslot->npages; n; --n) {
927 /*
928 * Testing the present bit without locking is OK because
929 * the memslot has been marked invalid already, and hence
930 * no new HPTEs referencing this page can be created,
931 * thus the present bit can't go from 0 to 1.
932 */
933 if (*rmapp & KVMPPC_RMAP_PRESENT)
934 kvm_unmap_rmapp(kvm, rmapp, gfn);
935 ++rmapp;
936 ++gfn;
937 }
938}
939
880static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 940static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
881 unsigned long gfn) 941 unsigned long gfn)
882{ 942{
@@ -1030,16 +1090,16 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
1030 return ret; 1090 return ret;
1031} 1091}
1032 1092
1033long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 1093long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
1094 unsigned long *map)
1034{ 1095{
1035 unsigned long i; 1096 unsigned long i;
1036 unsigned long *rmapp, *map; 1097 unsigned long *rmapp;
1037 1098
1038 preempt_disable(); 1099 preempt_disable();
1039 rmapp = memslot->arch.rmap; 1100 rmapp = memslot->arch.rmap;
1040 map = memslot->dirty_bitmap;
1041 for (i = 0; i < memslot->npages; ++i) { 1101 for (i = 0; i < memslot->npages; ++i) {
1042 if (kvm_test_clear_dirty(kvm, rmapp)) 1102 if (kvm_test_clear_dirty(kvm, rmapp) && map)
1043 __set_bit_le(i, map); 1103 __set_bit_le(i, map);
1044 ++rmapp; 1104 ++rmapp;
1045 } 1105 }
@@ -1057,20 +1117,22 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1057 unsigned long hva, psize, offset; 1117 unsigned long hva, psize, offset;
1058 unsigned long pa; 1118 unsigned long pa;
1059 unsigned long *physp; 1119 unsigned long *physp;
1120 int srcu_idx;
1060 1121
1122 srcu_idx = srcu_read_lock(&kvm->srcu);
1061 memslot = gfn_to_memslot(kvm, gfn); 1123 memslot = gfn_to_memslot(kvm, gfn);
1062 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1124 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1063 return NULL; 1125 goto err;
1064 if (!kvm->arch.using_mmu_notifiers) { 1126 if (!kvm->arch.using_mmu_notifiers) {
1065 physp = kvm->arch.slot_phys[memslot->id]; 1127 physp = memslot->arch.slot_phys;
1066 if (!physp) 1128 if (!physp)
1067 return NULL; 1129 goto err;
1068 physp += gfn - memslot->base_gfn; 1130 physp += gfn - memslot->base_gfn;
1069 pa = *physp; 1131 pa = *physp;
1070 if (!pa) { 1132 if (!pa) {
1071 if (kvmppc_get_guest_page(kvm, gfn, memslot, 1133 if (kvmppc_get_guest_page(kvm, gfn, memslot,
1072 PAGE_SIZE) < 0) 1134 PAGE_SIZE) < 0)
1073 return NULL; 1135 goto err;
1074 pa = *physp; 1136 pa = *physp;
1075 } 1137 }
1076 page = pfn_to_page(pa >> PAGE_SHIFT); 1138 page = pfn_to_page(pa >> PAGE_SHIFT);
@@ -1079,9 +1141,11 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1079 hva = gfn_to_hva_memslot(memslot, gfn); 1141 hva = gfn_to_hva_memslot(memslot, gfn);
1080 npages = get_user_pages_fast(hva, 1, 1, pages); 1142 npages = get_user_pages_fast(hva, 1, 1, pages);
1081 if (npages < 1) 1143 if (npages < 1)
1082 return NULL; 1144 goto err;
1083 page = pages[0]; 1145 page = pages[0];
1084 } 1146 }
1147 srcu_read_unlock(&kvm->srcu, srcu_idx);
1148
1085 psize = PAGE_SIZE; 1149 psize = PAGE_SIZE;
1086 if (PageHuge(page)) { 1150 if (PageHuge(page)) {
1087 page = compound_head(page); 1151 page = compound_head(page);
@@ -1091,6 +1155,10 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1091 if (nb_ret) 1155 if (nb_ret)
1092 *nb_ret = psize - offset; 1156 *nb_ret = psize - offset;
1093 return page_address(page) + offset; 1157 return page_address(page) + offset;
1158
1159 err:
1160 srcu_read_unlock(&kvm->srcu, srcu_idx);
1161 return NULL;
1094} 1162}
1095 1163
1096void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) 1164void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
@@ -1100,6 +1168,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
1100 put_page(page); 1168 put_page(page);
1101} 1169}
1102 1170
1171/*
1172 * Functions for reading and writing the hash table via reads and
1173 * writes on a file descriptor.
1174 *
1175 * Reads return the guest view of the hash table, which has to be
1176 * pieced together from the real hash table and the guest_rpte
1177 * values in the revmap array.
1178 *
1179 * On writes, each HPTE written is considered in turn, and if it
1180 * is valid, it is written to the HPT as if an H_ENTER with the
1181 * exact flag set was done. When the invalid count is non-zero
1182 * in the header written to the stream, the kernel will make
1183 * sure that that many HPTEs are invalid, and invalidate them
1184 * if not.
1185 */
1186
1187struct kvm_htab_ctx {
1188 unsigned long index;
1189 unsigned long flags;
1190 struct kvm *kvm;
1191 int first_pass;
1192};
1193
1194#define HPTE_SIZE (2 * sizeof(unsigned long))
1195
1196static long record_hpte(unsigned long flags, unsigned long *hptp,
1197 unsigned long *hpte, struct revmap_entry *revp,
1198 int want_valid, int first_pass)
1199{
1200 unsigned long v, r;
1201 int ok = 1;
1202 int valid, dirty;
1203
1204 /* Unmodified entries are uninteresting except on the first pass */
1205 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1206 if (!first_pass && !dirty)
1207 return 0;
1208
1209 valid = 0;
1210 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1211 valid = 1;
1212 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
1213 !(hptp[0] & HPTE_V_BOLTED))
1214 valid = 0;
1215 }
1216 if (valid != want_valid)
1217 return 0;
1218
1219 v = r = 0;
1220 if (valid || dirty) {
1221 /* lock the HPTE so it's stable and read it */
1222 preempt_disable();
1223 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1224 cpu_relax();
1225 v = hptp[0];
1226 if (v & HPTE_V_ABSENT) {
1227 v &= ~HPTE_V_ABSENT;
1228 v |= HPTE_V_VALID;
1229 }
1230 /* re-evaluate valid and dirty from synchronized HPTE value */
1231 valid = !!(v & HPTE_V_VALID);
1232 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
1233 valid = 0;
1234 r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
1235 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1236 /* only clear modified if this is the right sort of entry */
1237 if (valid == want_valid && dirty) {
1238 r &= ~HPTE_GR_MODIFIED;
1239 revp->guest_rpte = r;
1240 }
1241 asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
1242 hptp[0] &= ~HPTE_V_HVLOCK;
1243 preempt_enable();
1244 if (!(valid == want_valid && (first_pass || dirty)))
1245 ok = 0;
1246 }
1247 hpte[0] = v;
1248 hpte[1] = r;
1249 return ok;
1250}
1251
1252static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1253 size_t count, loff_t *ppos)
1254{
1255 struct kvm_htab_ctx *ctx = file->private_data;
1256 struct kvm *kvm = ctx->kvm;
1257 struct kvm_get_htab_header hdr;
1258 unsigned long *hptp;
1259 struct revmap_entry *revp;
1260 unsigned long i, nb, nw;
1261 unsigned long __user *lbuf;
1262 struct kvm_get_htab_header __user *hptr;
1263 unsigned long flags;
1264 int first_pass;
1265 unsigned long hpte[2];
1266
1267 if (!access_ok(VERIFY_WRITE, buf, count))
1268 return -EFAULT;
1269
1270 first_pass = ctx->first_pass;
1271 flags = ctx->flags;
1272
1273 i = ctx->index;
1274 hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1275 revp = kvm->arch.revmap + i;
1276 lbuf = (unsigned long __user *)buf;
1277
1278 nb = 0;
1279 while (nb + sizeof(hdr) + HPTE_SIZE < count) {
1280 /* Initialize header */
1281 hptr = (struct kvm_get_htab_header __user *)buf;
1282 hdr.n_valid = 0;
1283 hdr.n_invalid = 0;
1284 nw = nb;
1285 nb += sizeof(hdr);
1286 lbuf = (unsigned long __user *)(buf + sizeof(hdr));
1287
1288 /* Skip uninteresting entries, i.e. clean ones on passes after the first */
1289 if (!first_pass) {
1290 while (i < kvm->arch.hpt_npte &&
1291 !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
1292 ++i;
1293 hptp += 2;
1294 ++revp;
1295 }
1296 }
1297 hdr.index = i;
1298
1299 /* Grab a series of valid entries */
1300 while (i < kvm->arch.hpt_npte &&
1301 hdr.n_valid < 0xffff &&
1302 nb + HPTE_SIZE < count &&
1303 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
1304 /* valid entry, write it out */
1305 ++hdr.n_valid;
1306 if (__put_user(hpte[0], lbuf) ||
1307 __put_user(hpte[1], lbuf + 1))
1308 return -EFAULT;
1309 nb += HPTE_SIZE;
1310 lbuf += 2;
1311 ++i;
1312 hptp += 2;
1313 ++revp;
1314 }
1315 /* Now skip invalid entries while we can */
1316 while (i < kvm->arch.hpt_npte &&
1317 hdr.n_invalid < 0xffff &&
1318 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
1319 /* found an invalid entry */
1320 ++hdr.n_invalid;
1321 ++i;
1322 hptp += 2;
1323 ++revp;
1324 }
1325
1326 if (hdr.n_valid || hdr.n_invalid) {
1327 /* write back the header */
1328 if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
1329 return -EFAULT;
1330 nw = nb;
1331 buf = (char __user *)lbuf;
1332 } else {
1333 nb = nw;
1334 }
1335
1336 /* Check if we've wrapped around the hash table */
1337 if (i >= kvm->arch.hpt_npte) {
1338 i = 0;
1339 ctx->first_pass = 0;
1340 break;
1341 }
1342 }
1343
1344 ctx->index = i;
1345
1346 return nb;
1347}
1348
1349static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1350 size_t count, loff_t *ppos)
1351{
1352 struct kvm_htab_ctx *ctx = file->private_data;
1353 struct kvm *kvm = ctx->kvm;
1354 struct kvm_get_htab_header hdr;
1355 unsigned long i, j;
1356 unsigned long v, r;
1357 unsigned long __user *lbuf;
1358 unsigned long *hptp;
1359 unsigned long tmp[2];
1360 ssize_t nb;
1361 long int err, ret;
1362 int rma_setup;
1363
1364 if (!access_ok(VERIFY_READ, buf, count))
1365 return -EFAULT;
1366
1367 /* lock out vcpus from running while we're doing this */
1368 mutex_lock(&kvm->lock);
1369 rma_setup = kvm->arch.rma_setup_done;
1370 if (rma_setup) {
1371 kvm->arch.rma_setup_done = 0; /* temporarily */
1372 /* order rma_setup_done vs. vcpus_running */
1373 smp_mb();
1374 if (atomic_read(&kvm->arch.vcpus_running)) {
1375 kvm->arch.rma_setup_done = 1;
1376 mutex_unlock(&kvm->lock);
1377 return -EBUSY;
1378 }
1379 }
1380
1381 err = 0;
1382 for (nb = 0; nb + sizeof(hdr) <= count; ) {
1383 err = -EFAULT;
1384 if (__copy_from_user(&hdr, buf, sizeof(hdr)))
1385 break;
1386
1387 err = 0;
1388 if (nb + hdr.n_valid * HPTE_SIZE > count)
1389 break;
1390
1391 nb += sizeof(hdr);
1392 buf += sizeof(hdr);
1393
1394 err = -EINVAL;
1395 i = hdr.index;
1396 if (i >= kvm->arch.hpt_npte ||
1397 i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
1398 break;
1399
1400 hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1401 lbuf = (unsigned long __user *)buf;
1402 for (j = 0; j < hdr.n_valid; ++j) {
1403 err = -EFAULT;
1404 if (__get_user(v, lbuf) || __get_user(r, lbuf + 1))
1405 goto out;
1406 err = -EINVAL;
1407 if (!(v & HPTE_V_VALID))
1408 goto out;
1409 lbuf += 2;
1410 nb += HPTE_SIZE;
1411
1412 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
1413 kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1414 err = -EIO;
1415 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
1416 tmp);
1417 if (ret != H_SUCCESS) {
1418 pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
1419 "r=%lx\n", ret, i, v, r);
1420 goto out;
1421 }
1422 if (!rma_setup && is_vrma_hpte(v)) {
1423 unsigned long psize = hpte_page_size(v, r);
1424 unsigned long senc = slb_pgsize_encoding(psize);
1425 unsigned long lpcr;
1426
1427 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
1428 (VRMA_VSID << SLB_VSID_SHIFT_1T);
1429 lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
1430 lpcr |= senc << (LPCR_VRMASD_SH - 4);
1431 kvm->arch.lpcr = lpcr;
1432 rma_setup = 1;
1433 }
1434 ++i;
1435 hptp += 2;
1436 }
1437
1438 for (j = 0; j < hdr.n_invalid; ++j) {
1439 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
1440 kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1441 ++i;
1442 hptp += 2;
1443 }
1444 err = 0;
1445 }
1446
1447 out:
1448 /* Order HPTE updates vs. rma_setup_done */
1449 smp_wmb();
1450 kvm->arch.rma_setup_done = rma_setup;
1451 mutex_unlock(&kvm->lock);
1452
1453 if (err)
1454 return err;
1455 return nb;
1456}
1457
1458static int kvm_htab_release(struct inode *inode, struct file *filp)
1459{
1460 struct kvm_htab_ctx *ctx = filp->private_data;
1461
1462 filp->private_data = NULL;
1463 if (!(ctx->flags & KVM_GET_HTAB_WRITE))
1464 atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
1465 kvm_put_kvm(ctx->kvm);
1466 kfree(ctx);
1467 return 0;
1468}
1469
1470static struct file_operations kvm_htab_fops = {
1471 .read = kvm_htab_read,
1472 .write = kvm_htab_write,
1473 .llseek = default_llseek,
1474 .release = kvm_htab_release,
1475};
1476
1477int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1478{
1479 int ret;
1480 struct kvm_htab_ctx *ctx;
1481 int rwflag;
1482
1483 /* reject flags we don't recognize */
1484 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
1485 return -EINVAL;
1486 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1487 if (!ctx)
1488 return -ENOMEM;
1489 kvm_get_kvm(kvm);
1490 ctx->kvm = kvm;
1491 ctx->index = ghf->start_index;
1492 ctx->flags = ghf->flags;
1493 ctx->first_pass = 1;
1494
1495 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1496 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
1497 if (ret < 0) {
1498 kvm_put_kvm(kvm);
1499 return ret;
1500 }
1501
1502 if (rwflag == O_RDONLY) {
1503 mutex_lock(&kvm->slots_lock);
1504 atomic_inc(&kvm->arch.hpte_mod_interest);
1505 /* make sure kvmppc_do_h_enter etc. see the increment */
1506 synchronize_srcu_expedited(&kvm->srcu);
1507 mutex_unlock(&kvm->slots_lock);
1508 }
1509
1510 return ret;
1511}
1512
1103void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 1513void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
1104{ 1514{
1105 struct kvmppc_mmu *mmu = &vcpu->arch.mmu; 1515 struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index b9a989dc76cc..d31a716f7f2b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -22,6 +22,7 @@
22#include <asm/kvm_book3s.h> 22#include <asm/kvm_book3s.h>
23#include <asm/reg.h> 23#include <asm/reg.h>
24#include <asm/switch_to.h> 24#include <asm/switch_to.h>
25#include <asm/time.h>
25 26
26#define OP_19_XOP_RFID 18 27#define OP_19_XOP_RFID 18
27#define OP_19_XOP_RFI 50 28#define OP_19_XOP_RFI 50
@@ -395,6 +396,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
395 (mfmsr() & MSR_HV)) 396 (mfmsr() & MSR_HV))
396 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; 397 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
397 break; 398 break;
399 case SPRN_PURR:
400 to_book3s(vcpu)->purr_offset = spr_val - get_tb();
401 break;
402 case SPRN_SPURR:
403 to_book3s(vcpu)->spurr_offset = spr_val - get_tb();
404 break;
398 case SPRN_GQR0: 405 case SPRN_GQR0:
399 case SPRN_GQR1: 406 case SPRN_GQR1:
400 case SPRN_GQR2: 407 case SPRN_GQR2:
@@ -412,6 +419,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
412 case SPRN_CTRLF: 419 case SPRN_CTRLF:
413 case SPRN_CTRLT: 420 case SPRN_CTRLT:
414 case SPRN_L2CR: 421 case SPRN_L2CR:
422 case SPRN_DSCR:
415 case SPRN_MMCR0_GEKKO: 423 case SPRN_MMCR0_GEKKO:
416 case SPRN_MMCR1_GEKKO: 424 case SPRN_MMCR1_GEKKO:
417 case SPRN_PMC1_GEKKO: 425 case SPRN_PMC1_GEKKO:
@@ -483,9 +491,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
483 *spr_val = to_book3s(vcpu)->hid[5]; 491 *spr_val = to_book3s(vcpu)->hid[5];
484 break; 492 break;
485 case SPRN_CFAR: 493 case SPRN_CFAR:
486 case SPRN_PURR: 494 case SPRN_DSCR:
487 *spr_val = 0; 495 *spr_val = 0;
488 break; 496 break;
497 case SPRN_PURR:
498 *spr_val = get_tb() + to_book3s(vcpu)->purr_offset;
499 break;
500 case SPRN_SPURR:
501 *spr_val = get_tb() + to_book3s(vcpu)->spurr_offset;
502 break;
489 case SPRN_GQR0: 503 case SPRN_GQR0:
490 case SPRN_GQR1: 504 case SPRN_GQR1:
491 case SPRN_GQR2: 505 case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index a150817d6d4c..7057a02f0906 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -28,8 +28,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
28#ifdef CONFIG_ALTIVEC 28#ifdef CONFIG_ALTIVEC
29EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); 29EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
30#endif 30#endif
31#ifdef CONFIG_VSX
32EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
33#endif
34#endif 31#endif
35 32
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 721d4603a235..71d0c90b62bf 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -30,6 +30,7 @@
30#include <linux/cpumask.h> 30#include <linux/cpumask.h>
31#include <linux/spinlock.h> 31#include <linux/spinlock.h>
32#include <linux/page-flags.h> 32#include <linux/page-flags.h>
33#include <linux/srcu.h>
33 34
34#include <asm/reg.h> 35#include <asm/reg.h>
35#include <asm/cputable.h> 36#include <asm/cputable.h>
@@ -46,6 +47,7 @@
46#include <asm/page.h> 47#include <asm/page.h>
47#include <asm/hvcall.h> 48#include <asm/hvcall.h>
48#include <asm/switch_to.h> 49#include <asm/switch_to.h>
50#include <asm/smp.h>
49#include <linux/gfp.h> 51#include <linux/gfp.h>
50#include <linux/vmalloc.h> 52#include <linux/vmalloc.h>
51#include <linux/highmem.h> 53#include <linux/highmem.h>
@@ -55,25 +57,77 @@
55/* #define EXIT_DEBUG_SIMPLE */ 57/* #define EXIT_DEBUG_SIMPLE */
56/* #define EXIT_DEBUG_INT */ 58/* #define EXIT_DEBUG_INT */
57 59
60/* Used to indicate that a guest page fault needs to be handled */
61#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1)
62
63/* Used as a "null" value for timebase values */
64#define TB_NIL (~(u64)0)
65
58static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 66static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
59static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 67static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
60 68
69/*
70 * We use the vcpu_load/put functions to measure stolen time.
71 * Stolen time is counted as time when either the vcpu is able to
72 * run as part of a virtual core, but the task running the vcore
73 * is preempted or sleeping, or when the vcpu needs something done
74 * in the kernel by the task running the vcpu, but that task is
75 * preempted or sleeping. Those two things have to be counted
76 * separately, since one of the vcpu tasks will take on the job
77 * of running the core, and the other vcpu tasks in the vcore will
78 * sleep waiting for it to do that, but that sleep shouldn't count
79 * as stolen time.
80 *
81 * Hence we accumulate stolen time when the vcpu can run as part of
82 * a vcore using vc->stolen_tb, and the stolen time when the vcpu
83 * needs its task to do other things in the kernel (for example,
84 * service a page fault) in busy_stolen. We don't accumulate
85 * stolen time for a vcore when it is inactive, or for a vcpu
86 * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of
87 * a misnomer; it means that the vcpu task is not executing in
88 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
89 * the kernel. We don't have any way of dividing up that time
90 * between time that the vcpu is genuinely stopped, time that
91 * the task is actively working on behalf of the vcpu, and time
92 * that the task is preempted, so we don't count any of it as
93 * stolen.
94 *
95 * Updates to busy_stolen are protected by arch.tbacct_lock;
96 * updates to vc->stolen_tb are protected by the arch.tbacct_lock
97 * of the vcpu that has taken responsibility for running the vcore
98 * (i.e. vc->runner). The stolen times are measured in units of
99 * timebase ticks. (Note that the != TB_NIL checks below are
100 * purely defensive; they should never fail.)
101 */
102
61void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 103void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
62{ 104{
63 struct kvmppc_vcore *vc = vcpu->arch.vcore; 105 struct kvmppc_vcore *vc = vcpu->arch.vcore;
64 106
65 local_paca->kvm_hstate.kvm_vcpu = vcpu; 107 spin_lock(&vcpu->arch.tbacct_lock);
66 local_paca->kvm_hstate.kvm_vcore = vc; 108 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE &&
67 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) 109 vc->preempt_tb != TB_NIL) {
68 vc->stolen_tb += mftb() - vc->preempt_tb; 110 vc->stolen_tb += mftb() - vc->preempt_tb;
111 vc->preempt_tb = TB_NIL;
112 }
113 if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
114 vcpu->arch.busy_preempt != TB_NIL) {
115 vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
116 vcpu->arch.busy_preempt = TB_NIL;
117 }
118 spin_unlock(&vcpu->arch.tbacct_lock);
69} 119}
70 120
71void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) 121void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
72{ 122{
73 struct kvmppc_vcore *vc = vcpu->arch.vcore; 123 struct kvmppc_vcore *vc = vcpu->arch.vcore;
74 124
125 spin_lock(&vcpu->arch.tbacct_lock);
75 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) 126 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
76 vc->preempt_tb = mftb(); 127 vc->preempt_tb = mftb();
128 if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
129 vcpu->arch.busy_preempt = mftb();
130 spin_unlock(&vcpu->arch.tbacct_lock);
77} 131}
78 132
79void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) 133void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
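The vcpu_load/vcpu_put pair above implements a simple timestamp-and-accumulate scheme. Stripped of the vcore/vcpu split and the tbacct_lock, the underlying pattern reduces to the sketch below (illustrative only; u64 and TB_NIL are as defined in this patch, and mftb() reads the timebase).

/* Illustrative sketch only: a single stolen-time accumulator. */
struct stolen_acct {
	u64 stolen;		/* accumulated stolen timebase ticks */
	u64 preempt_tb;		/* timebase when preempted, or TB_NIL if running */
};

static void acct_preempted(struct stolen_acct *a)	/* cf. kvmppc_core_vcpu_put() */
{
	a->preempt_tb = mftb();
}

static void acct_resumed(struct stolen_acct *a)		/* cf. kvmppc_core_vcpu_load() */
{
	if (a->preempt_tb != TB_NIL) {
		a->stolen += mftb() - a->preempt_tb;
		a->preempt_tb = TB_NIL;
	}
}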
@@ -142,6 +196,22 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
142 vpa->yield_count = 1; 196 vpa->yield_count = 1;
143} 197}
144 198
199static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
200 unsigned long addr, unsigned long len)
201{
202 /* check address is cacheline aligned */
203 if (addr & (L1_CACHE_BYTES - 1))
204 return -EINVAL;
205 spin_lock(&vcpu->arch.vpa_update_lock);
206 if (v->next_gpa != addr || v->len != len) {
207 v->next_gpa = addr;
208 v->len = addr ? len : 0;
209 v->update_pending = 1;
210 }
211 spin_unlock(&vcpu->arch.vpa_update_lock);
212 return 0;
213}
214
145/* Length for a per-processor buffer is passed in at offset 4 in the buffer */ 215/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
146struct reg_vpa { 216struct reg_vpa {
147 u32 dummy; 217 u32 dummy;
@@ -317,10 +387,16 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
317 387
318static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) 388static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
319{ 389{
390 if (!(vcpu->arch.vpa.update_pending ||
391 vcpu->arch.slb_shadow.update_pending ||
392 vcpu->arch.dtl.update_pending))
393 return;
394
320 spin_lock(&vcpu->arch.vpa_update_lock); 395 spin_lock(&vcpu->arch.vpa_update_lock);
321 if (vcpu->arch.vpa.update_pending) { 396 if (vcpu->arch.vpa.update_pending) {
322 kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); 397 kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
323 init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); 398 if (vcpu->arch.vpa.pinned_addr)
399 init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
324 } 400 }
325 if (vcpu->arch.dtl.update_pending) { 401 if (vcpu->arch.dtl.update_pending) {
326 kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); 402 kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
@@ -332,24 +408,61 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
332 spin_unlock(&vcpu->arch.vpa_update_lock); 408 spin_unlock(&vcpu->arch.vpa_update_lock);
333} 409}
334 410
411/*
412 * Return the accumulated stolen time for the vcore up until `now'.
413 * The caller should hold the vcore lock.
414 */
415static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
416{
417 u64 p;
418
419 /*
420 * If we are the task running the vcore, then since we hold
421 * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb
422 * can't be updated, so we don't need the tbacct_lock.
423 * If the vcore is inactive, it can't become active (since we
424 * hold the vcore lock), so the vcpu load/put functions won't
425 * update stolen_tb/preempt_tb, and we don't need tbacct_lock.
426 */
427 if (vc->vcore_state != VCORE_INACTIVE &&
428 vc->runner->arch.run_task != current) {
429 spin_lock(&vc->runner->arch.tbacct_lock);
430 p = vc->stolen_tb;
431 if (vc->preempt_tb != TB_NIL)
432 p += now - vc->preempt_tb;
433 spin_unlock(&vc->runner->arch.tbacct_lock);
434 } else {
435 p = vc->stolen_tb;
436 }
437 return p;
438}
439
335static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, 440static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
336 struct kvmppc_vcore *vc) 441 struct kvmppc_vcore *vc)
337{ 442{
338 struct dtl_entry *dt; 443 struct dtl_entry *dt;
339 struct lppaca *vpa; 444 struct lppaca *vpa;
340 unsigned long old_stolen; 445 unsigned long stolen;
446 unsigned long core_stolen;
447 u64 now;
341 448
342 dt = vcpu->arch.dtl_ptr; 449 dt = vcpu->arch.dtl_ptr;
343 vpa = vcpu->arch.vpa.pinned_addr; 450 vpa = vcpu->arch.vpa.pinned_addr;
344 old_stolen = vcpu->arch.stolen_logged; 451 now = mftb();
345 vcpu->arch.stolen_logged = vc->stolen_tb; 452 core_stolen = vcore_stolen_time(vc, now);
453 stolen = core_stolen - vcpu->arch.stolen_logged;
454 vcpu->arch.stolen_logged = core_stolen;
455 spin_lock(&vcpu->arch.tbacct_lock);
456 stolen += vcpu->arch.busy_stolen;
457 vcpu->arch.busy_stolen = 0;
458 spin_unlock(&vcpu->arch.tbacct_lock);
346 if (!dt || !vpa) 459 if (!dt || !vpa)
347 return; 460 return;
348 memset(dt, 0, sizeof(struct dtl_entry)); 461 memset(dt, 0, sizeof(struct dtl_entry));
349 dt->dispatch_reason = 7; 462 dt->dispatch_reason = 7;
350 dt->processor_id = vc->pcpu + vcpu->arch.ptid; 463 dt->processor_id = vc->pcpu + vcpu->arch.ptid;
351 dt->timebase = mftb(); 464 dt->timebase = now;
352 dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen; 465 dt->enqueue_to_dispatch_time = stolen;
353 dt->srr0 = kvmppc_get_pc(vcpu); 466 dt->srr0 = kvmppc_get_pc(vcpu);
354 dt->srr1 = vcpu->arch.shregs.msr; 467 dt->srr1 = vcpu->arch.shregs.msr;
355 ++dt; 468 ++dt;
@@ -366,13 +479,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
366 unsigned long req = kvmppc_get_gpr(vcpu, 3); 479 unsigned long req = kvmppc_get_gpr(vcpu, 3);
367 unsigned long target, ret = H_SUCCESS; 480 unsigned long target, ret = H_SUCCESS;
368 struct kvm_vcpu *tvcpu; 481 struct kvm_vcpu *tvcpu;
482 int idx;
369 483
370 switch (req) { 484 switch (req) {
371 case H_ENTER: 485 case H_ENTER:
486 idx = srcu_read_lock(&vcpu->kvm->srcu);
372 ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), 487 ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
373 kvmppc_get_gpr(vcpu, 5), 488 kvmppc_get_gpr(vcpu, 5),
374 kvmppc_get_gpr(vcpu, 6), 489 kvmppc_get_gpr(vcpu, 6),
375 kvmppc_get_gpr(vcpu, 7)); 490 kvmppc_get_gpr(vcpu, 7));
491 srcu_read_unlock(&vcpu->kvm->srcu, idx);
376 break; 492 break;
377 case H_CEDE: 493 case H_CEDE:
378 break; 494 break;
@@ -429,6 +545,17 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
429 case BOOK3S_INTERRUPT_PERFMON: 545 case BOOK3S_INTERRUPT_PERFMON:
430 r = RESUME_GUEST; 546 r = RESUME_GUEST;
431 break; 547 break;
548 case BOOK3S_INTERRUPT_MACHINE_CHECK:
549 /*
550 * Deliver a machine check interrupt to the guest.
551 * We have to do this, even if the host has handled the
552 * machine check, because machine checks use SRR0/1 and
553 * the interrupt might have trashed guest state in them.
554 */
555 kvmppc_book3s_queue_irqprio(vcpu,
556 BOOK3S_INTERRUPT_MACHINE_CHECK);
557 r = RESUME_GUEST;
558 break;
432 case BOOK3S_INTERRUPT_PROGRAM: 559 case BOOK3S_INTERRUPT_PROGRAM:
433 { 560 {
434 ulong flags; 561 ulong flags;
@@ -470,12 +597,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
470 * have been handled already. 597 * have been handled already.
471 */ 598 */
472 case BOOK3S_INTERRUPT_H_DATA_STORAGE: 599 case BOOK3S_INTERRUPT_H_DATA_STORAGE:
473 r = kvmppc_book3s_hv_page_fault(run, vcpu, 600 r = RESUME_PAGE_FAULT;
474 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
475 break; 601 break;
476 case BOOK3S_INTERRUPT_H_INST_STORAGE: 602 case BOOK3S_INTERRUPT_H_INST_STORAGE:
477 r = kvmppc_book3s_hv_page_fault(run, vcpu, 603 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
478 kvmppc_get_pc(vcpu), 0); 604 vcpu->arch.fault_dsisr = 0;
605 r = RESUME_PAGE_FAULT;
479 break; 606 break;
480 /* 607 /*
481 * This occurs if the guest executes an illegal instruction. 608 * This occurs if the guest executes an illegal instruction.
@@ -535,36 +662,174 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
535 return 0; 662 return 0;
536} 663}
537 664
538int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 665int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
539{ 666{
540 int r = -EINVAL; 667 int r = 0;
668 long int i;
541 669
542 switch (reg->id) { 670 switch (id) {
543 case KVM_REG_PPC_HIOR: 671 case KVM_REG_PPC_HIOR:
544 r = put_user(0, (u64 __user *)reg->addr); 672 *val = get_reg_val(id, 0);
673 break;
674 case KVM_REG_PPC_DABR:
675 *val = get_reg_val(id, vcpu->arch.dabr);
676 break;
677 case KVM_REG_PPC_DSCR:
678 *val = get_reg_val(id, vcpu->arch.dscr);
679 break;
680 case KVM_REG_PPC_PURR:
681 *val = get_reg_val(id, vcpu->arch.purr);
682 break;
683 case KVM_REG_PPC_SPURR:
684 *val = get_reg_val(id, vcpu->arch.spurr);
685 break;
686 case KVM_REG_PPC_AMR:
687 *val = get_reg_val(id, vcpu->arch.amr);
688 break;
689 case KVM_REG_PPC_UAMOR:
690 *val = get_reg_val(id, vcpu->arch.uamor);
691 break;
692 case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
693 i = id - KVM_REG_PPC_MMCR0;
694 *val = get_reg_val(id, vcpu->arch.mmcr[i]);
695 break;
696 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
697 i = id - KVM_REG_PPC_PMC1;
698 *val = get_reg_val(id, vcpu->arch.pmc[i]);
699 break;
700#ifdef CONFIG_VSX
701 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
702 if (cpu_has_feature(CPU_FTR_VSX)) {
703 /* VSX => FP reg i is stored in arch.vsr[2*i] */
704 long int i = id - KVM_REG_PPC_FPR0;
705 *val = get_reg_val(id, vcpu->arch.vsr[2 * i]);
706 } else {
707 /* let generic code handle it */
708 r = -EINVAL;
709 }
710 break;
711 case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
712 if (cpu_has_feature(CPU_FTR_VSX)) {
713 long int i = id - KVM_REG_PPC_VSR0;
714 val->vsxval[0] = vcpu->arch.vsr[2 * i];
715 val->vsxval[1] = vcpu->arch.vsr[2 * i + 1];
716 } else {
717 r = -ENXIO;
718 }
719 break;
720#endif /* CONFIG_VSX */
721 case KVM_REG_PPC_VPA_ADDR:
722 spin_lock(&vcpu->arch.vpa_update_lock);
723 *val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
724 spin_unlock(&vcpu->arch.vpa_update_lock);
725 break;
726 case KVM_REG_PPC_VPA_SLB:
727 spin_lock(&vcpu->arch.vpa_update_lock);
728 val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
729 val->vpaval.length = vcpu->arch.slb_shadow.len;
730 spin_unlock(&vcpu->arch.vpa_update_lock);
731 break;
732 case KVM_REG_PPC_VPA_DTL:
733 spin_lock(&vcpu->arch.vpa_update_lock);
734 val->vpaval.addr = vcpu->arch.dtl.next_gpa;
735 val->vpaval.length = vcpu->arch.dtl.len;
736 spin_unlock(&vcpu->arch.vpa_update_lock);
545 break; 737 break;
546 default: 738 default:
739 r = -EINVAL;
547 break; 740 break;
548 } 741 }
549 742
550 return r; 743 return r;
551} 744}
552 745
553int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 746int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
554{ 747{
555 int r = -EINVAL; 748 int r = 0;
749 long int i;
750 unsigned long addr, len;
556 751
557 switch (reg->id) { 752 switch (id) {
558 case KVM_REG_PPC_HIOR: 753 case KVM_REG_PPC_HIOR:
559 {
560 u64 hior;
561 /* Only allow this to be set to zero */ 754 /* Only allow this to be set to zero */
562 r = get_user(hior, (u64 __user *)reg->addr); 755 if (set_reg_val(id, *val))
563 if (!r && (hior != 0))
564 r = -EINVAL; 756 r = -EINVAL;
565 break; 757 break;
566 } 758 case KVM_REG_PPC_DABR:
759 vcpu->arch.dabr = set_reg_val(id, *val);
760 break;
761 case KVM_REG_PPC_DSCR:
762 vcpu->arch.dscr = set_reg_val(id, *val);
763 break;
764 case KVM_REG_PPC_PURR:
765 vcpu->arch.purr = set_reg_val(id, *val);
766 break;
767 case KVM_REG_PPC_SPURR:
768 vcpu->arch.spurr = set_reg_val(id, *val);
769 break;
770 case KVM_REG_PPC_AMR:
771 vcpu->arch.amr = set_reg_val(id, *val);
772 break;
773 case KVM_REG_PPC_UAMOR:
774 vcpu->arch.uamor = set_reg_val(id, *val);
775 break;
776 case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
777 i = id - KVM_REG_PPC_MMCR0;
778 vcpu->arch.mmcr[i] = set_reg_val(id, *val);
779 break;
780 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
781 i = id - KVM_REG_PPC_PMC1;
782 vcpu->arch.pmc[i] = set_reg_val(id, *val);
783 break;
784#ifdef CONFIG_VSX
785 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
786 if (cpu_has_feature(CPU_FTR_VSX)) {
787 /* VSX => FP reg i is stored in arch.vsr[2*i] */
788 long int i = id - KVM_REG_PPC_FPR0;
789 vcpu->arch.vsr[2 * i] = set_reg_val(id, *val);
790 } else {
791 /* let generic code handle it */
792 r = -EINVAL;
793 }
794 break;
795 case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
796 if (cpu_has_feature(CPU_FTR_VSX)) {
797 long int i = id - KVM_REG_PPC_VSR0;
798 vcpu->arch.vsr[2 * i] = val->vsxval[0];
799 vcpu->arch.vsr[2 * i + 1] = val->vsxval[1];
800 } else {
801 r = -ENXIO;
802 }
803 break;
804#endif /* CONFIG_VSX */
805 case KVM_REG_PPC_VPA_ADDR:
806 addr = set_reg_val(id, *val);
807 r = -EINVAL;
808 if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
809 vcpu->arch.dtl.next_gpa))
810 break;
811 r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
812 break;
813 case KVM_REG_PPC_VPA_SLB:
814 addr = val->vpaval.addr;
815 len = val->vpaval.length;
816 r = -EINVAL;
817 if (addr && !vcpu->arch.vpa.next_gpa)
818 break;
819 r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
820 break;
821 case KVM_REG_PPC_VPA_DTL:
822 addr = val->vpaval.addr;
823 len = val->vpaval.length;
824 r = -EINVAL;
825 if (addr && (len < sizeof(struct dtl_entry) ||
826 !vcpu->arch.vpa.next_gpa))
827 break;
828 len -= len % sizeof(struct dtl_entry);
829 r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
830 break;
567 default: 831 default:
832 r = -EINVAL;
568 break; 833 break;
569 } 834 }
570 835
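These accessors are reached through the generic KVM_GET_ONE_REG/KVM_SET_ONE_REG vcpu ioctls. As a minimal illustration (not part of the patch), userspace fetches one of the ids handled above as in the sketch below, assuming KVM_REG_PPC_DSCR from the uapi register list added in this series.

/* Illustrative sketch only: read the guest DSCR via the ONE_REG interface. */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int get_guest_dscr(int vcpufd, uint64_t *dscr)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_DSCR,
		.addr = (uintptr_t)dscr,	/* kernel writes the 64-bit value here */
	};

	return ioctl(vcpufd, KVM_GET_ONE_REG, &reg);
}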
@@ -599,20 +864,18 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
599 goto free_vcpu; 864 goto free_vcpu;
600 865
601 vcpu->arch.shared = &vcpu->arch.shregs; 866 vcpu->arch.shared = &vcpu->arch.shregs;
602 vcpu->arch.last_cpu = -1;
603 vcpu->arch.mmcr[0] = MMCR0_FC; 867 vcpu->arch.mmcr[0] = MMCR0_FC;
604 vcpu->arch.ctrl = CTRL_RUNLATCH; 868 vcpu->arch.ctrl = CTRL_RUNLATCH;
605 /* default to host PVR, since we can't spoof it */ 869 /* default to host PVR, since we can't spoof it */
606 vcpu->arch.pvr = mfspr(SPRN_PVR); 870 vcpu->arch.pvr = mfspr(SPRN_PVR);
607 kvmppc_set_pvr(vcpu, vcpu->arch.pvr); 871 kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
608 spin_lock_init(&vcpu->arch.vpa_update_lock); 872 spin_lock_init(&vcpu->arch.vpa_update_lock);
873 spin_lock_init(&vcpu->arch.tbacct_lock);
874 vcpu->arch.busy_preempt = TB_NIL;
609 875
610 kvmppc_mmu_book3s_hv_init(vcpu); 876 kvmppc_mmu_book3s_hv_init(vcpu);
611 877
612 /* 878 vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
613 * We consider the vcpu stopped until we see the first run ioctl for it.
614 */
615 vcpu->arch.state = KVMPPC_VCPU_STOPPED;
616 879
617 init_waitqueue_head(&vcpu->arch.cpu_run); 880 init_waitqueue_head(&vcpu->arch.cpu_run);
618 881
@@ -624,9 +887,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
624 INIT_LIST_HEAD(&vcore->runnable_threads); 887 INIT_LIST_HEAD(&vcore->runnable_threads);
625 spin_lock_init(&vcore->lock); 888 spin_lock_init(&vcore->lock);
626 init_waitqueue_head(&vcore->wq); 889 init_waitqueue_head(&vcore->wq);
627 vcore->preempt_tb = mftb(); 890 vcore->preempt_tb = TB_NIL;
628 } 891 }
629 kvm->arch.vcores[core] = vcore; 892 kvm->arch.vcores[core] = vcore;
893 kvm->arch.online_vcores++;
630 } 894 }
631 mutex_unlock(&kvm->lock); 895 mutex_unlock(&kvm->lock);
632 896
@@ -637,7 +901,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
637 ++vcore->num_threads; 901 ++vcore->num_threads;
638 spin_unlock(&vcore->lock); 902 spin_unlock(&vcore->lock);
639 vcpu->arch.vcore = vcore; 903 vcpu->arch.vcore = vcore;
640 vcpu->arch.stolen_logged = vcore->stolen_tb;
641 904
642 vcpu->arch.cpu_type = KVM_CPU_3S_64; 905 vcpu->arch.cpu_type = KVM_CPU_3S_64;
643 kvmppc_sanity_check(vcpu); 906 kvmppc_sanity_check(vcpu);
@@ -697,17 +960,18 @@ extern void xics_wake_cpu(int cpu);
697static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, 960static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
698 struct kvm_vcpu *vcpu) 961 struct kvm_vcpu *vcpu)
699{ 962{
700 struct kvm_vcpu *v; 963 u64 now;
701 964
702 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 965 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
703 return; 966 return;
967 spin_lock(&vcpu->arch.tbacct_lock);
968 now = mftb();
969 vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
970 vcpu->arch.stolen_logged;
971 vcpu->arch.busy_preempt = now;
704 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 972 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
973 spin_unlock(&vcpu->arch.tbacct_lock);
705 --vc->n_runnable; 974 --vc->n_runnable;
706 ++vc->n_busy;
707 /* decrement the physical thread id of each following vcpu */
708 v = vcpu;
709 list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
710 --v->arch.ptid;
711 list_del(&vcpu->arch.run_list); 975 list_del(&vcpu->arch.run_list);
712} 976}
713 977
@@ -720,6 +984,7 @@ static int kvmppc_grab_hwthread(int cpu)
720 984
721 /* Ensure the thread won't go into the kernel if it wakes */ 985 /* Ensure the thread won't go into the kernel if it wakes */
722 tpaca->kvm_hstate.hwthread_req = 1; 986 tpaca->kvm_hstate.hwthread_req = 1;
987 tpaca->kvm_hstate.kvm_vcpu = NULL;
723 988
724 /* 989 /*
725 * If the thread is already executing in the kernel (e.g. handling 990 * If the thread is already executing in the kernel (e.g. handling
@@ -769,7 +1034,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
769 smp_wmb(); 1034 smp_wmb();
770#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 1035#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
771 if (vcpu->arch.ptid) { 1036 if (vcpu->arch.ptid) {
772 kvmppc_grab_hwthread(cpu);
773 xics_wake_cpu(cpu); 1037 xics_wake_cpu(cpu);
774 ++vc->n_woken; 1038 ++vc->n_woken;
775 } 1039 }
@@ -795,7 +1059,8 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
795 1059
796/* 1060/*
797 * Check that we are on thread 0 and that any other threads in 1061 * Check that we are on thread 0 and that any other threads in
798 * this core are off-line. 1062 * this core are off-line. Then grab the threads so they can't
1063 * enter the kernel.
799 */ 1064 */
800static int on_primary_thread(void) 1065static int on_primary_thread(void)
801{ 1066{
@@ -807,6 +1072,17 @@ static int on_primary_thread(void)
807 while (++thr < threads_per_core) 1072 while (++thr < threads_per_core)
808 if (cpu_online(cpu + thr)) 1073 if (cpu_online(cpu + thr))
809 return 0; 1074 return 0;
1075
1076 /* Grab all hw threads so they can't go into the kernel */
1077 for (thr = 1; thr < threads_per_core; ++thr) {
1078 if (kvmppc_grab_hwthread(cpu + thr)) {
1079 /* Couldn't grab one; let the others go */
1080 do {
1081 kvmppc_release_hwthread(cpu + thr);
1082 } while (--thr > 0);
1083 return 0;
1084 }
1085 }
810 return 1; 1086 return 1;
811} 1087}
812 1088
@@ -814,21 +1090,24 @@ static int on_primary_thread(void)
814 * Run a set of guest threads on a physical core. 1090 * Run a set of guest threads on a physical core.
815 * Called with vc->lock held. 1091 * Called with vc->lock held.
816 */ 1092 */
817static int kvmppc_run_core(struct kvmppc_vcore *vc) 1093static void kvmppc_run_core(struct kvmppc_vcore *vc)
818{ 1094{
819 struct kvm_vcpu *vcpu, *vcpu0, *vnext; 1095 struct kvm_vcpu *vcpu, *vcpu0, *vnext;
820 long ret; 1096 long ret;
821 u64 now; 1097 u64 now;
822 int ptid, i, need_vpa_update; 1098 int ptid, i, need_vpa_update;
1099 int srcu_idx;
1100 struct kvm_vcpu *vcpus_to_update[threads_per_core];
823 1101
824 /* don't start if any threads have a signal pending */ 1102 /* don't start if any threads have a signal pending */
825 need_vpa_update = 0; 1103 need_vpa_update = 0;
826 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1104 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
827 if (signal_pending(vcpu->arch.run_task)) 1105 if (signal_pending(vcpu->arch.run_task))
828 return 0; 1106 return;
829 need_vpa_update |= vcpu->arch.vpa.update_pending | 1107 if (vcpu->arch.vpa.update_pending ||
830 vcpu->arch.slb_shadow.update_pending | 1108 vcpu->arch.slb_shadow.update_pending ||
831 vcpu->arch.dtl.update_pending; 1109 vcpu->arch.dtl.update_pending)
1110 vcpus_to_update[need_vpa_update++] = vcpu;
832 } 1111 }
833 1112
834 /* 1113 /*
@@ -838,7 +1117,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
838 vc->n_woken = 0; 1117 vc->n_woken = 0;
839 vc->nap_count = 0; 1118 vc->nap_count = 0;
840 vc->entry_exit_count = 0; 1119 vc->entry_exit_count = 0;
841 vc->vcore_state = VCORE_RUNNING; 1120 vc->vcore_state = VCORE_STARTING;
842 vc->in_guest = 0; 1121 vc->in_guest = 0;
843 vc->napping_threads = 0; 1122 vc->napping_threads = 0;
844 1123
@@ -848,24 +1127,12 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
848 */ 1127 */
849 if (need_vpa_update) { 1128 if (need_vpa_update) {
850 spin_unlock(&vc->lock); 1129 spin_unlock(&vc->lock);
851 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1130 for (i = 0; i < need_vpa_update; ++i)
852 kvmppc_update_vpas(vcpu); 1131 kvmppc_update_vpas(vcpus_to_update[i]);
853 spin_lock(&vc->lock); 1132 spin_lock(&vc->lock);
854 } 1133 }
855 1134
856 /* 1135 /*
857 * Make sure we are running on thread 0, and that
858 * secondary threads are offline.
859 * XXX we should also block attempts to bring any
860 * secondary threads online.
861 */
862 if (threads_per_core > 1 && !on_primary_thread()) {
863 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
864 vcpu->arch.ret = -EBUSY;
865 goto out;
866 }
867
868 /*
869 * Assign physical thread IDs, first to non-ceded vcpus 1136 * Assign physical thread IDs, first to non-ceded vcpus
870 * and then to ceded ones. 1137 * and then to ceded ones.
871 */ 1138 */
@@ -879,28 +1146,36 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
879 } 1146 }
880 } 1147 }
881 if (!vcpu0) 1148 if (!vcpu0)
882 return 0; /* nothing to run */ 1149 goto out; /* nothing to run; should never happen */
883 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1150 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
884 if (vcpu->arch.ceded) 1151 if (vcpu->arch.ceded)
885 vcpu->arch.ptid = ptid++; 1152 vcpu->arch.ptid = ptid++;
886 1153
887 vc->stolen_tb += mftb() - vc->preempt_tb; 1154 /*
1155 * Make sure we are running on thread 0, and that
1156 * secondary threads are offline.
1157 */
1158 if (threads_per_core > 1 && !on_primary_thread()) {
1159 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
1160 vcpu->arch.ret = -EBUSY;
1161 goto out;
1162 }
1163
888 vc->pcpu = smp_processor_id(); 1164 vc->pcpu = smp_processor_id();
889 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1165 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
890 kvmppc_start_thread(vcpu); 1166 kvmppc_start_thread(vcpu);
891 kvmppc_create_dtl_entry(vcpu, vc); 1167 kvmppc_create_dtl_entry(vcpu, vc);
892 } 1168 }
893 /* Grab any remaining hw threads so they can't go into the kernel */
894 for (i = ptid; i < threads_per_core; ++i)
895 kvmppc_grab_hwthread(vc->pcpu + i);
896 1169
1170 vc->vcore_state = VCORE_RUNNING;
897 preempt_disable(); 1171 preempt_disable();
898 spin_unlock(&vc->lock); 1172 spin_unlock(&vc->lock);
899 1173
900 kvm_guest_enter(); 1174 kvm_guest_enter();
1175
1176 srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu);
1177
901 __kvmppc_vcore_entry(NULL, vcpu0); 1178 __kvmppc_vcore_entry(NULL, vcpu0);
902 for (i = 0; i < threads_per_core; ++i)
903 kvmppc_release_hwthread(vc->pcpu + i);
904 1179
905 spin_lock(&vc->lock); 1180 spin_lock(&vc->lock);
906 /* disable sending of IPIs on virtual external irqs */ 1181 /* disable sending of IPIs on virtual external irqs */
@@ -909,10 +1184,14 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
909 /* wait for secondary threads to finish writing their state to memory */ 1184 /* wait for secondary threads to finish writing their state to memory */
910 if (vc->nap_count < vc->n_woken) 1185 if (vc->nap_count < vc->n_woken)
911 kvmppc_wait_for_nap(vc); 1186 kvmppc_wait_for_nap(vc);
1187 for (i = 0; i < threads_per_core; ++i)
1188 kvmppc_release_hwthread(vc->pcpu + i);
912 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ 1189 /* prevent other vcpu threads from doing kvmppc_start_thread() now */
913 vc->vcore_state = VCORE_EXITING; 1190 vc->vcore_state = VCORE_EXITING;
914 spin_unlock(&vc->lock); 1191 spin_unlock(&vc->lock);
915 1192
1193 srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx);
1194
916 /* make sure updates to secondary vcpu structs are visible now */ 1195 /* make sure updates to secondary vcpu structs are visible now */
917 smp_mb(); 1196 smp_mb();
918 kvm_guest_exit(); 1197 kvm_guest_exit();
@@ -920,6 +1199,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
920 preempt_enable(); 1199 preempt_enable();
921 kvm_resched(vcpu); 1200 kvm_resched(vcpu);
922 1201
1202 spin_lock(&vc->lock);
923 now = get_tb(); 1203 now = get_tb();
924 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1204 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
925 /* cancel pending dec exception if dec is positive */ 1205 /* cancel pending dec exception if dec is positive */
@@ -943,10 +1223,8 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
943 } 1223 }
944 } 1224 }
945 1225
946 spin_lock(&vc->lock);
947 out: 1226 out:
948 vc->vcore_state = VCORE_INACTIVE; 1227 vc->vcore_state = VCORE_INACTIVE;
949 vc->preempt_tb = mftb();
950 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 1228 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
951 arch.run_list) { 1229 arch.run_list) {
952 if (vcpu->arch.ret != RESUME_GUEST) { 1230 if (vcpu->arch.ret != RESUME_GUEST) {
@@ -954,8 +1232,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
954 wake_up(&vcpu->arch.cpu_run); 1232 wake_up(&vcpu->arch.cpu_run);
955 } 1233 }
956 } 1234 }
957
958 return 1;
959} 1235}
960 1236
961/* 1237/*
@@ -979,20 +1255,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
979static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) 1255static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
980{ 1256{
981 DEFINE_WAIT(wait); 1257 DEFINE_WAIT(wait);
982 struct kvm_vcpu *v;
983 int all_idle = 1;
984 1258
985 prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 1259 prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
986 vc->vcore_state = VCORE_SLEEPING; 1260 vc->vcore_state = VCORE_SLEEPING;
987 spin_unlock(&vc->lock); 1261 spin_unlock(&vc->lock);
988 list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { 1262 schedule();
989 if (!v->arch.ceded || v->arch.pending_exceptions) {
990 all_idle = 0;
991 break;
992 }
993 }
994 if (all_idle)
995 schedule();
996 finish_wait(&vc->wq, &wait); 1263 finish_wait(&vc->wq, &wait);
997 spin_lock(&vc->lock); 1264 spin_lock(&vc->lock);
998 vc->vcore_state = VCORE_INACTIVE; 1265 vc->vcore_state = VCORE_INACTIVE;
@@ -1001,13 +1268,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
1001static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 1268static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1002{ 1269{
1003 int n_ceded; 1270 int n_ceded;
1004 int prev_state;
1005 struct kvmppc_vcore *vc; 1271 struct kvmppc_vcore *vc;
1006 struct kvm_vcpu *v, *vn; 1272 struct kvm_vcpu *v, *vn;
1007 1273
1008 kvm_run->exit_reason = 0; 1274 kvm_run->exit_reason = 0;
1009 vcpu->arch.ret = RESUME_GUEST; 1275 vcpu->arch.ret = RESUME_GUEST;
1010 vcpu->arch.trap = 0; 1276 vcpu->arch.trap = 0;
1277 kvmppc_update_vpas(vcpu);
1011 1278
1012 /* 1279 /*
1013 * Synchronize with other threads in this virtual core 1280 * Synchronize with other threads in this virtual core
@@ -1017,8 +1284,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1017 vcpu->arch.ceded = 0; 1284 vcpu->arch.ceded = 0;
1018 vcpu->arch.run_task = current; 1285 vcpu->arch.run_task = current;
1019 vcpu->arch.kvm_run = kvm_run; 1286 vcpu->arch.kvm_run = kvm_run;
1020 prev_state = vcpu->arch.state; 1287 vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
1021 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; 1288 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
1289 vcpu->arch.busy_preempt = TB_NIL;
1022 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); 1290 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
1023 ++vc->n_runnable; 1291 ++vc->n_runnable;
1024 1292
@@ -1027,33 +1295,26 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1027 * If the vcore is already running, we may be able to start 1295 * If the vcore is already running, we may be able to start
1028 * this thread straight away and have it join in. 1296 * this thread straight away and have it join in.
1029 */ 1297 */
1030 if (prev_state == KVMPPC_VCPU_STOPPED) { 1298 if (!signal_pending(current)) {
1031 if (vc->vcore_state == VCORE_RUNNING && 1299 if (vc->vcore_state == VCORE_RUNNING &&
1032 VCORE_EXIT_COUNT(vc) == 0) { 1300 VCORE_EXIT_COUNT(vc) == 0) {
1033 vcpu->arch.ptid = vc->n_runnable - 1; 1301 vcpu->arch.ptid = vc->n_runnable - 1;
1302 kvmppc_create_dtl_entry(vcpu, vc);
1034 kvmppc_start_thread(vcpu); 1303 kvmppc_start_thread(vcpu);
1304 } else if (vc->vcore_state == VCORE_SLEEPING) {
1305 wake_up(&vc->wq);
1035 } 1306 }
1036 1307
1037 } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) 1308 }
1038 --vc->n_busy;
1039 1309
1040 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 1310 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
1041 !signal_pending(current)) { 1311 !signal_pending(current)) {
1042 if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { 1312 if (vc->vcore_state != VCORE_INACTIVE) {
1043 spin_unlock(&vc->lock); 1313 spin_unlock(&vc->lock);
1044 kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); 1314 kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
1045 spin_lock(&vc->lock); 1315 spin_lock(&vc->lock);
1046 continue; 1316 continue;
1047 } 1317 }
1048 vc->runner = vcpu;
1049 n_ceded = 0;
1050 list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
1051 n_ceded += v->arch.ceded;
1052 if (n_ceded == vc->n_runnable)
1053 kvmppc_vcore_blocked(vc);
1054 else
1055 kvmppc_run_core(vc);
1056
1057 list_for_each_entry_safe(v, vn, &vc->runnable_threads, 1318 list_for_each_entry_safe(v, vn, &vc->runnable_threads,
1058 arch.run_list) { 1319 arch.run_list) {
1059 kvmppc_core_prepare_to_enter(v); 1320 kvmppc_core_prepare_to_enter(v);
@@ -1065,22 +1326,40 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1065 wake_up(&v->arch.cpu_run); 1326 wake_up(&v->arch.cpu_run);
1066 } 1327 }
1067 } 1328 }
1329 if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
1330 break;
1331 vc->runner = vcpu;
1332 n_ceded = 0;
1333 list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
1334 if (!v->arch.pending_exceptions)
1335 n_ceded += v->arch.ceded;
1336 if (n_ceded == vc->n_runnable)
1337 kvmppc_vcore_blocked(vc);
1338 else
1339 kvmppc_run_core(vc);
1068 vc->runner = NULL; 1340 vc->runner = NULL;
1069 } 1341 }
1070 1342
1071 if (signal_pending(current)) { 1343 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
1072 if (vc->vcore_state == VCORE_RUNNING || 1344 (vc->vcore_state == VCORE_RUNNING ||
1073 vc->vcore_state == VCORE_EXITING) { 1345 vc->vcore_state == VCORE_EXITING)) {
1074 spin_unlock(&vc->lock); 1346 spin_unlock(&vc->lock);
1075 kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); 1347 kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
1076 spin_lock(&vc->lock); 1348 spin_lock(&vc->lock);
1077 } 1349 }
1078 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 1350
1079 kvmppc_remove_runnable(vc, vcpu); 1351 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
1080 vcpu->stat.signal_exits++; 1352 kvmppc_remove_runnable(vc, vcpu);
1081 kvm_run->exit_reason = KVM_EXIT_INTR; 1353 vcpu->stat.signal_exits++;
1082 vcpu->arch.ret = -EINTR; 1354 kvm_run->exit_reason = KVM_EXIT_INTR;
1083 } 1355 vcpu->arch.ret = -EINTR;
1356 }
1357
1358 if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
1359 /* Wake up some vcpu to run the core */
1360 v = list_first_entry(&vc->runnable_threads,
1361 struct kvm_vcpu, arch.run_list);
1362 wake_up(&v->arch.cpu_run);
1084 } 1363 }
1085 1364
1086 spin_unlock(&vc->lock); 1365 spin_unlock(&vc->lock);
@@ -1090,6 +1369,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1090int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) 1369int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1091{ 1370{
1092 int r; 1371 int r;
1372 int srcu_idx;
1093 1373
1094 if (!vcpu->arch.sane) { 1374 if (!vcpu->arch.sane) {
1095 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 1375 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1120,6 +1400,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1120 flush_vsx_to_thread(current); 1400 flush_vsx_to_thread(current);
1121 vcpu->arch.wqp = &vcpu->arch.vcore->wq; 1401 vcpu->arch.wqp = &vcpu->arch.vcore->wq;
1122 vcpu->arch.pgdir = current->mm->pgd; 1402 vcpu->arch.pgdir = current->mm->pgd;
1403 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
1123 1404
1124 do { 1405 do {
1125 r = kvmppc_run_vcpu(run, vcpu); 1406 r = kvmppc_run_vcpu(run, vcpu);
@@ -1128,10 +1409,16 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1128 !(vcpu->arch.shregs.msr & MSR_PR)) { 1409 !(vcpu->arch.shregs.msr & MSR_PR)) {
1129 r = kvmppc_pseries_do_hcall(vcpu); 1410 r = kvmppc_pseries_do_hcall(vcpu);
1130 kvmppc_core_prepare_to_enter(vcpu); 1411 kvmppc_core_prepare_to_enter(vcpu);
1412 } else if (r == RESUME_PAGE_FAULT) {
1413 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1414 r = kvmppc_book3s_hv_page_fault(run, vcpu,
1415 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
1416 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1131 } 1417 }
1132 } while (r == RESUME_GUEST); 1418 } while (r == RESUME_GUEST);
1133 1419
1134 out: 1420 out:
1421 vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
1135 atomic_dec(&vcpu->kvm->arch.vcpus_running); 1422 atomic_dec(&vcpu->kvm->arch.vcpus_running);
1136 return r; 1423 return r;
1137} 1424}
@@ -1273,7 +1560,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1273 n = kvm_dirty_bitmap_bytes(memslot); 1560 n = kvm_dirty_bitmap_bytes(memslot);
1274 memset(memslot->dirty_bitmap, 0, n); 1561 memset(memslot->dirty_bitmap, 0, n);
1275 1562
1276 r = kvmppc_hv_get_dirty_log(kvm, memslot); 1563 r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
1277 if (r) 1564 if (r)
1278 goto out; 1565 goto out;
1279 1566
@@ -1287,67 +1574,88 @@ out:
1287 return r; 1574 return r;
1288} 1575}
1289 1576
1290static unsigned long slb_pgsize_encoding(unsigned long psize) 1577static void unpin_slot(struct kvm_memory_slot *memslot)
1291{ 1578{
1292 unsigned long senc = 0; 1579 unsigned long *physp;
1580 unsigned long j, npages, pfn;
1581 struct page *page;
1293 1582
1294 if (psize > 0x1000) { 1583 physp = memslot->arch.slot_phys;
1295 senc = SLB_VSID_L; 1584 npages = memslot->npages;
1296 if (psize == 0x10000) 1585 if (!physp)
1297 senc |= SLB_VSID_LP_01; 1586 return;
1587 for (j = 0; j < npages; j++) {
1588 if (!(physp[j] & KVMPPC_GOT_PAGE))
1589 continue;
1590 pfn = physp[j] >> PAGE_SHIFT;
1591 page = pfn_to_page(pfn);
1592 SetPageDirty(page);
1593 put_page(page);
1594 }
1595}
1596
1597void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
1598 struct kvm_memory_slot *dont)
1599{
1600 if (!dont || free->arch.rmap != dont->arch.rmap) {
1601 vfree(free->arch.rmap);
1602 free->arch.rmap = NULL;
1603 }
1604 if (!dont || free->arch.slot_phys != dont->arch.slot_phys) {
1605 unpin_slot(free);
1606 vfree(free->arch.slot_phys);
1607 free->arch.slot_phys = NULL;
1298 } 1608 }
1299 return senc; 1609}
1610
1611int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
1612 unsigned long npages)
1613{
1614 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
1615 if (!slot->arch.rmap)
1616 return -ENOMEM;
1617 slot->arch.slot_phys = NULL;
1618
1619 return 0;
1300} 1620}
1301 1621
1302int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1622int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1303 struct kvm_userspace_memory_region *mem) 1623 struct kvm_memory_slot *memslot,
1624 struct kvm_userspace_memory_region *mem)
1304{ 1625{
1305 unsigned long npages;
1306 unsigned long *phys; 1626 unsigned long *phys;
1307 1627
1308 /* Allocate a slot_phys array */ 1628 /* Allocate a slot_phys array if needed */
1309 phys = kvm->arch.slot_phys[mem->slot]; 1629 phys = memslot->arch.slot_phys;
1310 if (!kvm->arch.using_mmu_notifiers && !phys) { 1630 if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) {
1311 npages = mem->memory_size >> PAGE_SHIFT; 1631 phys = vzalloc(memslot->npages * sizeof(unsigned long));
1312 phys = vzalloc(npages * sizeof(unsigned long));
1313 if (!phys) 1632 if (!phys)
1314 return -ENOMEM; 1633 return -ENOMEM;
1315 kvm->arch.slot_phys[mem->slot] = phys; 1634 memslot->arch.slot_phys = phys;
1316 kvm->arch.slot_npages[mem->slot] = npages;
1317 } 1635 }
1318 1636
1319 return 0; 1637 return 0;
1320} 1638}
1321 1639
1322static void unpin_slot(struct kvm *kvm, int slot_id) 1640void kvmppc_core_commit_memory_region(struct kvm *kvm,
1641 struct kvm_userspace_memory_region *mem,
1642 struct kvm_memory_slot old)
1323{ 1643{
1324 unsigned long *physp; 1644 unsigned long npages = mem->memory_size >> PAGE_SHIFT;
1325 unsigned long j, npages, pfn; 1645 struct kvm_memory_slot *memslot;
1326 struct page *page;
1327 1646
1328 physp = kvm->arch.slot_phys[slot_id]; 1647 if (npages && old.npages) {
1329 npages = kvm->arch.slot_npages[slot_id]; 1648 /*
1330 if (physp) { 1649 * If modifying a memslot, reset all the rmap dirty bits.
1331 spin_lock(&kvm->arch.slot_phys_lock); 1650 * If this is a new memslot, we don't need to do anything
1332 for (j = 0; j < npages; j++) { 1651 * since the rmap array starts out as all zeroes,
1333 if (!(physp[j] & KVMPPC_GOT_PAGE)) 1652 * i.e. no pages are dirty.
1334 continue; 1653 */
1335 pfn = physp[j] >> PAGE_SHIFT; 1654 memslot = id_to_memslot(kvm->memslots, mem->slot);
1336 page = pfn_to_page(pfn); 1655 kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
1337 SetPageDirty(page);
1338 put_page(page);
1339 }
1340 kvm->arch.slot_phys[slot_id] = NULL;
1341 spin_unlock(&kvm->arch.slot_phys_lock);
1342 vfree(physp);
1343 } 1656 }
1344} 1657}
1345 1658
1346void kvmppc_core_commit_memory_region(struct kvm *kvm,
1347 struct kvm_userspace_memory_region *mem)
1348{
1349}
1350
1351static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) 1659static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1352{ 1660{
1353 int err = 0; 1661 int err = 0;
@@ -1362,6 +1670,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1362 unsigned long rmls; 1670 unsigned long rmls;
1363 unsigned long *physp; 1671 unsigned long *physp;
1364 unsigned long i, npages; 1672 unsigned long i, npages;
1673 int srcu_idx;
1365 1674
1366 mutex_lock(&kvm->lock); 1675 mutex_lock(&kvm->lock);
1367 if (kvm->arch.rma_setup_done) 1676 if (kvm->arch.rma_setup_done)
@@ -1377,12 +1686,13 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1377 } 1686 }
1378 1687
1379 /* Look up the memslot for guest physical address 0 */ 1688 /* Look up the memslot for guest physical address 0 */
1689 srcu_idx = srcu_read_lock(&kvm->srcu);
1380 memslot = gfn_to_memslot(kvm, 0); 1690 memslot = gfn_to_memslot(kvm, 0);
1381 1691
1382 /* We must have some memory at 0 by now */ 1692 /* We must have some memory at 0 by now */
1383 err = -EINVAL; 1693 err = -EINVAL;
1384 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1694 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1385 goto out; 1695 goto out_srcu;
1386 1696
1387 /* Look up the VMA for the start of this memory slot */ 1697 /* Look up the VMA for the start of this memory slot */
1388 hva = memslot->userspace_addr; 1698 hva = memslot->userspace_addr;
@@ -1406,14 +1716,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1406 err = -EPERM; 1716 err = -EPERM;
1407 if (cpu_has_feature(CPU_FTR_ARCH_201)) { 1717 if (cpu_has_feature(CPU_FTR_ARCH_201)) {
1408 pr_err("KVM: CPU requires an RMO\n"); 1718 pr_err("KVM: CPU requires an RMO\n");
1409 goto out; 1719 goto out_srcu;
1410 } 1720 }
1411 1721
1412 /* We can handle 4k, 64k or 16M pages in the VRMA */ 1722 /* We can handle 4k, 64k or 16M pages in the VRMA */
1413 err = -EINVAL; 1723 err = -EINVAL;
1414 if (!(psize == 0x1000 || psize == 0x10000 || 1724 if (!(psize == 0x1000 || psize == 0x10000 ||
1415 psize == 0x1000000)) 1725 psize == 0x1000000))
1416 goto out; 1726 goto out_srcu;
1417 1727
1418 /* Update VRMASD field in the LPCR */ 1728 /* Update VRMASD field in the LPCR */
1419 senc = slb_pgsize_encoding(psize); 1729 senc = slb_pgsize_encoding(psize);
@@ -1436,7 +1746,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1436 err = -EINVAL; 1746 err = -EINVAL;
1437 if (rmls < 0) { 1747 if (rmls < 0) {
1438 pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); 1748 pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
1439 goto out; 1749 goto out_srcu;
1440 } 1750 }
1441 atomic_inc(&ri->use_count); 1751 atomic_inc(&ri->use_count);
1442 kvm->arch.rma = ri; 1752 kvm->arch.rma = ri;
@@ -1465,17 +1775,24 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1465 /* Initialize phys addrs of pages in RMO */ 1775 /* Initialize phys addrs of pages in RMO */
1466 npages = ri->npages; 1776 npages = ri->npages;
1467 porder = __ilog2(npages); 1777 porder = __ilog2(npages);
1468 physp = kvm->arch.slot_phys[memslot->id]; 1778 physp = memslot->arch.slot_phys;
1469 spin_lock(&kvm->arch.slot_phys_lock); 1779 if (physp) {
1470 for (i = 0; i < npages; ++i) 1780 if (npages > memslot->npages)
1471 physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder; 1781 npages = memslot->npages;
1472 spin_unlock(&kvm->arch.slot_phys_lock); 1782 spin_lock(&kvm->arch.slot_phys_lock);
1783 for (i = 0; i < npages; ++i)
1784 physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) +
1785 porder;
1786 spin_unlock(&kvm->arch.slot_phys_lock);
1787 }
1473 } 1788 }
1474 1789
1475 /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ 1790 /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
1476 smp_wmb(); 1791 smp_wmb();
1477 kvm->arch.rma_setup_done = 1; 1792 kvm->arch.rma_setup_done = 1;
1478 err = 0; 1793 err = 0;
1794 out_srcu:
1795 srcu_read_unlock(&kvm->srcu, srcu_idx);
1479 out: 1796 out:
1480 mutex_unlock(&kvm->lock); 1797 mutex_unlock(&kvm->lock);
1481 return err; 1798 return err;
@@ -1496,6 +1813,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1496 return -ENOMEM; 1813 return -ENOMEM;
1497 kvm->arch.lpid = lpid; 1814 kvm->arch.lpid = lpid;
1498 1815
1816 /*
1817 * Since we don't flush the TLB when tearing down a VM,
1818 * and this lpid might have previously been used,
1819 * make sure we flush on each core before running the new VM.
1820 */
1821 cpumask_setall(&kvm->arch.need_tlb_flush);
1822
1499 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1823 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
1500 1824
1501 kvm->arch.rma = NULL; 1825 kvm->arch.rma = NULL;
@@ -1523,16 +1847,19 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1523 1847
1524 kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); 1848 kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
1525 spin_lock_init(&kvm->arch.slot_phys_lock); 1849 spin_lock_init(&kvm->arch.slot_phys_lock);
1850
1851 /*
1852 * Don't allow secondary CPU threads to come online
1853 * while any KVM VMs exist.
1854 */
1855 inhibit_secondary_onlining();
1856
1526 return 0; 1857 return 0;
1527} 1858}
1528 1859
1529void kvmppc_core_destroy_vm(struct kvm *kvm) 1860void kvmppc_core_destroy_vm(struct kvm *kvm)
1530{ 1861{
1531 unsigned long i; 1862 uninhibit_secondary_onlining();
1532
1533 if (!kvm->arch.using_mmu_notifiers)
1534 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
1535 unpin_slot(kvm, i);
1536 1863
1537 if (kvm->arch.rma) { 1864 if (kvm->arch.rma) {
1538 kvm_release_rma(kvm->arch.rma); 1865 kvm_release_rma(kvm->arch.rma);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fb4eac290fef..ec0a9e5de100 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -157,8 +157,8 @@ static void __init kvm_linear_init_one(ulong size, int count, int type)
157 linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); 157 linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
158 for (i = 0; i < count; ++i) { 158 for (i = 0; i < count; ++i) {
159 linear = alloc_bootmem_align(size, size); 159 linear = alloc_bootmem_align(size, size);
160 pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, 160 pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
161 size >> 20); 161 size >> 20);
162 linear_info[i].base_virt = linear; 162 linear_info[i].base_virt = linear;
163 linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; 163 linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
164 linear_info[i].npages = npages; 164 linear_info[i].npages = npages;
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
new file mode 100644
index 000000000000..35f3cf0269b3
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -0,0 +1,144 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
7 */
8
9#include <linux/types.h>
10#include <linux/string.h>
11#include <linux/kvm.h>
12#include <linux/kvm_host.h>
13#include <linux/kernel.h>
14#include <asm/opal.h>
15
16/* SRR1 bits for machine check on POWER7 */
17#define SRR1_MC_LDSTERR (1ul << (63-42))
18#define SRR1_MC_IFETCH_SH (63-45)
19#define SRR1_MC_IFETCH_MASK 0x7
20#define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */
21#define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */
22#define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */
23#define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */
24
25/* DSISR bits for machine check on POWER7 */
26#define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */
27#define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */
28#define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */
29#define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */
30#define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */
31
32/* POWER7 SLB flush and reload */
33static void reload_slb(struct kvm_vcpu *vcpu)
34{
35 struct slb_shadow *slb;
36 unsigned long i, n;
37
38 /* First clear out SLB */
39 asm volatile("slbmte %0,%0; slbia" : : "r" (0));
40
41 /* Do they have an SLB shadow buffer registered? */
42 slb = vcpu->arch.slb_shadow.pinned_addr;
43 if (!slb)
44 return;
45
46 /* Sanity check */
47 n = min_t(u32, slb->persistent, SLB_MIN_SIZE);
48 if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end)
49 return;
50
51 /* Load up the SLB from that */
52 for (i = 0; i < n; ++i) {
53 unsigned long rb = slb->save_area[i].esid;
54 unsigned long rs = slb->save_area[i].vsid;
55
56 rb = (rb & ~0xFFFul) | i; /* insert entry number */
57 asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
58 }
59}
60
61/* POWER7 TLB flush */
62static void flush_tlb_power7(struct kvm_vcpu *vcpu)
63{
64 unsigned long i, rb;
65
66 rb = TLBIEL_INVAL_SET_LPID;
67 for (i = 0; i < POWER7_TLB_SETS; ++i) {
68 asm volatile("tlbiel %0" : : "r" (rb));
69 rb += 1 << TLBIEL_INVAL_SET_SHIFT;
70 }
71}
72
73/*
74 * On POWER7, see if we can handle a machine check that occurred inside
75 * the guest in real mode, without switching to the host partition.
76 *
77 * Returns: 0 => exit guest, 1 => deliver machine check to guest
78 */
79static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
80{
81 unsigned long srr1 = vcpu->arch.shregs.msr;
82 struct opal_machine_check_event *opal_evt;
83 long handled = 1;
84
85 if (srr1 & SRR1_MC_LDSTERR) {
86 /* error on load/store */
87 unsigned long dsisr = vcpu->arch.shregs.dsisr;
88
89 if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
90 DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) {
91 /* flush and reload SLB; flushes D-ERAT too */
92 reload_slb(vcpu);
93 dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
94 DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI);
95 }
96 if (dsisr & DSISR_MC_TLB_MULTI) {
97 flush_tlb_power7(vcpu);
98 dsisr &= ~DSISR_MC_TLB_MULTI;
99 }
100 /* Any other errors we don't understand? */
101 if (dsisr & 0xffffffffUL)
102 handled = 0;
103 }
104
105 switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) {
106 case 0:
107 break;
108 case SRR1_MC_IFETCH_SLBPAR:
109 case SRR1_MC_IFETCH_SLBMULTI:
110 case SRR1_MC_IFETCH_SLBPARMULTI:
111 reload_slb(vcpu);
112 break;
113 case SRR1_MC_IFETCH_TLBMULTI:
114 flush_tlb_power7(vcpu);
115 break;
116 default:
117 handled = 0;
118 }
119
120 /*
121 * See if OPAL has already handled the condition.
122 * We assume that if the condition is recovered then OPAL
123 * will have generated an error log event that we will pick
124 * up and log later.
125 */
126 opal_evt = local_paca->opal_mc_evt;
127 if (opal_evt->version == OpalMCE_V1 &&
128 (opal_evt->severity == OpalMCE_SEV_NO_ERROR ||
129 opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED))
130 handled = 1;
131
132 if (handled)
133 opal_evt->in_use = 0;
134
135 return handled;
136}
137
138long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
139{
140 if (cpu_has_feature(CPU_FTR_ARCH_206))
141 return kvmppc_realmode_mc_power7(vcpu);
142
143 return 0;
144}
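
The file above decides, purely from SRR1 and DSISR bits, whether a machine check taken while the guest was running can be recovered in real mode (by reloading the SLB or flushing the TLB) or has to be passed along. As a rough illustration of the SRR1 instruction-fetch decode only, here is a small standalone C sketch reusing the shift/mask constants defined above; the enum of actions and the main() driver are inventions for the example, not kernel API.

#include <stdio.h>

/* Same SRR1 layout as in book3s_hv_ras.c: bits describing ifetch machine checks */
#define SRR1_MC_IFETCH_SH          (63 - 45)
#define SRR1_MC_IFETCH_MASK        0x7
#define SRR1_MC_IFETCH_SLBPAR      2   /* SLB parity error */
#define SRR1_MC_IFETCH_SLBMULTI    3   /* SLB multi-hit */
#define SRR1_MC_IFETCH_SLBPARMULTI 4   /* SLB parity + multi-hit */
#define SRR1_MC_IFETCH_TLBMULTI    5   /* I-TLB multi-hit */

enum mc_action { MC_NONE, MC_RELOAD_SLB, MC_FLUSH_TLB, MC_UNHANDLED };

static enum mc_action decode_ifetch_mc(unsigned long srr1)
{
	switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) {
	case 0:
		return MC_NONE;        /* no ifetch error reported */
	case SRR1_MC_IFETCH_SLBPAR:
	case SRR1_MC_IFETCH_SLBMULTI:
	case SRR1_MC_IFETCH_SLBPARMULTI:
		return MC_RELOAD_SLB;  /* recoverable: flush and reload the SLB */
	case SRR1_MC_IFETCH_TLBMULTI:
		return MC_FLUSH_TLB;   /* recoverable: invalidate the TLB */
	default:
		return MC_UNHANDLED;   /* unknown cause: let the host handle it */
	}
}

int main(void)
{
	unsigned long srr1 = (unsigned long)SRR1_MC_IFETCH_SLBMULTI << SRR1_MC_IFETCH_SH;

	printf("action = %d\n", decode_ifetch_mc(srr1));  /* prints 1 (MC_RELOAD_SLB) */
	return 0;
}
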
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fb0e821622d4..19c93bae1aea 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
35 return __va(addr); 35 return __va(addr);
36} 36}
37 37
38/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
39static int global_invalidates(struct kvm *kvm, unsigned long flags)
40{
41 int global;
42
43 /*
44 * If there is only one vcore, and it's currently running,
45 * we can use tlbiel as long as we mark all other physical
46 * cores as potentially having stale TLB entries for this lpid.
47 * If we're not using MMU notifiers, we never take pages away
48 * from the guest, so we can use tlbiel if requested.
49 * Otherwise, don't use tlbiel.
50 */
51 if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
52 global = 0;
53 else if (kvm->arch.using_mmu_notifiers)
54 global = 1;
55 else
56 global = !(flags & H_LOCAL);
57
58 if (!global) {
59 /* any other core might now have stale TLB entries... */
60 smp_wmb();
61 cpumask_setall(&kvm->arch.need_tlb_flush);
62 cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
63 &kvm->arch.need_tlb_flush);
64 }
65
66 return global;
67}
68
38/* 69/*
39 * Add this HPTE into the chain for the real page. 70 * Add this HPTE into the chain for the real page.
40 * Must be called with the chain locked; it unlocks the chain. 71 * Must be called with the chain locked; it unlocks the chain.
@@ -59,13 +90,24 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
59 head->back = pte_index; 90 head->back = pte_index;
60 } else { 91 } else {
61 rev->forw = rev->back = pte_index; 92 rev->forw = rev->back = pte_index;
62 i = pte_index; 93 *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
94 pte_index | KVMPPC_RMAP_PRESENT;
63 } 95 }
64 smp_wmb(); 96 unlock_rmap(rmap);
65 *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
66} 97}
67EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); 98EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
68 99
100/*
101 * Note modification of an HPTE; set the HPTE modified bit
102 * if anyone is interested.
103 */
104static inline void note_hpte_modification(struct kvm *kvm,
105 struct revmap_entry *rev)
106{
107 if (atomic_read(&kvm->arch.hpte_mod_interest))
108 rev->guest_rpte |= HPTE_GR_MODIFIED;
109}
110
69/* Remove this HPTE from the chain for a real page */ 111/* Remove this HPTE from the chain for a real page */
70static void remove_revmap_chain(struct kvm *kvm, long pte_index, 112static void remove_revmap_chain(struct kvm *kvm, long pte_index,
71 struct revmap_entry *rev, 113 struct revmap_entry *rev,
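
note_hpte_modification() added above keeps the fast path cheap: the HPTE_GR_MODIFIED bit is only written into the guest view of an HPTE when some consumer has registered interest through kvm->arch.hpte_mod_interest, so the common case costs a single atomic read. A user-space sketch of that pattern, with made-up names standing in for the kvm structures:

#include <stdatomic.h>
#include <stdio.h>

#define GR_MODIFIED (1UL << 62)   /* illustrative flag playing the role of HPTE_GR_MODIFIED */

struct hpt_entry {
	unsigned long guest_rpte;
};

static atomic_int mod_interest;   /* stands in for kvm->arch.hpte_mod_interest */

/* Cheap when nobody cares: one atomic load, no store unless interest is registered */
static void note_modification(struct hpt_entry *e)
{
	if (atomic_load(&mod_interest))
		e->guest_rpte |= GR_MODIFIED;
}

int main(void)
{
	struct hpt_entry e = { .guest_rpte = 0x1234 };

	note_modification(&e);                 /* no reader registered: flag stays clear */
	atomic_fetch_add(&mod_interest, 1);    /* a reader of the HPT registers interest */
	note_modification(&e);                 /* now the modification is recorded */
	printf("modified = %lu\n", (e.guest_rpte & GR_MODIFIED) ? 1UL : 0UL);
	return 0;
}
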
@@ -81,7 +123,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
81 ptel = rev->guest_rpte |= rcbits; 123 ptel = rev->guest_rpte |= rcbits;
82 gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); 124 gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
83 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); 125 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
84 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 126 if (!memslot)
85 return; 127 return;
86 128
87 rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); 129 rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
@@ -103,14 +145,14 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
103 unlock_rmap(rmap); 145 unlock_rmap(rmap);
104} 146}
105 147
106static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, 148static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
107 int writing, unsigned long *pte_sizep) 149 int writing, unsigned long *pte_sizep)
108{ 150{
109 pte_t *ptep; 151 pte_t *ptep;
110 unsigned long ps = *pte_sizep; 152 unsigned long ps = *pte_sizep;
111 unsigned int shift; 153 unsigned int shift;
112 154
113 ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); 155 ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
114 if (!ptep) 156 if (!ptep)
115 return __pte(0); 157 return __pte(0);
116 if (shift) 158 if (shift)
@@ -130,15 +172,15 @@ static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
130 hpte[0] = hpte_v; 172 hpte[0] = hpte_v;
131} 173}
132 174
133long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 175long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
134 long pte_index, unsigned long pteh, unsigned long ptel) 176 long pte_index, unsigned long pteh, unsigned long ptel,
177 pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
135{ 178{
136 struct kvm *kvm = vcpu->kvm;
137 unsigned long i, pa, gpa, gfn, psize; 179 unsigned long i, pa, gpa, gfn, psize;
138 unsigned long slot_fn, hva; 180 unsigned long slot_fn, hva;
139 unsigned long *hpte; 181 unsigned long *hpte;
140 struct revmap_entry *rev; 182 struct revmap_entry *rev;
141 unsigned long g_ptel = ptel; 183 unsigned long g_ptel;
142 struct kvm_memory_slot *memslot; 184 struct kvm_memory_slot *memslot;
143 unsigned long *physp, pte_size; 185 unsigned long *physp, pte_size;
144 unsigned long is_io; 186 unsigned long is_io;
@@ -147,13 +189,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
147 unsigned int writing; 189 unsigned int writing;
148 unsigned long mmu_seq; 190 unsigned long mmu_seq;
149 unsigned long rcbits; 191 unsigned long rcbits;
150 bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING;
151 192
152 psize = hpte_page_size(pteh, ptel); 193 psize = hpte_page_size(pteh, ptel);
153 if (!psize) 194 if (!psize)
154 return H_PARAMETER; 195 return H_PARAMETER;
155 writing = hpte_is_writable(ptel); 196 writing = hpte_is_writable(ptel);
156 pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); 197 pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
198 ptel &= ~HPTE_GR_RESERVED;
199 g_ptel = ptel;
157 200
158 /* used later to detect if we might have been invalidated */ 201 /* used later to detect if we might have been invalidated */
159 mmu_seq = kvm->mmu_notifier_seq; 202 mmu_seq = kvm->mmu_notifier_seq;
@@ -183,7 +226,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
183 rmap = &memslot->arch.rmap[slot_fn]; 226 rmap = &memslot->arch.rmap[slot_fn];
184 227
185 if (!kvm->arch.using_mmu_notifiers) { 228 if (!kvm->arch.using_mmu_notifiers) {
186 physp = kvm->arch.slot_phys[memslot->id]; 229 physp = memslot->arch.slot_phys;
187 if (!physp) 230 if (!physp)
188 return H_PARAMETER; 231 return H_PARAMETER;
189 physp += slot_fn; 232 physp += slot_fn;
@@ -201,7 +244,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
201 244
202 /* Look up the Linux PTE for the backing page */ 245 /* Look up the Linux PTE for the backing page */
203 pte_size = psize; 246 pte_size = psize;
204 pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); 247 pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
205 if (pte_present(pte)) { 248 if (pte_present(pte)) {
206 if (writing && !pte_write(pte)) 249 if (writing && !pte_write(pte))
207 /* make the actual HPTE be read-only */ 250 /* make the actual HPTE be read-only */
@@ -210,6 +253,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
210 pa = pte_pfn(pte) << PAGE_SHIFT; 253 pa = pte_pfn(pte) << PAGE_SHIFT;
211 } 254 }
212 } 255 }
256
213 if (pte_size < psize) 257 if (pte_size < psize)
214 return H_PARAMETER; 258 return H_PARAMETER;
215 if (pa && pte_size > psize) 259 if (pa && pte_size > psize)
@@ -287,8 +331,10 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
287 rev = &kvm->arch.revmap[pte_index]; 331 rev = &kvm->arch.revmap[pte_index];
288 if (realmode) 332 if (realmode)
289 rev = real_vmalloc_addr(rev); 333 rev = real_vmalloc_addr(rev);
290 if (rev) 334 if (rev) {
291 rev->guest_rpte = g_ptel; 335 rev->guest_rpte = g_ptel;
336 note_hpte_modification(kvm, rev);
337 }
292 338
293 /* Link HPTE into reverse-map chain */ 339 /* Link HPTE into reverse-map chain */
294 if (pteh & HPTE_V_VALID) { 340 if (pteh & HPTE_V_VALID) {
@@ -297,7 +343,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
297 lock_rmap(rmap); 343 lock_rmap(rmap);
298 /* Check for pending invalidations under the rmap chain lock */ 344 /* Check for pending invalidations under the rmap chain lock */
299 if (kvm->arch.using_mmu_notifiers && 345 if (kvm->arch.using_mmu_notifiers &&
300 mmu_notifier_retry(vcpu, mmu_seq)) { 346 mmu_notifier_retry(kvm, mmu_seq)) {
301 /* inval in progress, write a non-present HPTE */ 347 /* inval in progress, write a non-present HPTE */
302 pteh |= HPTE_V_ABSENT; 348 pteh |= HPTE_V_ABSENT;
303 pteh &= ~HPTE_V_VALID; 349 pteh &= ~HPTE_V_VALID;
@@ -318,10 +364,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
318 hpte[0] = pteh; 364 hpte[0] = pteh;
319 asm volatile("ptesync" : : : "memory"); 365 asm volatile("ptesync" : : : "memory");
320 366
321 vcpu->arch.gpr[4] = pte_index; 367 *pte_idx_ret = pte_index;
322 return H_SUCCESS; 368 return H_SUCCESS;
323} 369}
324EXPORT_SYMBOL_GPL(kvmppc_h_enter); 370EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);
371
372long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
373 long pte_index, unsigned long pteh, unsigned long ptel)
374{
375 return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
376 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
377}
325 378
326#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) 379#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
327 380
@@ -343,11 +396,10 @@ static inline int try_lock_tlbie(unsigned int *lock)
343 return old == 0; 396 return old == 0;
344} 397}
345 398
346long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, 399long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
347 unsigned long pte_index, unsigned long avpn, 400 unsigned long pte_index, unsigned long avpn,
348 unsigned long va) 401 unsigned long *hpret)
349{ 402{
350 struct kvm *kvm = vcpu->kvm;
351 unsigned long *hpte; 403 unsigned long *hpte;
352 unsigned long v, r, rb; 404 unsigned long v, r, rb;
353 struct revmap_entry *rev; 405 struct revmap_entry *rev;
@@ -369,7 +421,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
369 if (v & HPTE_V_VALID) { 421 if (v & HPTE_V_VALID) {
370 hpte[0] &= ~HPTE_V_VALID; 422 hpte[0] &= ~HPTE_V_VALID;
371 rb = compute_tlbie_rb(v, hpte[1], pte_index); 423 rb = compute_tlbie_rb(v, hpte[1], pte_index);
372 if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { 424 if (global_invalidates(kvm, flags)) {
373 while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) 425 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
374 cpu_relax(); 426 cpu_relax();
375 asm volatile("ptesync" : : : "memory"); 427 asm volatile("ptesync" : : : "memory");
@@ -385,13 +437,22 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
385 /* Read PTE low word after tlbie to get final R/C values */ 437 /* Read PTE low word after tlbie to get final R/C values */
386 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); 438 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
387 } 439 }
388 r = rev->guest_rpte; 440 r = rev->guest_rpte & ~HPTE_GR_RESERVED;
441 note_hpte_modification(kvm, rev);
389 unlock_hpte(hpte, 0); 442 unlock_hpte(hpte, 0);
390 443
391 vcpu->arch.gpr[4] = v; 444 hpret[0] = v;
392 vcpu->arch.gpr[5] = r; 445 hpret[1] = r;
393 return H_SUCCESS; 446 return H_SUCCESS;
394} 447}
448EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
449
450long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
451 unsigned long pte_index, unsigned long avpn)
452{
453 return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
454 &vcpu->arch.gpr[4]);
455}
395 456
396long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) 457long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
397{ 458{
@@ -459,6 +520,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
459 520
460 args[j] = ((0x80 | flags) << 56) + pte_index; 521 args[j] = ((0x80 | flags) << 56) + pte_index;
461 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 522 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
523 note_hpte_modification(kvm, rev);
462 524
463 if (!(hp[0] & HPTE_V_VALID)) { 525 if (!(hp[0] & HPTE_V_VALID)) {
464 /* insert R and C bits from PTE */ 526 /* insert R and C bits from PTE */
@@ -534,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
534 return H_NOT_FOUND; 596 return H_NOT_FOUND;
535 } 597 }
536 598
537 if (atomic_read(&kvm->online_vcpus) == 1)
538 flags |= H_LOCAL;
539 v = hpte[0]; 599 v = hpte[0];
540 bits = (flags << 55) & HPTE_R_PP0; 600 bits = (flags << 55) & HPTE_R_PP0;
541 bits |= (flags << 48) & HPTE_R_KEY_HI; 601 bits |= (flags << 48) & HPTE_R_KEY_HI;
@@ -548,6 +608,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
548 if (rev) { 608 if (rev) {
549 r = (rev->guest_rpte & ~mask) | bits; 609 r = (rev->guest_rpte & ~mask) | bits;
550 rev->guest_rpte = r; 610 rev->guest_rpte = r;
611 note_hpte_modification(kvm, rev);
551 } 612 }
552 r = (hpte[1] & ~mask) | bits; 613 r = (hpte[1] & ~mask) | bits;
553 614
@@ -555,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
555 if (v & HPTE_V_VALID) { 616 if (v & HPTE_V_VALID) {
556 rb = compute_tlbie_rb(v, r, pte_index); 617 rb = compute_tlbie_rb(v, r, pte_index);
557 hpte[0] = v & ~HPTE_V_VALID; 618 hpte[0] = v & ~HPTE_V_VALID;
558 if (!(flags & H_LOCAL)) { 619 if (global_invalidates(kvm, flags)) {
559 while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) 620 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
560 cpu_relax(); 621 cpu_relax();
561 asm volatile("ptesync" : : : "memory"); 622 asm volatile("ptesync" : : : "memory");
@@ -568,6 +629,28 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
568 asm volatile("tlbiel %0" : : "r" (rb)); 629 asm volatile("tlbiel %0" : : "r" (rb));
569 asm volatile("ptesync" : : : "memory"); 630 asm volatile("ptesync" : : : "memory");
570 } 631 }
632 /*
633 * If the host has this page as readonly but the guest
634 * wants to make it read/write, reduce the permissions.
635 * Checking the host permissions involves finding the
636 * memslot and then the Linux PTE for the page.
637 */
638 if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
639 unsigned long psize, gfn, hva;
640 struct kvm_memory_slot *memslot;
641 pgd_t *pgdir = vcpu->arch.pgdir;
642 pte_t pte;
643
644 psize = hpte_page_size(v, r);
645 gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
646 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
647 if (memslot) {
648 hva = __gfn_to_hva_memslot(memslot, gfn);
649 pte = lookup_linux_pte(pgdir, hva, 1, &psize);
650 if (pte_present(pte) && !pte_write(pte))
651 r = hpte_make_readonly(r);
652 }
653 }
571 } 654 }
572 hpte[1] = r; 655 hpte[1] = r;
573 eieio(); 656 eieio();
@@ -599,8 +682,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
599 v &= ~HPTE_V_ABSENT; 682 v &= ~HPTE_V_ABSENT;
600 v |= HPTE_V_VALID; 683 v |= HPTE_V_VALID;
601 } 684 }
602 if (v & HPTE_V_VALID) 685 if (v & HPTE_V_VALID) {
603 r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); 686 r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
687 r &= ~HPTE_GR_RESERVED;
688 }
604 vcpu->arch.gpr[4 + i * 2] = v; 689 vcpu->arch.gpr[4 + i * 2] = v;
605 vcpu->arch.gpr[5 + i * 2] = r; 690 vcpu->arch.gpr[5 + i * 2] = r;
606 } 691 }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 74a24bbb9637..10b6c358dd77 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -27,6 +27,7 @@
27#include <asm/asm-offsets.h> 27#include <asm/asm-offsets.h>
28#include <asm/exception-64s.h> 28#include <asm/exception-64s.h>
29#include <asm/kvm_book3s_asm.h> 29#include <asm/kvm_book3s_asm.h>
30#include <asm/mmu-hash64.h>
30 31
31/***************************************************************************** 32/*****************************************************************************
32 * * 33 * *
@@ -134,8 +135,11 @@ kvm_start_guest:
134 135
13527: /* XXX should handle hypervisor maintenance interrupts etc. here */ 13627: /* XXX should handle hypervisor maintenance interrupts etc. here */
136 137
138 /* reload vcpu pointer after clearing the IPI */
139 ld r4,HSTATE_KVM_VCPU(r13)
140 cmpdi r4,0
137 /* if we have no vcpu to run, go back to sleep */ 141 /* if we have no vcpu to run, go back to sleep */
138 beq cr1,kvm_no_guest 142 beq kvm_no_guest
139 143
140 /* were we napping due to cede? */ 144 /* were we napping due to cede? */
141 lbz r0,HSTATE_NAPPING(r13) 145 lbz r0,HSTATE_NAPPING(r13)
@@ -310,7 +314,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
310 mtspr SPRN_SDR1,r6 /* switch to partition page table */ 314 mtspr SPRN_SDR1,r6 /* switch to partition page table */
311 mtspr SPRN_LPID,r7 315 mtspr SPRN_LPID,r7
312 isync 316 isync
317
318 /* See if we need to flush the TLB */
319 lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */
320 clrldi r7,r6,64-6 /* extract bit number (6 bits) */
321 srdi r6,r6,6 /* doubleword number */
322 sldi r6,r6,3 /* address offset */
323 add r6,r6,r9
324 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */
313 li r0,1 325 li r0,1
326 sld r0,r0,r7
327 ld r7,0(r6)
328 and. r7,r7,r0
329 beq 22f
33023: ldarx r7,0,r6 /* if set, clear the bit */
331 andc r7,r7,r0
332 stdcx. r7,0,r6
333 bne 23b
334 li r6,128 /* and flush the TLB */
335 mtctr r6
336 li r7,0x800 /* IS field = 0b10 */
337 ptesync
33828: tlbiel r7
339 addi r7,r7,0x1000
340 bdnz 28b
341 ptesync
342
34322: li r0,1
314 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ 344 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */
315 b 10f 345 b 10f
316 346
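
The instructions added in this hunk are the consumer side of the need_tlb_flush cpumask that global_invalidates() fills in (see the book3s_hv_rm_mmu.c hunk earlier in this patch): on partition entry each physical core tests and clears its own bit and, if it was set, issues 128 tlbiel operations to wipe any stale partition-scoped TLB entries. A rough C rendering of what the assembly does, with the actual tlbiel sequence reduced to a stub loop:

#include <stdio.h>

#define NR_CPUS       64
#define TLB_SETS      128                 /* matches the 128 tlbiel iterations in the asm */
#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long need_tlb_flush[NR_CPUS / BITS_PER_LONG];

static void flush_local_tlb(void)
{
	/* stands in for: ptesync; 128 x tlbiel with IS=0b10; ptesync */
	for (int set = 0; set < TLB_SETS; set++)
		;
}

/* What the guest-entry path does for the current physical cpu */
static void maybe_flush_on_entry(unsigned int cpu)
{
	unsigned long *word = &need_tlb_flush[cpu / BITS_PER_LONG];
	unsigned long bit = 1UL << (cpu % BITS_PER_LONG);

	if (*word & bit) {            /* the ld + and. test in the asm */
		*word &= ~bit;        /* the ldarx/andc/stdcx. loop in the asm */
		flush_local_tlb();
		printf("cpu %u flushed stale TLB entries\n", cpu);
	}
}

int main(void)
{
	need_tlb_flush[0] |= 1UL << 5;    /* global_invalidates() marked cpu 5 as stale */
	maybe_flush_on_entry(5);
	maybe_flush_on_entry(5);          /* second entry: bit already clear, no flush */
	return 0;
}
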
@@ -333,36 +363,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
333 mr r9,r4 363 mr r9,r4
334 blt hdec_soon 364 blt hdec_soon
335 365
336 /*
337 * Invalidate the TLB if we could possibly have stale TLB
338 * entries for this partition on this core due to the use
339 * of tlbiel.
340 * XXX maybe only need this on primary thread?
341 */
342 ld r9,VCPU_KVM(r4) /* pointer to struct kvm */
343 lwz r5,VCPU_VCPUID(r4)
344 lhz r6,PACAPACAINDEX(r13)
345 rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */
346 lhz r8,VCPU_LAST_CPU(r4)
347 sldi r7,r6,1 /* see if this is the same vcpu */
348 add r7,r7,r9 /* as last ran on this pcpu */
349 lhz r0,KVM_LAST_VCPU(r7)
350 cmpw r6,r8 /* on the same cpu core as last time? */
351 bne 3f
352 cmpw r0,r5 /* same vcpu as this core last ran? */
353 beq 1f
3543: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */
355 sth r5,KVM_LAST_VCPU(r7)
356 li r6,128
357 mtctr r6
358 li r7,0x800 /* IS field = 0b10 */
359 ptesync
3602: tlbiel r7
361 addi r7,r7,0x1000
362 bdnz 2b
363 ptesync
3641:
365
366 /* Save purr/spurr */ 366 /* Save purr/spurr */
367 mfspr r5,SPRN_PURR 367 mfspr r5,SPRN_PURR
368 mfspr r6,SPRN_SPURR 368 mfspr r6,SPRN_SPURR
@@ -679,8 +679,7 @@ BEGIN_FTR_SECTION
6791: 6791:
680END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 680END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
681 681
682nohpte_cont: 682guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
683hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
684 /* Save DEC */ 683 /* Save DEC */
685 mfspr r5,SPRN_DEC 684 mfspr r5,SPRN_DEC
686 mftb r6 685 mftb r6
@@ -701,6 +700,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
701 std r6, VCPU_FAULT_DAR(r9) 700 std r6, VCPU_FAULT_DAR(r9)
702 stw r7, VCPU_FAULT_DSISR(r9) 701 stw r7, VCPU_FAULT_DSISR(r9)
703 702
703 /* See if it is a machine check */
704 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
705 beq machine_check_realmode
706mc_cont:
707
704 /* Save guest CTRL register, set runlatch to 1 */ 708 /* Save guest CTRL register, set runlatch to 1 */
7056: mfspr r6,SPRN_CTRLF 7096: mfspr r6,SPRN_CTRLF
706 stw r6,VCPU_CTRL(r9) 710 stw r6,VCPU_CTRL(r9)
@@ -1113,38 +1117,41 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
1113 /* 1117 /*
1114 * For external and machine check interrupts, we need 1118 * For external and machine check interrupts, we need
1115 * to call the Linux handler to process the interrupt. 1119 * to call the Linux handler to process the interrupt.
1116 * We do that by jumping to the interrupt vector address 1120 * We do that by jumping to absolute address 0x500 for
1117 * which we have in r12. The [h]rfid at the end of the 1121 * external interrupts, or the machine_check_fwnmi label
1122 * for machine checks (since firmware might have patched
1123 * the vector area at 0x200). The [h]rfid at the end of the
1118 * handler will return to the book3s_hv_interrupts.S code. 1124 * handler will return to the book3s_hv_interrupts.S code.
1119 * For other interrupts we do the rfid to get back 1125 * For other interrupts we do the rfid to get back
1120 * to the book3s_interrupts.S code here. 1126 * to the book3s_hv_interrupts.S code here.
1121 */ 1127 */
1122 ld r8, HSTATE_VMHANDLER(r13) 1128 ld r8, HSTATE_VMHANDLER(r13)
1123 ld r7, HSTATE_HOST_MSR(r13) 1129 ld r7, HSTATE_HOST_MSR(r13)
1124 1130
1131 cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
1125 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1132 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
1133BEGIN_FTR_SECTION
1126 beq 11f 1134 beq 11f
1127 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1135END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
1128 1136
1129 /* RFI into the highmem handler, or branch to interrupt handler */ 1137 /* RFI into the highmem handler, or branch to interrupt handler */
113012: mfmsr r6 1138 mfmsr r6
1131 mtctr r12
1132 li r0, MSR_RI 1139 li r0, MSR_RI
1133 andc r6, r6, r0 1140 andc r6, r6, r0
1134 mtmsrd r6, 1 /* Clear RI in MSR */ 1141 mtmsrd r6, 1 /* Clear RI in MSR */
1135 mtsrr0 r8 1142 mtsrr0 r8
1136 mtsrr1 r7 1143 mtsrr1 r7
1137 beqctr 1144 beqa 0x500 /* external interrupt (PPC970) */
1145 beq cr1, 13f /* machine check */
1138 RFI 1146 RFI
1139 1147
114011: 1148 /* On POWER7, we have external interrupts set to use HSRR0/1 */
1141BEGIN_FTR_SECTION 114911: mtspr SPRN_HSRR0, r8
1142 b 12b
1143END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
1144 mtspr SPRN_HSRR0, r8
1145 mtspr SPRN_HSRR1, r7 1150 mtspr SPRN_HSRR1, r7
1146 ba 0x500 1151 ba 0x500
1147 1152
115313: b machine_check_fwnmi
1154
1148/* 1155/*
1149 * Check whether an HDSI is an HPTE not found fault or something else. 1156 * Check whether an HDSI is an HPTE not found fault or something else.
1150 * If it is an HPTE not found fault that is due to the guest accessing 1157 * If it is an HPTE not found fault that is due to the guest accessing
@@ -1177,7 +1184,7 @@ kvmppc_hdsi:
1177 cmpdi r3, 0 /* retry the instruction */ 1184 cmpdi r3, 0 /* retry the instruction */
1178 beq 6f 1185 beq 6f
1179 cmpdi r3, -1 /* handle in kernel mode */ 1186 cmpdi r3, -1 /* handle in kernel mode */
1180 beq nohpte_cont 1187 beq guest_exit_cont
1181 cmpdi r3, -2 /* MMIO emulation; need instr word */ 1188 cmpdi r3, -2 /* MMIO emulation; need instr word */
1182 beq 2f 1189 beq 2f
1183 1190
@@ -1191,6 +1198,7 @@ kvmppc_hdsi:
1191 li r10, BOOK3S_INTERRUPT_DATA_STORAGE 1198 li r10, BOOK3S_INTERRUPT_DATA_STORAGE
1192 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1199 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1193 rotldi r11, r11, 63 1200 rotldi r11, r11, 63
1201fast_interrupt_c_return:
11946: ld r7, VCPU_CTR(r9) 12026: ld r7, VCPU_CTR(r9)
1195 lwz r8, VCPU_XER(r9) 1203 lwz r8, VCPU_XER(r9)
1196 mtctr r7 1204 mtctr r7
@@ -1223,7 +1231,7 @@ kvmppc_hdsi:
1223 /* Unset guest mode. */ 1231 /* Unset guest mode. */
1224 li r0, KVM_GUEST_MODE_NONE 1232 li r0, KVM_GUEST_MODE_NONE
1225 stb r0, HSTATE_IN_GUEST(r13) 1233 stb r0, HSTATE_IN_GUEST(r13)
1226 b nohpte_cont 1234 b guest_exit_cont
1227 1235
1228/* 1236/*
1229 * Similarly for an HISI, reflect it to the guest as an ISI unless 1237 * Similarly for an HISI, reflect it to the guest as an ISI unless
@@ -1249,9 +1257,9 @@ kvmppc_hisi:
1249 ld r11, VCPU_MSR(r9) 1257 ld r11, VCPU_MSR(r9)
1250 li r12, BOOK3S_INTERRUPT_H_INST_STORAGE 1258 li r12, BOOK3S_INTERRUPT_H_INST_STORAGE
1251 cmpdi r3, 0 /* retry the instruction */ 1259 cmpdi r3, 0 /* retry the instruction */
1252 beq 6f 1260 beq fast_interrupt_c_return
1253 cmpdi r3, -1 /* handle in kernel mode */ 1261 cmpdi r3, -1 /* handle in kernel mode */
1254 beq nohpte_cont 1262 beq guest_exit_cont
1255 1263
1256 /* Synthesize an ISI for the guest */ 1264 /* Synthesize an ISI for the guest */
1257 mr r11, r3 1265 mr r11, r3
@@ -1260,12 +1268,7 @@ kvmppc_hisi:
1260 li r10, BOOK3S_INTERRUPT_INST_STORAGE 1268 li r10, BOOK3S_INTERRUPT_INST_STORAGE
1261 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1269 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1262 rotldi r11, r11, 63 1270 rotldi r11, r11, 63
12636: ld r7, VCPU_CTR(r9) 1271 b fast_interrupt_c_return
1264 lwz r8, VCPU_XER(r9)
1265 mtctr r7
1266 mtxer r8
1267 mr r4, r9
1268 b fast_guest_return
1269 1272
12703: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ 12733: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */
1271 ld r5, KVM_VRMA_SLB_V(r6) 1274 ld r5, KVM_VRMA_SLB_V(r6)
@@ -1281,14 +1284,14 @@ kvmppc_hisi:
1281hcall_try_real_mode: 1284hcall_try_real_mode:
1282 ld r3,VCPU_GPR(R3)(r9) 1285 ld r3,VCPU_GPR(R3)(r9)
1283 andi. r0,r11,MSR_PR 1286 andi. r0,r11,MSR_PR
1284 bne hcall_real_cont 1287 bne guest_exit_cont
1285 clrrdi r3,r3,2 1288 clrrdi r3,r3,2
1286 cmpldi r3,hcall_real_table_end - hcall_real_table 1289 cmpldi r3,hcall_real_table_end - hcall_real_table
1287 bge hcall_real_cont 1290 bge guest_exit_cont
1288 LOAD_REG_ADDR(r4, hcall_real_table) 1291 LOAD_REG_ADDR(r4, hcall_real_table)
1289 lwzx r3,r3,r4 1292 lwzx r3,r3,r4
1290 cmpwi r3,0 1293 cmpwi r3,0
1291 beq hcall_real_cont 1294 beq guest_exit_cont
1292 add r3,r3,r4 1295 add r3,r3,r4
1293 mtctr r3 1296 mtctr r3
1294 mr r3,r9 /* get vcpu pointer */ 1297 mr r3,r9 /* get vcpu pointer */
@@ -1309,7 +1312,7 @@ hcall_real_fallback:
1309 li r12,BOOK3S_INTERRUPT_SYSCALL 1312 li r12,BOOK3S_INTERRUPT_SYSCALL
1310 ld r9, HSTATE_KVM_VCPU(r13) 1313 ld r9, HSTATE_KVM_VCPU(r13)
1311 1314
1312 b hcall_real_cont 1315 b guest_exit_cont
1313 1316
1314 .globl hcall_real_table 1317 .globl hcall_real_table
1315hcall_real_table: 1318hcall_real_table:
@@ -1568,6 +1571,21 @@ kvm_cede_exit:
1568 li r3,H_TOO_HARD 1571 li r3,H_TOO_HARD
1569 blr 1572 blr
1570 1573
1574 /* Try to handle a machine check in real mode */
1575machine_check_realmode:
1576 mr r3, r9 /* get vcpu pointer */
1577 bl .kvmppc_realmode_machine_check
1578 nop
1579 cmpdi r3, 0 /* continue exiting from guest? */
1580 ld r9, HSTATE_KVM_VCPU(r13)
1581 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
1582 beq mc_cont
1583 /* If not, deliver a machine check. SRR0/1 are already set */
1584 li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
1585 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1586 rotldi r11, r11, 63
1587 b fast_interrupt_c_return
1588
1571secondary_too_late: 1589secondary_too_late:
1572 ld r5,HSTATE_KVM_VCORE(r13) 1590 ld r5,HSTATE_KVM_VCORE(r13)
1573 HMT_LOW 1591 HMT_LOW
@@ -1587,6 +1605,10 @@ secondary_too_late:
1587 .endr 1605 .endr
1588 1606
1589secondary_nap: 1607secondary_nap:
1608 /* Clear our vcpu pointer so we don't come back in early */
1609 li r0, 0
1610 std r0, HSTATE_KVM_VCPU(r13)
1611 lwsync
1590 /* Clear any pending IPI - assume we're a secondary thread */ 1612 /* Clear any pending IPI - assume we're a secondary thread */
1591 ld r5, HSTATE_XICS_PHYS(r13) 1613 ld r5, HSTATE_XICS_PHYS(r13)
1592 li r7, XICS_XIRR 1614 li r7, XICS_XIRR
@@ -1612,8 +1634,6 @@ secondary_nap:
1612kvm_no_guest: 1634kvm_no_guest:
1613 li r0, KVM_HWTHREAD_IN_NAP 1635 li r0, KVM_HWTHREAD_IN_NAP
1614 stb r0, HSTATE_HWTHREAD_STATE(r13) 1636 stb r0, HSTATE_HWTHREAD_STATE(r13)
1615 li r0, 0
1616 std r0, HSTATE_KVM_VCPU(r13)
1617 1637
1618 li r3, LPCR_PECE0 1638 li r3, LPCR_PECE0
1619 mfspr r4, SPRN_LPCR 1639 mfspr r4, SPRN_LPCR
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 41cb0017e757..2c86b0d63714 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -114,11 +114,6 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
114 hlist_del_init_rcu(&pte->list_vpte); 114 hlist_del_init_rcu(&pte->list_vpte);
115 hlist_del_init_rcu(&pte->list_vpte_long); 115 hlist_del_init_rcu(&pte->list_vpte_long);
116 116
117 if (pte->pte.may_write)
118 kvm_release_pfn_dirty(pte->pfn);
119 else
120 kvm_release_pfn_clean(pte->pfn);
121
122 spin_unlock(&vcpu3s->mmu_lock); 117 spin_unlock(&vcpu3s->mmu_lock);
123 118
124 vcpu3s->hpte_cache_count--; 119 vcpu3s->hpte_cache_count--;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 05c28f59f77f..28d38adeca73 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -52,8 +52,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
52#define MSR_USER32 MSR_USER 52#define MSR_USER32 MSR_USER
53#define MSR_USER64 MSR_USER 53#define MSR_USER64 MSR_USER
54#define HW_PAGE_SIZE PAGE_SIZE 54#define HW_PAGE_SIZE PAGE_SIZE
55#define __hard_irq_disable local_irq_disable
56#define __hard_irq_enable local_irq_enable
57#endif 55#endif
58 56
59void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 57void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -66,7 +64,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
66 svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; 64 svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
67 svcpu_put(svcpu); 65 svcpu_put(svcpu);
68#endif 66#endif
69 67 vcpu->cpu = smp_processor_id();
70#ifdef CONFIG_PPC_BOOK3S_32 68#ifdef CONFIG_PPC_BOOK3S_32
71 current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; 69 current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
72#endif 70#endif
@@ -83,17 +81,71 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
83 svcpu_put(svcpu); 81 svcpu_put(svcpu);
84#endif 82#endif
85 83
86 kvmppc_giveup_ext(vcpu, MSR_FP); 84 kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
87 kvmppc_giveup_ext(vcpu, MSR_VEC); 85 vcpu->cpu = -1;
88 kvmppc_giveup_ext(vcpu, MSR_VSX); 86}
87
88int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
89{
90 int r = 1; /* Indicate we want to get back into the guest */
91
92 /* We misuse TLB_FLUSH to indicate that we want to clear
93 all shadow cache entries */
94 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
95 kvmppc_mmu_pte_flush(vcpu, 0, 0);
96
97 return r;
98}
99
100/************* MMU Notifiers *************/
101
102int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
103{
104 trace_kvm_unmap_hva(hva);
105
106 /*
107 * Flush all shadow tlb entries everywhere. This is slow, but
 108 * we are 100% sure that we catch the page that is to be unmapped
109 */
110 kvm_flush_remote_tlbs(kvm);
111
112 return 0;
113}
114
115int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
116{
117 /* kvm_unmap_hva flushes everything anyways */
118 kvm_unmap_hva(kvm, start);
119
120 return 0;
121}
122
123int kvm_age_hva(struct kvm *kvm, unsigned long hva)
124{
125 /* XXX could be more clever ;) */
126 return 0;
127}
128
129int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
130{
131 /* XXX could be more clever ;) */
132 return 0;
89} 133}
90 134
135void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
136{
137 /* The page will get remapped properly on its next fault */
138 kvm_unmap_hva(kvm, hva);
139}
140
141/*****************************************/
142
91static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) 143static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
92{ 144{
93 ulong smsr = vcpu->arch.shared->msr; 145 ulong smsr = vcpu->arch.shared->msr;
94 146
95 /* Guest MSR values */ 147 /* Guest MSR values */
96 smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; 148 smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
97 /* Process MSR values */ 149 /* Process MSR values */
98 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; 150 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
99 /* External providers the guest reserved */ 151 /* External providers the guest reserved */
@@ -379,10 +431,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
379 431
380static inline int get_fpr_index(int i) 432static inline int get_fpr_index(int i)
381{ 433{
382#ifdef CONFIG_VSX 434 return i * TS_FPRWIDTH;
383 i *= 2;
384#endif
385 return i;
386} 435}
387 436
388/* Give up external provider (FPU, Altivec, VSX) */ 437/* Give up external provider (FPU, Altivec, VSX) */
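
get_fpr_index() now scales by TS_FPRWIDTH instead of open-coding the doubling under #ifdef CONFIG_VSX: when VSX is built in, the thread's FP save area holds two doublewords per register, the FP value plus the doubleword that extends it to a full VSX register, which is why the surrounding code indexes thread_fpr[get_fpr_index(i)] and thread_fpr[get_fpr_index(i) + 1]. A tiny sketch of that indexing, assuming a width of 2 as the removed #ifdef branch implied:

#include <stdio.h>

#define TS_FPRWIDTH 2             /* doublewords per FP register when VSX is configured */

static unsigned long thread_fpr[32 * TS_FPRWIDTH];  /* mimics the thread.fpr[] layout */

static int get_fpr_index(int i)
{
	return i * TS_FPRWIDTH;
}

int main(void)
{
	thread_fpr[get_fpr_index(3)]     = 0x3ff0UL;   /* FP register 3 (value is arbitrary) */
	thread_fpr[get_fpr_index(3) + 1] = 0xbeefUL;   /* VSX extension of register 3 */

	printf("fpr3 at slot %d, its VSX half at slot %d\n",
	       get_fpr_index(3), get_fpr_index(3) + 1);
	return 0;
}
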
@@ -396,41 +445,49 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
396 u64 *thread_fpr = (u64*)t->fpr; 445 u64 *thread_fpr = (u64*)t->fpr;
397 int i; 446 int i;
398 447
399 if (!(vcpu->arch.guest_owned_ext & msr)) 448 /*
449 * VSX instructions can access FP and vector registers, so if
450 * we are giving up VSX, make sure we give up FP and VMX as well.
451 */
452 if (msr & MSR_VSX)
453 msr |= MSR_FP | MSR_VEC;
454
455 msr &= vcpu->arch.guest_owned_ext;
456 if (!msr)
400 return; 457 return;
401 458
402#ifdef DEBUG_EXT 459#ifdef DEBUG_EXT
403 printk(KERN_INFO "Giving up ext 0x%lx\n", msr); 460 printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
404#endif 461#endif
405 462
406 switch (msr) { 463 if (msr & MSR_FP) {
407 case MSR_FP: 464 /*
465 * Note that on CPUs with VSX, giveup_fpu stores
466 * both the traditional FP registers and the added VSX
467 * registers into thread.fpr[].
468 */
408 giveup_fpu(current); 469 giveup_fpu(current);
409 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 470 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
410 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; 471 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
411 472
412 vcpu->arch.fpscr = t->fpscr.val; 473 vcpu->arch.fpscr = t->fpscr.val;
413 break; 474
414 case MSR_VEC: 475#ifdef CONFIG_VSX
476 if (cpu_has_feature(CPU_FTR_VSX))
477 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
478 vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
479#endif
480 }
481
415#ifdef CONFIG_ALTIVEC 482#ifdef CONFIG_ALTIVEC
483 if (msr & MSR_VEC) {
416 giveup_altivec(current); 484 giveup_altivec(current);
417 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); 485 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
418 vcpu->arch.vscr = t->vscr; 486 vcpu->arch.vscr = t->vscr;
419#endif
420 break;
421 case MSR_VSX:
422#ifdef CONFIG_VSX
423 __giveup_vsx(current);
424 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
425 vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
426#endif
427 break;
428 default:
429 BUG();
430 } 487 }
488#endif
431 489
432 vcpu->arch.guest_owned_ext &= ~msr; 490 vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX);
433 current->thread.regs->msr &= ~msr;
434 kvmppc_recalc_shadow_msr(vcpu); 491 kvmppc_recalc_shadow_msr(vcpu);
435} 492}
436 493
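
The rewritten kvmppc_giveup_ext() above treats its msr argument as a bitmask of facilities rather than a single one: giving up VSX implies giving up FP and VMX too (VSX instructions can touch both register files), and only the facilities the guest currently owns are actually flushed. A standalone sketch of just that mask manipulation; the MSR bit encodings and the ownership value are illustrative:

#include <stdio.h>

#define MSR_FP  0x00002000UL
#define MSR_VEC 0x02000000UL
#define MSR_VSX 0x00800000UL      /* example encodings of the three facility bits */

static unsigned long guest_owned_ext = MSR_FP | MSR_VEC;   /* what the guest holds right now */

static unsigned long facilities_to_flush(unsigned long msr)
{
	/* Giving up VSX means giving up FP and VMX as well */
	if (msr & MSR_VSX)
		msr |= MSR_FP | MSR_VEC;

	/* Only flush what the guest actually owns */
	return msr & guest_owned_ext;
}

int main(void)
{
	printf("flush %#lx\n", facilities_to_flush(MSR_VSX));  /* FP|VEC (VSX itself not owned) */
	printf("flush %#lx\n", facilities_to_flush(MSR_VEC));  /* just VEC */
	return 0;
}
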
@@ -490,47 +547,56 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
490 return RESUME_GUEST; 547 return RESUME_GUEST;
491 } 548 }
492 549
493 /* We already own the ext */ 550 if (msr == MSR_VSX) {
494 if (vcpu->arch.guest_owned_ext & msr) { 551 /* No VSX? Give an illegal instruction interrupt */
495 return RESUME_GUEST; 552#ifdef CONFIG_VSX
553 if (!cpu_has_feature(CPU_FTR_VSX))
554#endif
555 {
556 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
557 return RESUME_GUEST;
558 }
559
560 /*
561 * We have to load up all the FP and VMX registers before
562 * we can let the guest use VSX instructions.
563 */
564 msr = MSR_FP | MSR_VEC | MSR_VSX;
496 } 565 }
497 566
567 /* See if we already own all the ext(s) needed */
568 msr &= ~vcpu->arch.guest_owned_ext;
569 if (!msr)
570 return RESUME_GUEST;
571
498#ifdef DEBUG_EXT 572#ifdef DEBUG_EXT
499 printk(KERN_INFO "Loading up ext 0x%lx\n", msr); 573 printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
500#endif 574#endif
501 575
502 current->thread.regs->msr |= msr; 576 current->thread.regs->msr |= msr;
503 577
504 switch (msr) { 578 if (msr & MSR_FP) {
505 case MSR_FP:
506 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 579 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
507 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; 580 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
508 581#ifdef CONFIG_VSX
582 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
583 thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
584#endif
509 t->fpscr.val = vcpu->arch.fpscr; 585 t->fpscr.val = vcpu->arch.fpscr;
510 t->fpexc_mode = 0; 586 t->fpexc_mode = 0;
511 kvmppc_load_up_fpu(); 587 kvmppc_load_up_fpu();
512 break; 588 }
513 case MSR_VEC: 589
590 if (msr & MSR_VEC) {
514#ifdef CONFIG_ALTIVEC 591#ifdef CONFIG_ALTIVEC
515 memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); 592 memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
516 t->vscr = vcpu->arch.vscr; 593 t->vscr = vcpu->arch.vscr;
517 t->vrsave = -1; 594 t->vrsave = -1;
518 kvmppc_load_up_altivec(); 595 kvmppc_load_up_altivec();
519#endif 596#endif
520 break;
521 case MSR_VSX:
522#ifdef CONFIG_VSX
523 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
524 thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
525 kvmppc_load_up_vsx();
526#endif
527 break;
528 default:
529 BUG();
530 } 597 }
531 598
532 vcpu->arch.guest_owned_ext |= msr; 599 vcpu->arch.guest_owned_ext |= msr;
533
534 kvmppc_recalc_shadow_msr(vcpu); 600 kvmppc_recalc_shadow_msr(vcpu);
535 601
536 return RESUME_GUEST; 602 return RESUME_GUEST;
@@ -540,18 +606,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
540 unsigned int exit_nr) 606 unsigned int exit_nr)
541{ 607{
542 int r = RESUME_HOST; 608 int r = RESUME_HOST;
609 int s;
543 610
544 vcpu->stat.sum_exits++; 611 vcpu->stat.sum_exits++;
545 612
546 run->exit_reason = KVM_EXIT_UNKNOWN; 613 run->exit_reason = KVM_EXIT_UNKNOWN;
547 run->ready_for_interrupt_injection = 1; 614 run->ready_for_interrupt_injection = 1;
548 615
549 /* We get here with MSR.EE=0, so enable it to be a nice citizen */ 616 /* We get here with MSR.EE=1 */
550 __hard_irq_enable(); 617
618 trace_kvm_exit(exit_nr, vcpu);
619 kvm_guest_exit();
551 620
552 trace_kvm_book3s_exit(exit_nr, vcpu);
553 preempt_enable();
554 kvm_resched(vcpu);
555 switch (exit_nr) { 621 switch (exit_nr) {
556 case BOOK3S_INTERRUPT_INST_STORAGE: 622 case BOOK3S_INTERRUPT_INST_STORAGE:
557 { 623 {
@@ -802,7 +868,6 @@ program_interrupt:
802 } 868 }
803 } 869 }
804 870
805 preempt_disable();
806 if (!(r & RESUME_HOST)) { 871 if (!(r & RESUME_HOST)) {
807 /* To avoid clobbering exit_reason, only check for signals if 872 /* To avoid clobbering exit_reason, only check for signals if
808 * we aren't already exiting to userspace for some other 873 * we aren't already exiting to userspace for some other
@@ -814,20 +879,13 @@ program_interrupt:
814 * and if we really did time things so badly, then we just exit 879 * and if we really did time things so badly, then we just exit
815 * again due to a host external interrupt. 880 * again due to a host external interrupt.
816 */ 881 */
817 __hard_irq_disable(); 882 local_irq_disable();
818 if (signal_pending(current)) { 883 s = kvmppc_prepare_to_enter(vcpu);
819 __hard_irq_enable(); 884 if (s <= 0) {
820#ifdef EXIT_DEBUG 885 local_irq_enable();
821 printk(KERN_EMERG "KVM: Going back to host\n"); 886 r = s;
822#endif
823 vcpu->stat.signal_exits++;
824 run->exit_reason = KVM_EXIT_INTR;
825 r = -EINTR;
826 } else { 887 } else {
827 /* In case an interrupt came in that was triggered 888 kvmppc_lazy_ee_enable();
828 * from userspace (like DEC), we need to check what
829 * to inject now! */
830 kvmppc_core_prepare_to_enter(vcpu);
831 } 889 }
832 } 890 }
833 891
@@ -899,34 +957,59 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
899 return 0; 957 return 0;
900} 958}
901 959
902int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 960int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
903{ 961{
904 int r = -EINVAL; 962 int r = 0;
905 963
906 switch (reg->id) { 964 switch (id) {
907 case KVM_REG_PPC_HIOR: 965 case KVM_REG_PPC_HIOR:
908 r = copy_to_user((u64 __user *)(long)reg->addr, 966 *val = get_reg_val(id, to_book3s(vcpu)->hior);
909 &to_book3s(vcpu)->hior, sizeof(u64));
910 break; 967 break;
968#ifdef CONFIG_VSX
969 case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
970 long int i = id - KVM_REG_PPC_VSR0;
971
972 if (!cpu_has_feature(CPU_FTR_VSX)) {
973 r = -ENXIO;
974 break;
975 }
976 val->vsxval[0] = vcpu->arch.fpr[i];
977 val->vsxval[1] = vcpu->arch.vsr[i];
978 break;
979 }
980#endif /* CONFIG_VSX */
911 default: 981 default:
982 r = -EINVAL;
912 break; 983 break;
913 } 984 }
914 985
915 return r; 986 return r;
916} 987}
917 988
918int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 989int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
919{ 990{
920 int r = -EINVAL; 991 int r = 0;
921 992
922 switch (reg->id) { 993 switch (id) {
923 case KVM_REG_PPC_HIOR: 994 case KVM_REG_PPC_HIOR:
924 r = copy_from_user(&to_book3s(vcpu)->hior, 995 to_book3s(vcpu)->hior = set_reg_val(id, *val);
925 (u64 __user *)(long)reg->addr, sizeof(u64)); 996 to_book3s(vcpu)->hior_explicit = true;
926 if (!r) 997 break;
927 to_book3s(vcpu)->hior_explicit = true; 998#ifdef CONFIG_VSX
999 case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
1000 long int i = id - KVM_REG_PPC_VSR0;
1001
1002 if (!cpu_has_feature(CPU_FTR_VSX)) {
1003 r = -ENXIO;
1004 break;
1005 }
1006 vcpu->arch.fpr[i] = val->vsxval[0];
1007 vcpu->arch.vsr[i] = val->vsxval[1];
928 break; 1008 break;
1009 }
1010#endif /* CONFIG_VSX */
929 default: 1011 default:
1012 r = -EINVAL;
930 break; 1013 break;
931 } 1014 }
932 1015
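
The new KVM_REG_PPC_VSR0..31 one-reg handlers above expose each 128-bit VSX register as a pair of 64-bit doublewords, one kept in the FP array (fpr[i]) and the other in the VSX extension array (vsr[i]), and report -ENXIO when the host has no VSX. A sketch of that get/set pairing with stand-in arrays and a made-up feature flag:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t fpr[32];          /* doubleword of each VSR kept with the FP state */
static uint64_t vsr[32];          /* the other doubleword, the VSX extension */
static bool cpu_has_vsx = true;   /* assumption for the example */

struct one_reg_val { uint64_t vsxval[2]; };

static int get_vsr(int i, struct one_reg_val *val)
{
	if (!cpu_has_vsx)
		return -ENXIO;
	val->vsxval[0] = fpr[i];
	val->vsxval[1] = vsr[i];
	return 0;
}

static int set_vsr(int i, const struct one_reg_val *val)
{
	if (!cpu_has_vsx)
		return -ENXIO;
	fpr[i] = val->vsxval[0];
	vsr[i] = val->vsxval[1];
	return 0;
}

int main(void)
{
	struct one_reg_val v = { { 0x1111222233334444ULL, 0x5555666677778888ULL } };

	set_vsr(7, &v);
	get_vsr(7, &v);
	printf("%llx %llx\n", (unsigned long long)v.vsxval[0],
	       (unsigned long long)v.vsxval[1]);
	return 0;
}
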
@@ -1020,8 +1103,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1020#endif 1103#endif
1021 ulong ext_msr; 1104 ulong ext_msr;
1022 1105
1023 preempt_disable();
1024
1025 /* Check if we can run the vcpu at all */ 1106 /* Check if we can run the vcpu at all */
1026 if (!vcpu->arch.sane) { 1107 if (!vcpu->arch.sane) {
1027 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 1108 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1029,21 +1110,16 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1029 goto out; 1110 goto out;
1030 } 1111 }
1031 1112
1032 kvmppc_core_prepare_to_enter(vcpu);
1033
1034 /* 1113 /*
1035 * Interrupts could be timers for the guest which we have to inject 1114 * Interrupts could be timers for the guest which we have to inject
1036 * again, so let's postpone them until we're in the guest and if we 1115 * again, so let's postpone them until we're in the guest and if we
1037 * really did time things so badly, then we just exit again due to 1116 * really did time things so badly, then we just exit again due to
1038 * a host external interrupt. 1117 * a host external interrupt.
1039 */ 1118 */
1040 __hard_irq_disable(); 1119 local_irq_disable();
1041 1120 ret = kvmppc_prepare_to_enter(vcpu);
1042 /* No need to go into the guest when all we do is going out */ 1121 if (ret <= 0) {
1043 if (signal_pending(current)) { 1122 local_irq_enable();
1044 __hard_irq_enable();
1045 kvm_run->exit_reason = KVM_EXIT_INTR;
1046 ret = -EINTR;
1047 goto out; 1123 goto out;
1048 } 1124 }
1049 1125
@@ -1070,7 +1146,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1070 /* Save VSX state in stack */ 1146 /* Save VSX state in stack */
1071 used_vsr = current->thread.used_vsr; 1147 used_vsr = current->thread.used_vsr;
1072 if (used_vsr && (current->thread.regs->msr & MSR_VSX)) 1148 if (used_vsr && (current->thread.regs->msr & MSR_VSX))
1073 __giveup_vsx(current); 1149 __giveup_vsx(current);
1074#endif 1150#endif
1075 1151
1076 /* Remember the MSR with disabled extensions */ 1152 /* Remember the MSR with disabled extensions */
@@ -1080,20 +1156,19 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1080 if (vcpu->arch.shared->msr & MSR_FP) 1156 if (vcpu->arch.shared->msr & MSR_FP)
1081 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 1157 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
1082 1158
1083 kvm_guest_enter(); 1159 kvmppc_lazy_ee_enable();
1084 1160
1085 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 1161 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
1086 1162
1087 kvm_guest_exit(); 1163 /* No need for kvm_guest_exit. It's done in handle_exit.
1088 1164 We also get here with interrupts enabled. */
1089 current->thread.regs->msr = ext_msr;
1090 1165
1091 /* Make sure we save the guest FPU/Altivec/VSX state */ 1166 /* Make sure we save the guest FPU/Altivec/VSX state */
1092 kvmppc_giveup_ext(vcpu, MSR_FP); 1167 kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
1093 kvmppc_giveup_ext(vcpu, MSR_VEC); 1168
1094 kvmppc_giveup_ext(vcpu, MSR_VSX); 1169 current->thread.regs->msr = ext_msr;
1095 1170
1096 /* Restore FPU state from stack */ 1171 /* Restore FPU/VSX state from stack */
1097 memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); 1172 memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
1098 current->thread.fpscr.val = fpscr; 1173 current->thread.fpscr.val = fpscr;
1099 current->thread.fpexc_mode = fpexc_mode; 1174 current->thread.fpexc_mode = fpexc_mode;
@@ -1113,7 +1188,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1113#endif 1188#endif
1114 1189
1115out: 1190out:
1116 preempt_enable(); 1191 vcpu->mode = OUTSIDE_GUEST_MODE;
1117 return ret; 1192 return ret;
1118} 1193}
1119 1194
@@ -1181,14 +1256,31 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
1181} 1256}
1182#endif /* CONFIG_PPC64 */ 1257#endif /* CONFIG_PPC64 */
1183 1258
1259void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
1260 struct kvm_memory_slot *dont)
1261{
1262}
1263
1264int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
1265 unsigned long npages)
1266{
1267 return 0;
1268}
1269
1184int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1270int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1271 struct kvm_memory_slot *memslot,
1185 struct kvm_userspace_memory_region *mem) 1272 struct kvm_userspace_memory_region *mem)
1186{ 1273{
1187 return 0; 1274 return 0;
1188} 1275}
1189 1276
1190void kvmppc_core_commit_memory_region(struct kvm *kvm, 1277void kvmppc_core_commit_memory_region(struct kvm *kvm,
1191 struct kvm_userspace_memory_region *mem) 1278 struct kvm_userspace_memory_region *mem,
1279 struct kvm_memory_slot old)
1280{
1281}
1282
1283void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
1192{ 1284{
1193} 1285}
1194 1286
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 9ecf6e35cd8d..8f7633e3afb8 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -170,20 +170,21 @@ kvmppc_handler_skip_ins:
170 * Call kvmppc_handler_trampoline_enter in real mode 170 * Call kvmppc_handler_trampoline_enter in real mode
171 * 171 *
172 * On entry, r4 contains the guest shadow MSR 172 * On entry, r4 contains the guest shadow MSR
173 * MSR.EE has to be 0 when calling this function
173 */ 174 */
174_GLOBAL(kvmppc_entry_trampoline) 175_GLOBAL(kvmppc_entry_trampoline)
175 mfmsr r5 176 mfmsr r5
176 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) 177 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
177 toreal(r7) 178 toreal(r7)
178 179
179 li r9, MSR_RI
180 ori r9, r9, MSR_EE
181 andc r9, r5, r9 /* Clear EE and RI in MSR value */
182 li r6, MSR_IR | MSR_DR 180 li r6, MSR_IR | MSR_DR
183 ori r6, r6, MSR_EE 181 andc r6, r5, r6 /* Clear DR and IR in MSR value */
184 andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ 182 /*
185 MTMSR_EERI(r9) /* Clear EE and RI in MSR */ 183 * Set EE in HOST_MSR so that it's enabled when we get into our
186 mtsrr0 r7 /* before we set srr0/1 */ 184 * C exit handler function
185 */
186 ori r5, r5, MSR_EE
187 mtsrr0 r7
187 mtsrr1 r6 188 mtsrr1 r6
188 RFI 189 RFI
189 190
@@ -233,8 +234,5 @@ define_load_up(fpu)
233#ifdef CONFIG_ALTIVEC 234#ifdef CONFIG_ALTIVEC
234define_load_up(altivec) 235define_load_up(altivec)
235#endif 236#endif
236#ifdef CONFIG_VSX
237define_load_up(vsx)
238#endif
239 237
240#include "book3s_segment.S" 238#include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097c852b..69f114015780 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -36,9 +36,11 @@
36#include <asm/dbell.h> 36#include <asm/dbell.h>
37#include <asm/hw_irq.h> 37#include <asm/hw_irq.h>
38#include <asm/irq.h> 38#include <asm/irq.h>
39#include <asm/time.h>
39 40
40#include "timing.h" 41#include "timing.h"
41#include "booke.h" 42#include "booke.h"
43#include "trace.h"
42 44
43unsigned long kvmppc_booke_handlers; 45unsigned long kvmppc_booke_handlers;
44 46
@@ -62,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
62 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 64 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63 { "doorbell", VCPU_STAT(dbell_exits) }, 65 { "doorbell", VCPU_STAT(dbell_exits) },
64 { "guest doorbell", VCPU_STAT(gdbell_exits) }, 66 { "guest doorbell", VCPU_STAT(gdbell_exits) },
67 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
65 { NULL } 68 { NULL }
66}; 69};
67 70
@@ -120,6 +123,16 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
120} 123}
121#endif 124#endif
122 125
126static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu)
127{
128#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV)
129 /* We always treat the FP bit as enabled from the host
 130 perspective, so we only need to adjust the shadow MSR */
131 vcpu->arch.shadow_msr &= ~MSR_FP;
132 vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP;
133#endif
134}
135
123/* 136/*
124 * Helper function for "full" MSR writes. No need to call this if only 137 * Helper function for "full" MSR writes. No need to call this if only
125 * EE/CE/ME/DE/RI are changing. 138 * EE/CE/ME/DE/RI are changing.
@@ -136,11 +149,13 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
136 149
137 kvmppc_mmu_msr_notify(vcpu, old_msr); 150 kvmppc_mmu_msr_notify(vcpu, old_msr);
138 kvmppc_vcpu_sync_spe(vcpu); 151 kvmppc_vcpu_sync_spe(vcpu);
152 kvmppc_vcpu_sync_fpu(vcpu);
139} 153}
140 154
141static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, 155static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
142 unsigned int priority) 156 unsigned int priority)
143{ 157{
158 trace_kvm_booke_queue_irqprio(vcpu, priority);
144 set_bit(priority, &vcpu->arch.pending_exceptions); 159 set_bit(priority, &vcpu->arch.pending_exceptions);
145} 160}
146 161
@@ -206,6 +221,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
206 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); 221 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
207} 222}
208 223
224static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
225{
226 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
227}
228
229static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
230{
231 clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
232}
233
209static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) 234static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
210{ 235{
211#ifdef CONFIG_KVM_BOOKE_HV 236#ifdef CONFIG_KVM_BOOKE_HV
@@ -287,6 +312,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
287 bool crit; 312 bool crit;
288 bool keep_irq = false; 313 bool keep_irq = false;
289 enum int_class int_class; 314 enum int_class int_class;
315 ulong new_msr = vcpu->arch.shared->msr;
290 316
291 /* Truncate crit indicators in 32 bit mode */ 317 /* Truncate crit indicators in 32 bit mode */
292 if (!(vcpu->arch.shared->msr & MSR_SF)) { 318 if (!(vcpu->arch.shared->msr & MSR_SF)) {
@@ -325,6 +351,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
325 msr_mask = MSR_CE | MSR_ME | MSR_DE; 351 msr_mask = MSR_CE | MSR_ME | MSR_DE;
326 int_class = INT_CLASS_NONCRIT; 352 int_class = INT_CLASS_NONCRIT;
327 break; 353 break;
354 case BOOKE_IRQPRIO_WATCHDOG:
328 case BOOKE_IRQPRIO_CRITICAL: 355 case BOOKE_IRQPRIO_CRITICAL:
329 case BOOKE_IRQPRIO_DBELL_CRIT: 356 case BOOKE_IRQPRIO_DBELL_CRIT:
330 allowed = vcpu->arch.shared->msr & MSR_CE; 357 allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -381,7 +408,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
381 set_guest_esr(vcpu, vcpu->arch.queued_esr); 408 set_guest_esr(vcpu, vcpu->arch.queued_esr);
382 if (update_dear == true) 409 if (update_dear == true)
383 set_guest_dear(vcpu, vcpu->arch.queued_dear); 410 set_guest_dear(vcpu, vcpu->arch.queued_dear);
384 kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); 411
412 new_msr &= msr_mask;
413#if defined(CONFIG_64BIT)
414 if (vcpu->arch.epcr & SPRN_EPCR_ICM)
415 new_msr |= MSR_CM;
416#endif
417 kvmppc_set_msr(vcpu, new_msr);
385 418
386 if (!keep_irq) 419 if (!keep_irq)
387 clear_bit(priority, &vcpu->arch.pending_exceptions); 420 clear_bit(priority, &vcpu->arch.pending_exceptions);
@@ -404,12 +437,121 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
404 return allowed; 437 return allowed;
405} 438}
406 439
440/*
441 * Return the number of jiffies until the next timeout. If the timeout is
442 * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
443 * because the larger value can break the timer APIs.
444 */
445static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
446{
447 u64 tb, wdt_tb, wdt_ticks = 0;
448 u64 nr_jiffies = 0;
449 u32 period = TCR_GET_WP(vcpu->arch.tcr);
450
451 wdt_tb = 1ULL << (63 - period);
452 tb = get_tb();
453 /*
 454 * The watchdog timeout will happen when the TB bit corresponding
 455 * to the watchdog period toggles from 0 to 1.
456 */
457 if (tb & wdt_tb)
458 wdt_ticks = wdt_tb;
459
460 wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
461
462 /* Convert timebase ticks to jiffies */
463 nr_jiffies = wdt_ticks;
464
465 if (do_div(nr_jiffies, tb_ticks_per_jiffy))
466 nr_jiffies++;
467
468 return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
469}
470
471static void arm_next_watchdog(struct kvm_vcpu *vcpu)
472{
473 unsigned long nr_jiffies;
474 unsigned long flags;
475
476 /*
477 * If TSR_ENW and TSR_WIS are not set then there is no need to exit
478 * to userspace, so clear the KVM_REQ_WATCHDOG request.
479 */
480 if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS))
481 clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests);
482
483 spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
484 nr_jiffies = watchdog_next_timeout(vcpu);
485 /*
486 * If the watchdog timeout in jiffies is >= NEXT_TIMER_MAX_DELTA then
487 * do not arm the watchdog timer, as such a value can break the timer APIs.
488 */
489 if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
490 mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
491 else
492 del_timer(&vcpu->arch.wdt_timer);
493 spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags);
494}
495
496void kvmppc_watchdog_func(unsigned long data)
497{
498 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
499 u32 tsr, new_tsr;
500 int final;
501
502 do {
503 new_tsr = tsr = vcpu->arch.tsr;
504 final = 0;
505
506 /* Time out event */
507 if (tsr & TSR_ENW) {
508 if (tsr & TSR_WIS)
509 final = 1;
510 else
511 new_tsr = tsr | TSR_WIS;
512 } else {
513 new_tsr = tsr | TSR_ENW;
514 }
515 } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
516
517 if (new_tsr & TSR_WIS) {
518 smp_wmb();
519 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
520 kvm_vcpu_kick(vcpu);
521 }
522
523 /*
524 * If this is the final watchdog expiry and some action is required,
525 * exit to userspace.
526 */
527 if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
528 vcpu->arch.watchdog_enabled) {
529 smp_wmb();
530 kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
531 kvm_vcpu_kick(vcpu);
532 }
533
534 /*
535 * Stop running the watchdog timer after final expiration to
536 * prevent the host from being flooded with timers if the
537 * guest sets a short period.
538 * The timer will resume the next time TSR/TCR is updated.
539 */
540 if (!final)
541 arm_next_watchdog(vcpu);
542}
543
407static void update_timer_ints(struct kvm_vcpu *vcpu) 544static void update_timer_ints(struct kvm_vcpu *vcpu)
408{ 545{
409 if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) 546 if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
410 kvmppc_core_queue_dec(vcpu); 547 kvmppc_core_queue_dec(vcpu);
411 else 548 else
412 kvmppc_core_dequeue_dec(vcpu); 549 kvmppc_core_dequeue_dec(vcpu);
550
551 if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
552 kvmppc_core_queue_watchdog(vcpu);
553 else
554 kvmppc_core_dequeue_watchdog(vcpu);
413} 555}
414 556
415static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) 557static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
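
The bit arithmetic in watchdog_next_timeout() above is terse. The following stand-alone user-space sketch (not kernel code; the period value, the sample timebase and the omission of the jiffies conversion are all illustrative) mirrors how the next 0-to-1 toggle of TB bit (63 - period) is computed:

/*
 * Illustrative sketch only: ticks until bit (63 - period) of the
 * timebase next flips from 0 to 1, mirroring watchdog_next_timeout().
 * The conversion to jiffies and the NEXT_TIMER_MAX_DELTA clamp are omitted.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t ticks_to_watchdog_event(uint64_t tb, unsigned int period)
{
	uint64_t wdt_tb = 1ULL << (63 - period);	/* weight of the watched bit */
	uint64_t ticks = 0;

	/* If the bit is already 1, it must first fall back to 0 (one full period). */
	if (tb & wdt_tb)
		ticks = wdt_tb;

	/* Then the carry out of the lower bits raises it from 0 to 1. */
	ticks += wdt_tb - (tb & (wdt_tb - 1));

	return ticks;
}

int main(void)
{
	/* Illustrative values: period 56 watches bit 7, i.e. a 128-tick interval. */
	printf("%llu ticks\n",
	       (unsigned long long)ticks_to_watchdog_event(0x2345, 56));
	return 0;
}
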
@@ -417,13 +559,6 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
417 unsigned long *pending = &vcpu->arch.pending_exceptions; 559 unsigned long *pending = &vcpu->arch.pending_exceptions;
418 unsigned int priority; 560 unsigned int priority;
419 561
420 if (vcpu->requests) {
421 if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
422 smp_mb();
423 update_timer_ints(vcpu);
424 }
425 }
426
427 priority = __ffs(*pending); 562 priority = __ffs(*pending);
428 while (priority < BOOKE_IRQPRIO_MAX) { 563 while (priority < BOOKE_IRQPRIO_MAX) {
429 if (kvmppc_booke_irqprio_deliver(vcpu, priority)) 564 if (kvmppc_booke_irqprio_deliver(vcpu, priority))
@@ -459,37 +594,20 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
459 return r; 594 return r;
460} 595}
461 596
462/* 597int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
463 * Common checks before entering the guest world. Call with interrupts
464 * disabled.
465 *
466 * returns !0 if a signal is pending and check_signal is true
467 */
468static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
469{ 598{
470 int r = 0; 599 int r = 1; /* Indicate we want to get back into the guest */
471 600
472 WARN_ON_ONCE(!irqs_disabled()); 601 if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu))
473 while (true) { 602 update_timer_ints(vcpu);
474 if (need_resched()) { 603#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
475 local_irq_enable(); 604 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
476 cond_resched(); 605 kvmppc_core_flush_tlb(vcpu);
477 local_irq_disable(); 606#endif
478 continue;
479 }
480
481 if (signal_pending(current)) {
482 r = 1;
483 break;
484 }
485
486 if (kvmppc_core_prepare_to_enter(vcpu)) {
487 /* interrupts got enabled in between, so we
488 are back at square 1 */
489 continue;
490 }
491 607
492 break; 608 if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) {
609 vcpu->run->exit_reason = KVM_EXIT_WATCHDOG;
610 r = 0;
493 } 611 }
494 612
495 return r; 613 return r;
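
With kvmppc_core_check_requests() now able to return KVM_EXIT_WATCHDOG, the final watchdog expiry is handled by the VMM rather than in the kernel. A hedged user-space fragment of the corresponding run-loop branch is shown below; it assumes the exit is requested by enabling KVM_CAP_PPC_BOOKE_WATCHDOG on the vcpu through KVM_ENABLE_CAP, and the reset policy it applies is purely illustrative.

/*
 * Sketch of a VMM run loop handling the new exit reason.  Enabling the
 * exit via KVM_ENABLE_CAP(KVM_CAP_PPC_BOOKE_WATCHDOG) is an assumption
 * here, and the "reset" action is a placeholder.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

int run_vcpu(int vcpu_fd, struct kvm_run *run)	/* run = mmap'ed kvm_run area */
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_BOOKE_WATCHDOG };

	if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap) < 0)
		perror("KVM_ENABLE_CAP");	/* older kernel: no watchdog exits */

	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;

		switch (run->exit_reason) {
		case KVM_EXIT_WATCHDOG:
			/* Final expiry with TCR[WRC] set: act on the guest's behalf. */
			fprintf(stderr, "guest watchdog expired, resetting\n");
			/* ... reset or power off the virtual machine here ... */
			break;
		default:
			/* other exit reasons elided */
			break;
		}
	}
}
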
@@ -497,7 +615,7 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
497 615
498int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 616int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
499{ 617{
500 int ret; 618 int ret, s;
501#ifdef CONFIG_PPC_FPU 619#ifdef CONFIG_PPC_FPU
502 unsigned int fpscr; 620 unsigned int fpscr;
503 int fpexc_mode; 621 int fpexc_mode;
@@ -510,11 +628,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
510 } 628 }
511 629
512 local_irq_disable(); 630 local_irq_disable();
513 if (kvmppc_prepare_to_enter(vcpu)) { 631 s = kvmppc_prepare_to_enter(vcpu);
514 kvm_run->exit_reason = KVM_EXIT_INTR; 632 if (s <= 0) {
515 ret = -EINTR; 633 local_irq_enable();
634 ret = s;
516 goto out; 635 goto out;
517 } 636 }
637 kvmppc_lazy_ee_enable();
518 638
519 kvm_guest_enter(); 639 kvm_guest_enter();
520 640
@@ -542,6 +662,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
542 662
543 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 663 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
544 664
665 /* No need for kvm_guest_exit. It's done in handle_exit.
666 We also get here with interrupts enabled. */
667
545#ifdef CONFIG_PPC_FPU 668#ifdef CONFIG_PPC_FPU
546 kvmppc_save_guest_fp(vcpu); 669 kvmppc_save_guest_fp(vcpu);
547 670
@@ -557,10 +680,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
557 current->thread.fpexc_mode = fpexc_mode; 680 current->thread.fpexc_mode = fpexc_mode;
558#endif 681#endif
559 682
560 kvm_guest_exit();
561
562out: 683out:
563 local_irq_enable(); 684 vcpu->mode = OUTSIDE_GUEST_MODE;
564 return ret; 685 return ret;
565} 686}
566 687
@@ -668,6 +789,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
668 unsigned int exit_nr) 789 unsigned int exit_nr)
669{ 790{
670 int r = RESUME_HOST; 791 int r = RESUME_HOST;
792 int s;
671 793
672 /* update before a new last_exit_type is rewritten */ 794 /* update before a new last_exit_type is rewritten */
673 kvmppc_update_timing_stats(vcpu); 795 kvmppc_update_timing_stats(vcpu);
@@ -677,6 +799,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
677 799
678 local_irq_enable(); 800 local_irq_enable();
679 801
802 trace_kvm_exit(exit_nr, vcpu);
803 kvm_guest_exit();
804
680 run->exit_reason = KVM_EXIT_UNKNOWN; 805 run->exit_reason = KVM_EXIT_UNKNOWN;
681 run->ready_for_interrupt_injection = 1; 806 run->ready_for_interrupt_injection = 1;
682 807
@@ -971,10 +1096,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
971 */ 1096 */
972 if (!(r & RESUME_HOST)) { 1097 if (!(r & RESUME_HOST)) {
973 local_irq_disable(); 1098 local_irq_disable();
974 if (kvmppc_prepare_to_enter(vcpu)) { 1099 s = kvmppc_prepare_to_enter(vcpu);
975 run->exit_reason = KVM_EXIT_INTR; 1100 if (s <= 0) {
976 r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); 1101 local_irq_enable();
977 kvmppc_account_exit(vcpu, SIGNAL_EXITS); 1102 r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
1103 } else {
1104 kvmppc_lazy_ee_enable();
978 } 1105 }
979 } 1106 }
980 1107
@@ -1011,6 +1138,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
1011 return r; 1138 return r;
1012} 1139}
1013 1140
1141int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
1142{
1143 /* setup watchdog timer once */
1144 spin_lock_init(&vcpu->arch.wdt_lock);
1145 setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
1146 (unsigned long)vcpu);
1147
1148 return 0;
1149}
1150
1151void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
1152{
1153 del_timer_sync(&vcpu->arch.wdt_timer);
1154}
1155
1014int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 1156int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1015{ 1157{
1016 int i; 1158 int i;
@@ -1106,7 +1248,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
1106 } 1248 }
1107 1249
1108 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { 1250 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
1251 u32 old_tsr = vcpu->arch.tsr;
1252
1109 vcpu->arch.tsr = sregs->u.e.tsr; 1253 vcpu->arch.tsr = sregs->u.e.tsr;
1254
1255 if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
1256 arm_next_watchdog(vcpu);
1257
1110 update_timer_ints(vcpu); 1258 update_timer_ints(vcpu);
1111 } 1259 }
1112 1260
@@ -1221,12 +1369,70 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1221 1369
1222int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1370int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1223{ 1371{
1224 return -EINVAL; 1372 int r = -EINVAL;
1373
1374 switch (reg->id) {
1375 case KVM_REG_PPC_IAC1:
1376 case KVM_REG_PPC_IAC2:
1377 case KVM_REG_PPC_IAC3:
1378 case KVM_REG_PPC_IAC4: {
1379 int iac = reg->id - KVM_REG_PPC_IAC1;
1380 r = copy_to_user((u64 __user *)(long)reg->addr,
1381 &vcpu->arch.dbg_reg.iac[iac], sizeof(u64));
1382 break;
1383 }
1384 case KVM_REG_PPC_DAC1:
1385 case KVM_REG_PPC_DAC2: {
1386 int dac = reg->id - KVM_REG_PPC_DAC1;
1387 r = copy_to_user((u64 __user *)(long)reg->addr,
1388 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
1389 break;
1390 }
1391#if defined(CONFIG_64BIT)
1392 case KVM_REG_PPC_EPCR:
1393 r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
1394 break;
1395#endif
1396 default:
1397 break;
1398 }
1399 return r;
1225} 1400}
1226 1401
1227int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1402int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1228{ 1403{
1229 return -EINVAL; 1404 int r = -EINVAL;
1405
1406 switch (reg->id) {
1407 case KVM_REG_PPC_IAC1:
1408 case KVM_REG_PPC_IAC2:
1409 case KVM_REG_PPC_IAC3:
1410 case KVM_REG_PPC_IAC4: {
1411 int iac = reg->id - KVM_REG_PPC_IAC1;
1412 r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac],
1413 (u64 __user *)(long)reg->addr, sizeof(u64));
1414 break;
1415 }
1416 case KVM_REG_PPC_DAC1:
1417 case KVM_REG_PPC_DAC2: {
1418 int dac = reg->id - KVM_REG_PPC_DAC1;
1419 r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac],
1420 (u64 __user *)(long)reg->addr, sizeof(u64));
1421 break;
1422 }
1423#if defined(CONFIG_64BIT)
1424 case KVM_REG_PPC_EPCR: {
1425 u32 new_epcr;
1426 r = get_user(new_epcr, (u32 __user *)(long)reg->addr);
1427 if (r == 0)
1428 kvmppc_set_epcr(vcpu, new_epcr);
1429 break;
1430 }
1431#endif
1432 default:
1433 break;
1434 }
1435 return r;
1230} 1436}
1231 1437
1232int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 1438int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
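
The ONE_REG handlers above move 64-bit debug registers (and, on 64-bit builds, EPCR) through a user-supplied buffer. A hedged user-space fragment of the calling side follows; the register IDs are the ones handled in the switch above and the ioctls are the standard KVM one-reg interface.

/*
 * Sketch of the user-space side of the one-reg accessors above.
 * Error handling is minimal and the vcpu fd is assumed to be open.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

int get_iac1(int vcpu_fd, uint64_t *val)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_IAC1,
		.addr = (uintptr_t)val,		/* kernel copy_to_user()s into this */
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}

int set_iac1(int vcpu_fd, uint64_t val)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_IAC1,
		.addr = (uintptr_t)&val,	/* kernel copy_from_user()s from this */
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}
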
@@ -1253,20 +1459,50 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1253 return -ENOTSUPP; 1459 return -ENOTSUPP;
1254} 1460}
1255 1461
1462void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
1463 struct kvm_memory_slot *dont)
1464{
1465}
1466
1467int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
1468 unsigned long npages)
1469{
1470 return 0;
1471}
1472
1256int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1473int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1474 struct kvm_memory_slot *memslot,
1257 struct kvm_userspace_memory_region *mem) 1475 struct kvm_userspace_memory_region *mem)
1258{ 1476{
1259 return 0; 1477 return 0;
1260} 1478}
1261 1479
1262void kvmppc_core_commit_memory_region(struct kvm *kvm, 1480void kvmppc_core_commit_memory_region(struct kvm *kvm,
1263 struct kvm_userspace_memory_region *mem) 1481 struct kvm_userspace_memory_region *mem,
1482 struct kvm_memory_slot old)
1483{
1484}
1485
1486void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
1487{
1488}
1489
1490void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr)
1264{ 1491{
1492#if defined(CONFIG_64BIT)
1493 vcpu->arch.epcr = new_epcr;
1494#ifdef CONFIG_KVM_BOOKE_HV
1495 vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM;
1496 if (vcpu->arch.epcr & SPRN_EPCR_ICM)
1497 vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM;
1498#endif
1499#endif
1265} 1500}
1266 1501
1267void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) 1502void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
1268{ 1503{
1269 vcpu->arch.tcr = new_tcr; 1504 vcpu->arch.tcr = new_tcr;
1505 arm_next_watchdog(vcpu);
1270 update_timer_ints(vcpu); 1506 update_timer_ints(vcpu);
1271} 1507}
1272 1508
@@ -1281,6 +1517,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
1281void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) 1517void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
1282{ 1518{
1283 clear_bits(tsr_bits, &vcpu->arch.tsr); 1519 clear_bits(tsr_bits, &vcpu->arch.tsr);
1520
1521 /*
1522 * We may have stopped the watchdog because it was stuck
1523 * on its final expiration.
1524 */
1525 if (tsr_bits & (TSR_ENW | TSR_WIS))
1526 arm_next_watchdog(vcpu);
1527
1284 update_timer_ints(vcpu); 1528 update_timer_ints(vcpu);
1285} 1529}
1286 1530
@@ -1298,12 +1542,14 @@ void kvmppc_decrementer_func(unsigned long data)
1298 1542
1299void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1543void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1300{ 1544{
1545 vcpu->cpu = smp_processor_id();
1301 current->thread.kvm_vcpu = vcpu; 1546 current->thread.kvm_vcpu = vcpu;
1302} 1547}
1303 1548
1304void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) 1549void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
1305{ 1550{
1306 current->thread.kvm_vcpu = NULL; 1551 current->thread.kvm_vcpu = NULL;
1552 vcpu->cpu = -1;
1307} 1553}
1308 1554
1309int __init kvmppc_booke_init(void) 1555int __init kvmppc_booke_init(void)
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index ba61974c1e20..e9b88e433f64 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -69,6 +69,7 @@ extern unsigned long kvmppc_booke_handlers;
69void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); 69void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
70void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); 70void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
71 71
72void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr);
72void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); 73void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr);
73void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); 74void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
74void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); 75void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb608ab..4685b8cf2249 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -133,10 +133,10 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
133 vcpu->arch.csrr1 = spr_val; 133 vcpu->arch.csrr1 = spr_val;
134 break; 134 break;
135 case SPRN_DBCR0: 135 case SPRN_DBCR0:
136 vcpu->arch.dbcr0 = spr_val; 136 vcpu->arch.dbg_reg.dbcr0 = spr_val;
137 break; 137 break;
138 case SPRN_DBCR1: 138 case SPRN_DBCR1:
139 vcpu->arch.dbcr1 = spr_val; 139 vcpu->arch.dbg_reg.dbcr1 = spr_val;
140 break; 140 break;
141 case SPRN_DBSR: 141 case SPRN_DBSR:
142 vcpu->arch.dbsr &= ~spr_val; 142 vcpu->arch.dbsr &= ~spr_val;
@@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
145 kvmppc_clr_tsr_bits(vcpu, spr_val); 145 kvmppc_clr_tsr_bits(vcpu, spr_val);
146 break; 146 break;
147 case SPRN_TCR: 147 case SPRN_TCR:
148 /*
149 * WRC is a 2-bit field that is supposed to preserve its
150 * value once it has been written to a non-zero value.
151 */
152 if (vcpu->arch.tcr & TCR_WRC_MASK) {
153 spr_val &= ~TCR_WRC_MASK;
154 spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
155 }
148 kvmppc_set_tcr(vcpu, spr_val); 156 kvmppc_set_tcr(vcpu, spr_val);
149 break; 157 break;
150 158
@@ -229,7 +237,17 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
229 case SPRN_IVOR15: 237 case SPRN_IVOR15:
230 vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; 238 vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;
231 break; 239 break;
232 240 case SPRN_MCSR:
241 vcpu->arch.mcsr &= ~spr_val;
242 break;
243#if defined(CONFIG_64BIT)
244 case SPRN_EPCR:
245 kvmppc_set_epcr(vcpu, spr_val);
246#ifdef CONFIG_KVM_BOOKE_HV
247 mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
248#endif
249 break;
250#endif
233 default: 251 default:
234 emulated = EMULATE_FAIL; 252 emulated = EMULATE_FAIL;
235 } 253 }
@@ -258,10 +276,10 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
258 *spr_val = vcpu->arch.csrr1; 276 *spr_val = vcpu->arch.csrr1;
259 break; 277 break;
260 case SPRN_DBCR0: 278 case SPRN_DBCR0:
261 *spr_val = vcpu->arch.dbcr0; 279 *spr_val = vcpu->arch.dbg_reg.dbcr0;
262 break; 280 break;
263 case SPRN_DBCR1: 281 case SPRN_DBCR1:
264 *spr_val = vcpu->arch.dbcr1; 282 *spr_val = vcpu->arch.dbg_reg.dbcr1;
265 break; 283 break;
266 case SPRN_DBSR: 284 case SPRN_DBSR:
267 *spr_val = vcpu->arch.dbsr; 285 *spr_val = vcpu->arch.dbsr;
@@ -321,6 +339,14 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
321 case SPRN_IVOR15: 339 case SPRN_IVOR15:
322 *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; 340 *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
323 break; 341 break;
342 case SPRN_MCSR:
343 *spr_val = vcpu->arch.mcsr;
344 break;
345#if defined(CONFIG_64BIT)
346 case SPRN_EPCR:
347 *spr_val = vcpu->arch.epcr;
348 break;
349#endif
324 350
325 default: 351 default:
326 emulated = EMULATE_FAIL; 352 emulated = EMULATE_FAIL;
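
The TCR handling above makes WRC behave as a write-once field: once the guest programs it to a non-zero value, later mtspr writes cannot change it. A small hedged sketch of that merge follows (the mask value is illustrative; the kernel code uses TCR_WRC_MASK).

/*
 * Illustrative sketch of the write-once merge applied to TCR[WRC]
 * above.  WRC_MASK here is a stand-in for the kernel's TCR_WRC_MASK.
 */
#include <assert.h>
#include <stdint.h>

#define WRC_MASK 0x30000000u	/* illustrative 2-bit field position */

static uint32_t tcr_write(uint32_t old_tcr, uint32_t new_val)
{
	if (old_tcr & WRC_MASK) {
		/* Preserve the WRC bits that were already programmed. */
		new_val &= ~WRC_MASK;
		new_val |= old_tcr & WRC_MASK;
	}
	return new_val;
}

int main(void)
{
	uint32_t tcr = tcr_write(0, 0x10000000);	/* first non-zero write sticks */

	tcr = tcr_write(tcr, 0x20000000);		/* later write cannot change WRC */
	assert((tcr & WRC_MASK) == 0x10000000);
	return 0;
}
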
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 099fe8272b57..e8ed7d659c55 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -16,6 +16,7 @@
16 * 16 *
17 * Author: Varun Sethi <varun.sethi@freescale.com> 17 * Author: Varun Sethi <varun.sethi@freescale.com>
18 * Author: Scott Wood <scotwood@freescale.com> 18 * Author: Scott Wood <scotwood@freescale.com>
19 * Author: Mihai Caraman <mihai.caraman@freescale.com>
19 * 20 *
20 * This file is derived from arch/powerpc/kvm/booke_interrupts.S 21 * This file is derived from arch/powerpc/kvm/booke_interrupts.S
21 */ 22 */
@@ -30,31 +31,33 @@
30#include <asm/bitsperlong.h> 31#include <asm/bitsperlong.h>
31#include <asm/thread_info.h> 32#include <asm/thread_info.h>
32 33
34#ifdef CONFIG_64BIT
35#include <asm/exception-64e.h>
36#else
33#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ 37#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
34 38#endif
35#define GET_VCPU(vcpu, thread) \
36 PPC_LL vcpu, THREAD_KVM_VCPU(thread)
37 39
38#define LONGBYTES (BITS_PER_LONG / 8) 40#define LONGBYTES (BITS_PER_LONG / 8)
39 41
40#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) 42#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES))
41 43
42/* The host stack layout: */ 44/* The host stack layout: */
43#define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */ 45#define HOST_R1 0 /* Implied by stwu. */
44#define HOST_CALLEE_LR (1 * LONGBYTES) 46#define HOST_CALLEE_LR PPC_LR_STKOFF
45#define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */ 47#define HOST_RUN (HOST_CALLEE_LR + LONGBYTES)
46/* 48/*
47 * r2 is special: it holds 'current', and it is made nonvolatile in the 49 * r2 is special: it holds 'current', and it is made nonvolatile in the
48 * kernel with the -ffixed-r2 gcc option. 50 * kernel with the -ffixed-r2 gcc option.
49 */ 51 */
50#define HOST_R2 (3 * LONGBYTES) 52#define HOST_R2 (HOST_RUN + LONGBYTES)
51#define HOST_CR (4 * LONGBYTES) 53#define HOST_CR (HOST_R2 + LONGBYTES)
52#define HOST_NV_GPRS (5 * LONGBYTES) 54#define HOST_NV_GPRS (HOST_CR + LONGBYTES)
53#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) 55#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
54#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) 56#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n)
55#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES) 57#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES)
56#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ 58#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
57#define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */ 59/* LR in caller stack frame. */
60#define HOST_STACK_LR (HOST_STACK_SIZE + PPC_LR_STKOFF)
58 61
59#define NEED_EMU 0x00000001 /* emulation -- save nv regs */ 62#define NEED_EMU 0x00000001 /* emulation -- save nv regs */
60#define NEED_DEAR 0x00000002 /* save faulting DEAR */ 63#define NEED_DEAR 0x00000002 /* save faulting DEAR */
@@ -201,12 +204,128 @@
201 b kvmppc_resume_host 204 b kvmppc_resume_host
202.endm 205.endm
203 206
207#ifdef CONFIG_64BIT
208/* Exception types */
209#define EX_GEN 1
210#define EX_GDBELL 2
211#define EX_DBG 3
212#define EX_MC 4
213#define EX_CRIT 5
214#define EX_TLB 6
215
216/*
217 * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
218 */
219.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags
220 _GLOBAL(kvmppc_handler_\intno\()_\srr1)
221 mr r11, r4
222 /*
223 * Get vcpu from Paca: paca->__current.thread->kvm_vcpu
224 */
225 PPC_LL r4, PACACURRENT(r13)
226 PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4)
227 stw r10, VCPU_CR(r4)
228 PPC_STL r11, VCPU_GPR(R4)(r4)
229 PPC_STL r5, VCPU_GPR(R5)(r4)
230 .if \type == EX_CRIT
231 PPC_LL r5, (\paca_ex + EX_R13)(r13)
232 .else
233 mfspr r5, \scratch
234 .endif
235 PPC_STL r6, VCPU_GPR(R6)(r4)
236 PPC_STL r8, VCPU_GPR(R8)(r4)
237 PPC_STL r9, VCPU_GPR(R9)(r4)
238 PPC_STL r5, VCPU_GPR(R13)(r4)
239 PPC_LL r6, (\paca_ex + \ex_r10)(r13)
240 PPC_LL r8, (\paca_ex + \ex_r11)(r13)
241 PPC_STL r3, VCPU_GPR(R3)(r4)
242 PPC_STL r7, VCPU_GPR(R7)(r4)
243 PPC_STL r12, VCPU_GPR(R12)(r4)
244 PPC_STL r6, VCPU_GPR(R10)(r4)
245 PPC_STL r8, VCPU_GPR(R11)(r4)
246 mfctr r5
247 PPC_STL r5, VCPU_CTR(r4)
248 mfspr r5, \srr0
249 mfspr r6, \srr1
250 kvm_handler_common \intno, \srr0, \flags
251.endm
252
253#define EX_PARAMS(type) \
254 EX_##type, \
255 SPRN_SPRG_##type##_SCRATCH, \
256 PACA_EX##type, \
257 EX_R10, \
258 EX_R11
259
260#define EX_PARAMS_TLB \
261 EX_TLB, \
262 SPRN_SPRG_GEN_SCRATCH, \
263 PACA_EXTLB, \
264 EX_TLB_R10, \
265 EX_TLB_R11
266
267kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \
268 SPRN_CSRR0, SPRN_CSRR1, 0
269kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \
270 SPRN_MCSRR0, SPRN_MCSRR1, 0
271kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \
272 SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR)
273kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \
274 SPRN_SRR0, SPRN_SRR1, NEED_ESR
275kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \
276 SPRN_SRR0, SPRN_SRR1, 0
277kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \
278 SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR)
279kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \
280 SPRN_SRR0, SPRN_SRR1,NEED_ESR
281kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \
282 SPRN_SRR0, SPRN_SRR1, 0
283kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \
284 SPRN_SRR0, SPRN_SRR1, 0
285kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \
286 SPRN_SRR0, SPRN_SRR1, 0
287kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \
288 SPRN_SRR0, SPRN_SRR1, 0
289kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\
290 SPRN_CSRR0, SPRN_CSRR1, 0
291/*
292 * Only bolted TLB miss exception handlers are supported for now
293 */
294kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \
295 SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
296kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \
297 SPRN_SRR0, SPRN_SRR1, 0
298kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \
299 SPRN_SRR0, SPRN_SRR1, 0
300kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \
301 SPRN_SRR0, SPRN_SRR1, 0
302kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \
303 SPRN_SRR0, SPRN_SRR1, 0
304kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \
305 SPRN_SRR0, SPRN_SRR1, 0
306kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \
307 SPRN_SRR0, SPRN_SRR1, 0
308kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \
309 SPRN_CSRR0, SPRN_CSRR1, 0
310kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \
311 SPRN_SRR0, SPRN_SRR1, NEED_EMU
312kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \
313 SPRN_SRR0, SPRN_SRR1, 0
314kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \
315 SPRN_GSRR0, SPRN_GSRR1, 0
316kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \
317 SPRN_CSRR0, SPRN_CSRR1, 0
318kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
319 SPRN_DSRR0, SPRN_DSRR1, 0
320kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
321 SPRN_CSRR0, SPRN_CSRR1, 0
322#else
204/* 323/*
205 * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h 324 * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
206 */ 325 */
207.macro kvm_handler intno srr0, srr1, flags 326.macro kvm_handler intno srr0, srr1, flags
208_GLOBAL(kvmppc_handler_\intno\()_\srr1) 327_GLOBAL(kvmppc_handler_\intno\()_\srr1)
209 GET_VCPU(r11, r10) 328 PPC_LL r11, THREAD_KVM_VCPU(r10)
210 PPC_STL r3, VCPU_GPR(R3)(r11) 329 PPC_STL r3, VCPU_GPR(R3)(r11)
211 mfspr r3, SPRN_SPRG_RSCRATCH0 330 mfspr r3, SPRN_SPRG_RSCRATCH0
212 PPC_STL r4, VCPU_GPR(R4)(r11) 331 PPC_STL r4, VCPU_GPR(R4)(r11)
@@ -233,7 +352,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
233.macro kvm_lvl_handler intno scratch srr0, srr1, flags 352.macro kvm_lvl_handler intno scratch srr0, srr1, flags
234_GLOBAL(kvmppc_handler_\intno\()_\srr1) 353_GLOBAL(kvmppc_handler_\intno\()_\srr1)
235 mfspr r10, SPRN_SPRG_THREAD 354 mfspr r10, SPRN_SPRG_THREAD
236 GET_VCPU(r11, r10) 355 PPC_LL r11, THREAD_KVM_VCPU(r10)
237 PPC_STL r3, VCPU_GPR(R3)(r11) 356 PPC_STL r3, VCPU_GPR(R3)(r11)
238 mfspr r3, \scratch 357 mfspr r3, \scratch
239 PPC_STL r4, VCPU_GPR(R4)(r11) 358 PPC_STL r4, VCPU_GPR(R4)(r11)
@@ -295,7 +414,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
295 SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 414 SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
296kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ 415kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
297 SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 416 SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
298 417#endif
299 418
300/* Registers: 419/* Registers:
301 * SPRG_SCRATCH0: guest r10 420 * SPRG_SCRATCH0: guest r10
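
The EX_PARAMS()/EX_PARAMS_TLB wrappers above only token-paste the per-type scratch SPR and PACA save-area names into kvm_handler's argument list. A hedged C illustration of the pasting, printing the names instead of assembling them:

/*
 * Illustration only: prints the argument list EX_PARAMS(type) hands to
 * the kvm_handler assembler macro above.  Stringizing stands in for the
 * real token pasting done by the preprocessor.
 */
#include <stdio.h>

#define EX_PARAMS(type) \
	"EX_" #type ", SPRN_SPRG_" #type "_SCRATCH, PACA_EX" #type ", EX_R10, EX_R11"

int main(void)
{
	puts(EX_PARAMS(GEN));	/* EX_GEN, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN, EX_R10, EX_R11 */
	puts(EX_PARAMS(CRIT));	/* EX_CRIT, SPRN_SPRG_CRIT_SCRATCH, PACA_EXCRIT, ... */
	puts(EX_PARAMS(DBG));
	return 0;
}
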
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index aa8b81428bf4..c70d37ed770a 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -27,8 +27,7 @@
27#define E500_TLB_NUM 2 27#define E500_TLB_NUM 2
28 28
29#define E500_TLB_VALID 1 29#define E500_TLB_VALID 1
30#define E500_TLB_DIRTY 2 30#define E500_TLB_BITMAP 2
31#define E500_TLB_BITMAP 4
32 31
33struct tlbe_ref { 32struct tlbe_ref {
34 pfn_t pfn; 33 pfn_t pfn;
@@ -130,9 +129,9 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
130 ulong value); 129 ulong value);
131int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); 130int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
132int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); 131int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
133int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb); 132int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea);
134int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb); 133int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea);
135int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb); 134int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea);
136int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); 135int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
137void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); 136void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
138 137
@@ -155,7 +154,7 @@ get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
155 154
156static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) 155static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
157{ 156{
158 return tlbe->mas2 & 0xfffff000; 157 return tlbe->mas2 & MAS2_EPN;
159} 158}
160 159
161static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) 160static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index e04b0ef55ce0..e78f353a836a 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -89,6 +89,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
89 int ra = get_ra(inst); 89 int ra = get_ra(inst);
90 int rb = get_rb(inst); 90 int rb = get_rb(inst);
91 int rt = get_rt(inst); 91 int rt = get_rt(inst);
92 gva_t ea;
92 93
93 switch (get_op(inst)) { 94 switch (get_op(inst)) {
94 case 31: 95 case 31:
@@ -113,15 +114,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
113 break; 114 break;
114 115
115 case XOP_TLBSX: 116 case XOP_TLBSX:
116 emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); 117 ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
118 emulated = kvmppc_e500_emul_tlbsx(vcpu, ea);
117 break; 119 break;
118 120
119 case XOP_TLBILX: 121 case XOP_TLBILX: {
120 emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb); 122 int type = rt & 0x3;
123 ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
124 emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea);
121 break; 125 break;
126 }
122 127
123 case XOP_TLBIVAX: 128 case XOP_TLBIVAX:
124 emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); 129 ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
130 emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
125 break; 131 break;
126 132
127 default: 133 default:
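
The tlbsx/tlbilx/tlbivax paths above now pass a precomputed effective address instead of raw register numbers. Judging by the open-coded form removed from e500_tlb.c below, kvmppc_get_ea_indexed() applies the usual indexed-form rule EA = (rA ? GPR[rA] : 0) + GPR[rB]; a hedged stand-alone sketch of that rule:

/*
 * Sketch of the indexed effective-address rule, based on the open-coded
 * computation removed from e500_tlb.c.  The toy_ types are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_vcpu {
	uint64_t gpr[32];
};

static uint64_t toy_get_ea_indexed(struct toy_vcpu *vcpu, int ra, int rb)
{
	uint64_t base = ra ? vcpu->gpr[ra] : 0;	/* rA = 0 means a literal zero */

	return base + vcpu->gpr[rb];
}

int main(void)
{
	struct toy_vcpu v = { .gpr = { [3] = 0x1000, [4] = 0x24 } };

	printf("0x%llx\n", (unsigned long long)toy_get_ea_indexed(&v, 3, 4));	/* 0x1024 */
	printf("0x%llx\n", (unsigned long long)toy_get_ea_indexed(&v, 0, 4));	/* 0x24 */
	return 0;
}
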
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index ff38b664195d..cf3f18012371 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -304,17 +304,13 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
304 ref->flags = E500_TLB_VALID; 304 ref->flags = E500_TLB_VALID;
305 305
306 if (tlbe_is_writable(gtlbe)) 306 if (tlbe_is_writable(gtlbe))
307 ref->flags |= E500_TLB_DIRTY; 307 kvm_set_pfn_dirty(pfn);
308} 308}
309 309
310static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) 310static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
311{ 311{
312 if (ref->flags & E500_TLB_VALID) { 312 if (ref->flags & E500_TLB_VALID) {
313 if (ref->flags & E500_TLB_DIRTY) 313 trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
314 kvm_release_pfn_dirty(ref->pfn);
315 else
316 kvm_release_pfn_clean(ref->pfn);
317
318 ref->flags = 0; 314 ref->flags = 0;
319 } 315 }
320} 316}
@@ -357,6 +353,13 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
357 clear_tlb_privs(vcpu_e500); 353 clear_tlb_privs(vcpu_e500);
358} 354}
359 355
356void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
357{
358 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
359 clear_tlb_refs(vcpu_e500);
360 clear_tlb1_bitmap(vcpu_e500);
361}
362
360static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, 363static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
361 unsigned int eaddr, int as) 364 unsigned int eaddr, int as)
362{ 365{
@@ -412,7 +415,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
412 struct tlbe_ref *ref) 415 struct tlbe_ref *ref)
413{ 416{
414 struct kvm_memory_slot *slot; 417 struct kvm_memory_slot *slot;
415 unsigned long pfn, hva; 418 unsigned long pfn = 0; /* silence GCC warning */
419 unsigned long hva;
416 int pfnmap = 0; 420 int pfnmap = 0;
417 int tsize = BOOK3E_PAGESZ_4K; 421 int tsize = BOOK3E_PAGESZ_4K;
418 422
@@ -521,7 +525,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
521 if (likely(!pfnmap)) { 525 if (likely(!pfnmap)) {
522 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); 526 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
523 pfn = gfn_to_pfn_memslot(slot, gfn); 527 pfn = gfn_to_pfn_memslot(slot, gfn);
524 if (is_error_pfn(pfn)) { 528 if (is_error_noslot_pfn(pfn)) {
525 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", 529 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
526 (long)gfn); 530 (long)gfn);
527 return; 531 return;
@@ -541,6 +545,9 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
541 545
542 /* Clear i-cache for new pages */ 546 /* Clear i-cache for new pages */
543 kvmppc_mmu_flush_icache(pfn); 547 kvmppc_mmu_flush_icache(pfn);
548
549 /* Drop refcount on page, so that mmu notifiers can clear it */
550 kvm_release_pfn_clean(pfn);
544} 551}
545 552
546/* XXX only map the one-one case, for now use TLB0 */ 553/* XXX only map the one-one case, for now use TLB0 */
@@ -682,14 +689,11 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
682 return EMULATE_DONE; 689 return EMULATE_DONE;
683} 690}
684 691
685int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) 692int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
686{ 693{
687 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 694 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
688 unsigned int ia; 695 unsigned int ia;
689 int esel, tlbsel; 696 int esel, tlbsel;
690 gva_t ea;
691
692 ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb);
693 697
694 ia = (ea >> 2) & 0x1; 698 ia = (ea >> 2) & 0x1;
695 699
@@ -716,7 +720,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
716} 720}
717 721
718static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, 722static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
719 int pid, int rt) 723 int pid, int type)
720{ 724{
721 struct kvm_book3e_206_tlb_entry *tlbe; 725 struct kvm_book3e_206_tlb_entry *tlbe;
722 int tid, esel; 726 int tid, esel;
@@ -725,7 +729,7 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
725 for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { 729 for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) {
726 tlbe = get_entry(vcpu_e500, tlbsel, esel); 730 tlbe = get_entry(vcpu_e500, tlbsel, esel);
727 tid = get_tlb_tid(tlbe); 731 tid = get_tlb_tid(tlbe);
728 if (rt == 0 || tid == pid) { 732 if (type == 0 || tid == pid) {
729 inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); 733 inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
730 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); 734 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
731 } 735 }
@@ -733,14 +737,9 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
733} 737}
734 738
735static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, 739static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
736 int ra, int rb) 740 gva_t ea)
737{ 741{
738 int tlbsel, esel; 742 int tlbsel, esel;
739 gva_t ea;
740
741 ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb);
742 if (ra)
743 ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra);
744 743
745 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 744 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
746 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); 745 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1);
@@ -752,16 +751,16 @@ static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
752 } 751 }
753} 752}
754 753
755int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb) 754int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea)
756{ 755{
757 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 756 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
758 int pid = get_cur_spid(vcpu); 757 int pid = get_cur_spid(vcpu);
759 758
760 if (rt == 0 || rt == 1) { 759 if (type == 0 || type == 1) {
761 tlbilx_all(vcpu_e500, 0, pid, rt); 760 tlbilx_all(vcpu_e500, 0, pid, type);
762 tlbilx_all(vcpu_e500, 1, pid, rt); 761 tlbilx_all(vcpu_e500, 1, pid, type);
763 } else if (rt == 3) { 762 } else if (type == 3) {
764 tlbilx_one(vcpu_e500, pid, ra, rb); 763 tlbilx_one(vcpu_e500, pid, ea);
765 } 764 }
766 765
767 return EMULATE_DONE; 766 return EMULATE_DONE;
@@ -786,16 +785,13 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
786 return EMULATE_DONE; 785 return EMULATE_DONE;
787} 786}
788 787
789int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) 788int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
790{ 789{
791 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 790 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
792 int as = !!get_cur_sas(vcpu); 791 int as = !!get_cur_sas(vcpu);
793 unsigned int pid = get_cur_spid(vcpu); 792 unsigned int pid = get_cur_spid(vcpu);
794 int esel, tlbsel; 793 int esel, tlbsel;
795 struct kvm_book3e_206_tlb_entry *gtlbe = NULL; 794 struct kvm_book3e_206_tlb_entry *gtlbe = NULL;
796 gva_t ea;
797
798 ea = kvmppc_get_gpr(vcpu, rb);
799 795
800 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 796 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
801 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); 797 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
@@ -875,6 +871,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
875 871
876 gtlbe->mas1 = vcpu->arch.shared->mas1; 872 gtlbe->mas1 = vcpu->arch.shared->mas1;
877 gtlbe->mas2 = vcpu->arch.shared->mas2; 873 gtlbe->mas2 = vcpu->arch.shared->mas2;
874 if (!(vcpu->arch.shared->msr & MSR_CM))
875 gtlbe->mas2 &= 0xffffffffUL;
878 gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; 876 gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
879 877
880 trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, 878 trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
@@ -1039,8 +1037,12 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
1039 sesel = 0; /* unused */ 1037 sesel = 0; /* unused */
1040 priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; 1038 priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
1041 1039
1042 kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, 1040 /* Only triggers after clear_tlb_refs */
1043 &priv->ref, eaddr, &stlbe); 1041 if (unlikely(!(priv->ref.flags & E500_TLB_VALID)))
1042 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
1043 else
1044 kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
1045 &priv->ref, eaddr, &stlbe);
1044 break; 1046 break;
1045 1047
1046 case 1: { 1048 case 1: {
@@ -1060,6 +1062,49 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
1060 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); 1062 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
1061} 1063}
1062 1064
1065/************* MMU Notifiers *************/
1066
1067int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1068{
1069 trace_kvm_unmap_hva(hva);
1070
1071 /*
1072 * Flush all shadow tlb entries everywhere. This is slow, but
1073 * we are 100% sure that we catch the to be unmapped page
1074 */
1075 kvm_flush_remote_tlbs(kvm);
1076
1077 return 0;
1078}
1079
1080int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1081{
1082 /* kvm_unmap_hva flushes everything anyways */
1083 kvm_unmap_hva(kvm, start);
1084
1085 return 0;
1086}
1087
1088int kvm_age_hva(struct kvm *kvm, unsigned long hva)
1089{
1090 /* XXX could be more clever ;) */
1091 return 0;
1092}
1093
1094int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1095{
1096 /* XXX could be more clever ;) */
1097 return 0;
1098}
1099
1100void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1101{
1102 /* The page will get remapped properly on its next fault */
1103 kvm_unmap_hva(kvm, hva);
1104}
1105
1106/*****************************************/
1107
1063static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) 1108static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
1064{ 1109{
1065 int i; 1110 int i;
@@ -1081,6 +1126,8 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
1081 } 1126 }
1082 1127
1083 vcpu_e500->num_shared_tlb_pages = 0; 1128 vcpu_e500->num_shared_tlb_pages = 0;
1129
1130 kfree(vcpu_e500->shared_tlb_pages);
1084 vcpu_e500->shared_tlb_pages = NULL; 1131 vcpu_e500->shared_tlb_pages = NULL;
1085 } else { 1132 } else {
1086 kfree(vcpu_e500->gtlb_arch); 1133 kfree(vcpu_e500->gtlb_arch);
@@ -1178,21 +1225,27 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
1178 } 1225 }
1179 1226
1180 virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); 1227 virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
1181 if (!virt) 1228 if (!virt) {
1229 ret = -ENOMEM;
1182 goto err_put_page; 1230 goto err_put_page;
1231 }
1183 1232
1184 privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], 1233 privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
1185 GFP_KERNEL); 1234 GFP_KERNEL);
1186 privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], 1235 privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
1187 GFP_KERNEL); 1236 GFP_KERNEL);
1188 1237
1189 if (!privs[0] || !privs[1]) 1238 if (!privs[0] || !privs[1]) {
1190 goto err_put_page; 1239 ret = -ENOMEM;
1240 goto err_privs;
1241 }
1191 1242
1192 g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], 1243 g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
1193 GFP_KERNEL); 1244 GFP_KERNEL);
1194 if (!g2h_bitmap) 1245 if (!g2h_bitmap) {
1195 goto err_put_page; 1246 ret = -ENOMEM;
1247 goto err_privs;
1248 }
1196 1249
1197 free_gtlb(vcpu_e500); 1250 free_gtlb(vcpu_e500);
1198 1251
@@ -1232,10 +1285,11 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
1232 kvmppc_recalc_tlb1map_range(vcpu_e500); 1285 kvmppc_recalc_tlb1map_range(vcpu_e500);
1233 return 0; 1286 return 0;
1234 1287
1235err_put_page: 1288err_privs:
1236 kfree(privs[0]); 1289 kfree(privs[0]);
1237 kfree(privs[1]); 1290 kfree(privs[1]);
1238 1291
1292err_put_page:
1239 for (i = 0; i < num_pages; i++) 1293 for (i = 0; i < num_pages; i++)
1240 put_page(pages[i]); 1294 put_page(pages[i]);
1241 1295
@@ -1332,7 +1386,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1332 if (!vcpu_e500->gtlb_priv[1]) 1386 if (!vcpu_e500->gtlb_priv[1])
1333 goto err; 1387 goto err;
1334 1388
1335 vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) * 1389 vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
1336 vcpu_e500->gtlb_params[1].entries, 1390 vcpu_e500->gtlb_params[1].entries,
1337 GFP_KERNEL); 1391 GFP_KERNEL);
1338 if (!vcpu_e500->g2h_tlb1_map) 1392 if (!vcpu_e500->g2h_tlb1_map)
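
The error-path rework in kvm_vcpu_ioctl_config_tlb() above (the explicit -ENOMEM returns and the new err_privs label) is the usual staged-cleanup idiom: each failure jumps to a label that undoes only what has been set up so far. A generic hedged sketch of the pattern:

/*
 * Generic sketch of staged goto cleanup; the two buffers and the late
 * failure are placeholders, not the kvm allocations themselves.
 */
#include <errno.h>
#include <stdlib.h>

static int configure(int fail_late)
{
	void *pages, *privs;
	int ret;

	pages = malloc(64);
	if (!pages)
		return -ENOMEM;

	privs = malloc(64);
	if (!privs) {
		ret = -ENOMEM;
		goto err_pages;		/* only 'pages' exists at this point */
	}

	if (fail_late) {
		ret = -EINVAL;
		goto err_privs;		/* both allocations must be undone */
	}

	/* Success: the real code keeps the buffers; the toy frees them. */
	free(privs);
	free(pages);
	return 0;

err_privs:
	free(privs);
err_pages:
	free(pages);
	return ret;
}

int main(void)
{
	return configure(1) == -EINVAL ? 0 : 1;
}
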
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index ee04abaefe23..b0855e5d8905 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -131,6 +131,125 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
131 return vcpu->arch.dec - jd; 131 return vcpu->arch.dec - jd;
132} 132}
133 133
134static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
135{
136 enum emulation_result emulated = EMULATE_DONE;
137 ulong spr_val = kvmppc_get_gpr(vcpu, rs);
138
139 switch (sprn) {
140 case SPRN_SRR0:
141 vcpu->arch.shared->srr0 = spr_val;
142 break;
143 case SPRN_SRR1:
144 vcpu->arch.shared->srr1 = spr_val;
145 break;
146
147 /* XXX We need to context-switch the timebase for
148 * watchdog and FIT. */
149 case SPRN_TBWL: break;
150 case SPRN_TBWU: break;
151
152 case SPRN_MSSSR0: break;
153
154 case SPRN_DEC:
155 vcpu->arch.dec = spr_val;
156 kvmppc_emulate_dec(vcpu);
157 break;
158
159 case SPRN_SPRG0:
160 vcpu->arch.shared->sprg0 = spr_val;
161 break;
162 case SPRN_SPRG1:
163 vcpu->arch.shared->sprg1 = spr_val;
164 break;
165 case SPRN_SPRG2:
166 vcpu->arch.shared->sprg2 = spr_val;
167 break;
168 case SPRN_SPRG3:
169 vcpu->arch.shared->sprg3 = spr_val;
170 break;
171
172 default:
173 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
174 spr_val);
175 if (emulated == EMULATE_FAIL)
176 printk(KERN_INFO "mtspr: unknown spr "
177 "0x%x\n", sprn);
178 break;
179 }
180
181 kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
182
183 return emulated;
184}
185
186static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
187{
188 enum emulation_result emulated = EMULATE_DONE;
189 ulong spr_val = 0;
190
191 switch (sprn) {
192 case SPRN_SRR0:
193 spr_val = vcpu->arch.shared->srr0;
194 break;
195 case SPRN_SRR1:
196 spr_val = vcpu->arch.shared->srr1;
197 break;
198 case SPRN_PVR:
199 spr_val = vcpu->arch.pvr;
200 break;
201 case SPRN_PIR:
202 spr_val = vcpu->vcpu_id;
203 break;
204 case SPRN_MSSSR0:
205 spr_val = 0;
206 break;
207
208 /* Note: mftb and TBRL/TBWL are user-accessible, so
209 * the guest can always access the real TB anyways.
210 * In fact, we probably will never see these traps. */
211 case SPRN_TBWL:
212 spr_val = get_tb() >> 32;
213 break;
214 case SPRN_TBWU:
215 spr_val = get_tb();
216 break;
217
218 case SPRN_SPRG0:
219 spr_val = vcpu->arch.shared->sprg0;
220 break;
221 case SPRN_SPRG1:
222 spr_val = vcpu->arch.shared->sprg1;
223 break;
224 case SPRN_SPRG2:
225 spr_val = vcpu->arch.shared->sprg2;
226 break;
227 case SPRN_SPRG3:
228 spr_val = vcpu->arch.shared->sprg3;
229 break;
230 /* Note: SPRG4-7 are user-readable, so we don't get
231 * a trap. */
232
233 case SPRN_DEC:
234 spr_val = kvmppc_get_dec(vcpu, get_tb());
235 break;
236 default:
237 emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
238 &spr_val);
239 if (unlikely(emulated == EMULATE_FAIL)) {
240 printk(KERN_INFO "mfspr: unknown spr "
241 "0x%x\n", sprn);
242 }
243 break;
244 }
245
246 if (emulated == EMULATE_DONE)
247 kvmppc_set_gpr(vcpu, rt, spr_val);
248 kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
249
250 return emulated;
251}
252
134/* XXX to do: 253/* XXX to do:
135 * lhax 254 * lhax
136 * lhaux 255 * lhaux
@@ -156,7 +275,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
156 int sprn = get_sprn(inst); 275 int sprn = get_sprn(inst);
157 enum emulation_result emulated = EMULATE_DONE; 276 enum emulation_result emulated = EMULATE_DONE;
158 int advance = 1; 277 int advance = 1;
159 ulong spr_val = 0;
160 278
161 /* this default type might be overwritten by subcategories */ 279 /* this default type might be overwritten by subcategories */
162 kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); 280 kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
@@ -236,62 +354,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
236 break; 354 break;
237 355
238 case OP_31_XOP_MFSPR: 356 case OP_31_XOP_MFSPR:
239 switch (sprn) { 357 emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);
240 case SPRN_SRR0:
241 spr_val = vcpu->arch.shared->srr0;
242 break;
243 case SPRN_SRR1:
244 spr_val = vcpu->arch.shared->srr1;
245 break;
246 case SPRN_PVR:
247 spr_val = vcpu->arch.pvr;
248 break;
249 case SPRN_PIR:
250 spr_val = vcpu->vcpu_id;
251 break;
252 case SPRN_MSSSR0:
253 spr_val = 0;
254 break;
255
256 /* Note: mftb and TBRL/TBWL are user-accessible, so
257 * the guest can always access the real TB anyways.
258 * In fact, we probably will never see these traps. */
259 case SPRN_TBWL:
260 spr_val = get_tb() >> 32;
261 break;
262 case SPRN_TBWU:
263 spr_val = get_tb();
264 break;
265
266 case SPRN_SPRG0:
267 spr_val = vcpu->arch.shared->sprg0;
268 break;
269 case SPRN_SPRG1:
270 spr_val = vcpu->arch.shared->sprg1;
271 break;
272 case SPRN_SPRG2:
273 spr_val = vcpu->arch.shared->sprg2;
274 break;
275 case SPRN_SPRG3:
276 spr_val = vcpu->arch.shared->sprg3;
277 break;
278 /* Note: SPRG4-7 are user-readable, so we don't get
279 * a trap. */
280
281 case SPRN_DEC:
282 spr_val = kvmppc_get_dec(vcpu, get_tb());
283 break;
284 default:
285 emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
286 &spr_val);
287 if (unlikely(emulated == EMULATE_FAIL)) {
288 printk(KERN_INFO "mfspr: unknown spr "
289 "0x%x\n", sprn);
290 }
291 break;
292 }
293 kvmppc_set_gpr(vcpu, rt, spr_val);
294 kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
295 break; 358 break;
296 359
297 case OP_31_XOP_STHX: 360 case OP_31_XOP_STHX:
@@ -308,49 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
308 break; 371 break;
309 372
310 case OP_31_XOP_MTSPR: 373 case OP_31_XOP_MTSPR:
311 spr_val = kvmppc_get_gpr(vcpu, rs); 374 emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
312 switch (sprn) {
313 case SPRN_SRR0:
314 vcpu->arch.shared->srr0 = spr_val;
315 break;
316 case SPRN_SRR1:
317 vcpu->arch.shared->srr1 = spr_val;
318 break;
319
320 /* XXX We need to context-switch the timebase for
321 * watchdog and FIT. */
322 case SPRN_TBWL: break;
323 case SPRN_TBWU: break;
324
325 case SPRN_MSSSR0: break;
326
327 case SPRN_DEC:
328 vcpu->arch.dec = spr_val;
329 kvmppc_emulate_dec(vcpu);
330 break;
331
332 case SPRN_SPRG0:
333 vcpu->arch.shared->sprg0 = spr_val;
334 break;
335 case SPRN_SPRG1:
336 vcpu->arch.shared->sprg1 = spr_val;
337 break;
338 case SPRN_SPRG2:
339 vcpu->arch.shared->sprg2 = spr_val;
340 break;
341 case SPRN_SPRG3:
342 vcpu->arch.shared->sprg3 = spr_val;
343 break;
344
345 default:
346 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
347 spr_val);
348 if (emulated == EMULATE_FAIL)
349 printk(KERN_INFO "mtspr: unknown spr "
350 "0x%x\n", sprn);
351 break;
352 }
353 kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
354 break; 375 break;
355 376
356 case OP_31_XOP_DCBI: 377 case OP_31_XOP_DCBI:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4d213b8b0fb5..70739a089560 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -30,6 +30,7 @@
30#include <asm/kvm_ppc.h> 30#include <asm/kvm_ppc.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/cputhreads.h> 32#include <asm/cputhreads.h>
33#include <asm/irqflags.h>
33#include "timing.h" 34#include "timing.h"
34#include "../mm/mmu_decl.h" 35#include "../mm/mmu_decl.h"
35 36
@@ -38,8 +39,7 @@
38 39
39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 40int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
40{ 41{
41 return !(v->arch.shared->msr & MSR_WE) || 42 return !!(v->arch.pending_exceptions) ||
42 !!(v->arch.pending_exceptions) ||
43 v->requests; 43 v->requests;
44} 44}
45 45
@@ -48,6 +48,85 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
48 return 1; 48 return 1;
49} 49}
50 50
51#ifndef CONFIG_KVM_BOOK3S_64_HV
52/*
53 * Common checks before entering the guest world. Call with interrupts
54 * disabled.
55 *
56 * returns:
57 *
58 * == 1 if we're ready to go into guest state
59 * <= 0 if we need to go back to the host with that return value
60 */
61int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
62{
63 int r = 1;
64
65 WARN_ON_ONCE(!irqs_disabled());
66 while (true) {
67 if (need_resched()) {
68 local_irq_enable();
69 cond_resched();
70 local_irq_disable();
71 continue;
72 }
73
74 if (signal_pending(current)) {
75 kvmppc_account_exit(vcpu, SIGNAL_EXITS);
76 vcpu->run->exit_reason = KVM_EXIT_INTR;
77 r = -EINTR;
78 break;
79 }
80
81 vcpu->mode = IN_GUEST_MODE;
82
83 /*
84 * Reading vcpu->requests must happen after setting vcpu->mode,
85 * so we don't miss a request because the requester sees
86 * OUTSIDE_GUEST_MODE and assumes we'll be checking requests
87 * before next entering the guest (and thus doesn't IPI).
88 */
89 smp_mb();
90
91 if (vcpu->requests) {
92 /* Make sure we process requests with preemption enabled */
93 local_irq_enable();
94 trace_kvm_check_requests(vcpu);
95 r = kvmppc_core_check_requests(vcpu);
96 local_irq_disable();
97 if (r > 0)
98 continue;
99 break;
100 }
101
102 if (kvmppc_core_prepare_to_enter(vcpu)) {
103 /* interrupts got enabled in between, so we
104 are back at square 1 */
105 continue;
106 }
107
108#ifdef CONFIG_PPC64
109 /* lazy EE magic */
110 hard_irq_disable();
111 if (lazy_irq_pending()) {
112 /* Got an interrupt in between, try again */
113 local_irq_enable();
114 local_irq_disable();
115 kvm_guest_exit();
116 continue;
117 }
118
119 trace_hardirqs_on();
120#endif
121
122 kvm_guest_enter();
123 break;
124 }
125
126 return r;
127}
128#endif /* CONFIG_KVM_BOOK3S_64_HV */
129
51int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) 130int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
52{ 131{
53 int nr = kvmppc_get_gpr(vcpu, 11); 132 int nr = kvmppc_get_gpr(vcpu, 11);
@@ -67,18 +146,18 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
67 } 146 }
68 147
69 switch (nr) { 148 switch (nr) {
70 case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE: 149 case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):
71 { 150 {
72 vcpu->arch.magic_page_pa = param1; 151 vcpu->arch.magic_page_pa = param1;
73 vcpu->arch.magic_page_ea = param2; 152 vcpu->arch.magic_page_ea = param2;
74 153
75 r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; 154 r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
76 155
77 r = HC_EV_SUCCESS; 156 r = EV_SUCCESS;
78 break; 157 break;
79 } 158 }
80 case HC_VENDOR_KVM | KVM_HC_FEATURES: 159 case KVM_HCALL_TOKEN(KVM_HC_FEATURES):
81 r = HC_EV_SUCCESS; 160 r = EV_SUCCESS;
82#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) 161#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2)
83 /* XXX Missing magic page on 44x */ 162 /* XXX Missing magic page on 44x */
84 r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); 163 r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
@@ -86,8 +165,13 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
86 165
87 /* Second return value is in r4 */ 166 /* Second return value is in r4 */
88 break; 167 break;
168 case EV_HCALL_TOKEN(EV_IDLE):
169 r = EV_SUCCESS;
170 kvm_vcpu_block(vcpu);
171 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
172 break;
89 default: 173 default:
90 r = HC_EV_UNIMPLEMENTED; 174 r = EV_UNIMPLEMENTED;
91 break; 175 break;
92 } 176 }
93 177
@@ -220,6 +304,7 @@ int kvm_dev_ioctl_check_extension(long ext)
220 switch (ext) { 304 switch (ext) {
221#ifdef CONFIG_BOOKE 305#ifdef CONFIG_BOOKE
222 case KVM_CAP_PPC_BOOKE_SREGS: 306 case KVM_CAP_PPC_BOOKE_SREGS:
307 case KVM_CAP_PPC_BOOKE_WATCHDOG:
223#else 308#else
224 case KVM_CAP_PPC_SEGSTATE: 309 case KVM_CAP_PPC_SEGSTATE:
225 case KVM_CAP_PPC_HIOR: 310 case KVM_CAP_PPC_HIOR:
@@ -229,6 +314,7 @@ int kvm_dev_ioctl_check_extension(long ext)
229 case KVM_CAP_PPC_IRQ_LEVEL: 314 case KVM_CAP_PPC_IRQ_LEVEL:
230 case KVM_CAP_ENABLE_CAP: 315 case KVM_CAP_ENABLE_CAP:
231 case KVM_CAP_ONE_REG: 316 case KVM_CAP_ONE_REG:
317 case KVM_CAP_IOEVENTFD:
232 r = 1; 318 r = 1;
233 break; 319 break;
234#ifndef CONFIG_KVM_BOOK3S_64_HV 320#ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -260,10 +346,22 @@ int kvm_dev_ioctl_check_extension(long ext)
260 if (cpu_has_feature(CPU_FTR_ARCH_201)) 346 if (cpu_has_feature(CPU_FTR_ARCH_201))
261 r = 2; 347 r = 2;
262 break; 348 break;
349#endif
263 case KVM_CAP_SYNC_MMU: 350 case KVM_CAP_SYNC_MMU:
351#ifdef CONFIG_KVM_BOOK3S_64_HV
264 r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; 352 r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
353#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER)
354 r = 1;
355#else
356 r = 0;
357 break;
358#endif
359#ifdef CONFIG_KVM_BOOK3S_64_HV
360 case KVM_CAP_PPC_HTAB_FD:
361 r = 1;
265 break; 362 break;
266#endif 363#endif
364 break;
267 case KVM_CAP_NR_VCPUS: 365 case KVM_CAP_NR_VCPUS:
268 /* 366 /*
269 * Recommending a number of CPUs is somewhat arbitrary; we 367 * Recommending a number of CPUs is somewhat arbitrary; we
@@ -302,19 +400,12 @@ long kvm_arch_dev_ioctl(struct file *filp,
302void kvm_arch_free_memslot(struct kvm_memory_slot *free, 400void kvm_arch_free_memslot(struct kvm_memory_slot *free,
303 struct kvm_memory_slot *dont) 401 struct kvm_memory_slot *dont)
304{ 402{
305 if (!dont || free->arch.rmap != dont->arch.rmap) { 403 kvmppc_core_free_memslot(free, dont);
306 vfree(free->arch.rmap);
307 free->arch.rmap = NULL;
308 }
309} 404}
310 405
311int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) 406int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
312{ 407{
313 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); 408 return kvmppc_core_create_memslot(slot, npages);
314 if (!slot->arch.rmap)
315 return -ENOMEM;
316
317 return 0;
318} 409}
319 410
320int kvm_arch_prepare_memory_region(struct kvm *kvm, 411int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -323,7 +414,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
323 struct kvm_userspace_memory_region *mem, 414 struct kvm_userspace_memory_region *mem,
324 int user_alloc) 415 int user_alloc)
325{ 416{
326 return kvmppc_core_prepare_memory_region(kvm, mem); 417 return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
327} 418}
328 419
329void kvm_arch_commit_memory_region(struct kvm *kvm, 420void kvm_arch_commit_memory_region(struct kvm *kvm,
@@ -331,7 +422,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
331 struct kvm_memory_slot old, 422 struct kvm_memory_slot old,
332 int user_alloc) 423 int user_alloc)
333{ 424{
334 kvmppc_core_commit_memory_region(kvm, mem); 425 kvmppc_core_commit_memory_region(kvm, mem, old);
335} 426}
336 427
337void kvm_arch_flush_shadow_all(struct kvm *kvm) 428void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -341,6 +432,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
341void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 432void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
342 struct kvm_memory_slot *slot) 433 struct kvm_memory_slot *slot)
343{ 434{
435 kvmppc_core_flush_memslot(kvm, slot);
344} 436}
345 437
346struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) 438struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
@@ -354,6 +446,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
354 return vcpu; 446 return vcpu;
355} 447}
356 448
449int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
450{
451 return 0;
452}
453
357void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 454void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
358{ 455{
359 /* Make sure we're not using the vcpu anymore */ 456 /* Make sure we're not using the vcpu anymore */
@@ -390,6 +487,8 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
390 487
391int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 488int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
392{ 489{
490 int ret;
491
393 hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 492 hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
394 tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); 493 tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
395 vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; 494 vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
@@ -398,13 +497,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
398#ifdef CONFIG_KVM_EXIT_TIMING 497#ifdef CONFIG_KVM_EXIT_TIMING
399 mutex_init(&vcpu->arch.exit_timing_lock); 498 mutex_init(&vcpu->arch.exit_timing_lock);
400#endif 499#endif
401 500 ret = kvmppc_subarch_vcpu_init(vcpu);
402 return 0; 501 return ret;
403} 502}
404 503
405void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 504void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
406{ 505{
407 kvmppc_mmu_destroy(vcpu); 506 kvmppc_mmu_destroy(vcpu);
507 kvmppc_subarch_vcpu_uninit(vcpu);
408} 508}
409 509
410void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 510void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -420,7 +520,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
420 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); 520 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
421#endif 521#endif
422 kvmppc_core_vcpu_load(vcpu, cpu); 522 kvmppc_core_vcpu_load(vcpu, cpu);
423 vcpu->cpu = smp_processor_id();
424} 523}
425 524
426void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 525void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -429,7 +528,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
429#ifdef CONFIG_BOOKE 528#ifdef CONFIG_BOOKE
430 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); 529 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
431#endif 530#endif
432 vcpu->cpu = -1;
433} 531}
434 532
435int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 533int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -527,6 +625,13 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
527 vcpu->mmio_is_write = 0; 625 vcpu->mmio_is_write = 0;
528 vcpu->arch.mmio_sign_extend = 0; 626 vcpu->arch.mmio_sign_extend = 0;
529 627
628 if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
629 bytes, &run->mmio.data)) {
630 kvmppc_complete_mmio_load(vcpu, run);
631 vcpu->mmio_needed = 0;
632 return EMULATE_DONE;
633 }
634
530 return EMULATE_DO_MMIO; 635 return EMULATE_DO_MMIO;
531} 636}
532 637
@@ -536,8 +641,8 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
536{ 641{
537 int r; 642 int r;
538 643
539 r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
540 vcpu->arch.mmio_sign_extend = 1; 644 vcpu->arch.mmio_sign_extend = 1;
645 r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
541 646
542 return r; 647 return r;
543} 648}
@@ -575,6 +680,13 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
575 } 680 }
576 } 681 }
577 682
683 if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
684 bytes, &run->mmio.data)) {
685 kvmppc_complete_mmio_load(vcpu, run);
686 vcpu->mmio_needed = 0;
687 return EMULATE_DONE;
688 }
689
578 return EMULATE_DO_MMIO; 690 return EMULATE_DO_MMIO;
579} 691}
580 692
@@ -649,6 +761,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
649 r = 0; 761 r = 0;
650 vcpu->arch.papr_enabled = true; 762 vcpu->arch.papr_enabled = true;
651 break; 763 break;
764#ifdef CONFIG_BOOKE
765 case KVM_CAP_PPC_BOOKE_WATCHDOG:
766 r = 0;
767 vcpu->arch.watchdog_enabled = true;
768 break;
769#endif
652#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) 770#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
653 case KVM_CAP_SW_TLB: { 771 case KVM_CAP_SW_TLB: {
654 struct kvm_config_tlb cfg; 772 struct kvm_config_tlb cfg;
@@ -751,9 +869,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
751 869
752static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) 870static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
753{ 871{
872 u32 inst_nop = 0x60000000;
873#ifdef CONFIG_KVM_BOOKE_HV
874 u32 inst_sc1 = 0x44000022;
875 pvinfo->hcall[0] = inst_sc1;
876 pvinfo->hcall[1] = inst_nop;
877 pvinfo->hcall[2] = inst_nop;
878 pvinfo->hcall[3] = inst_nop;
879#else
754 u32 inst_lis = 0x3c000000; 880 u32 inst_lis = 0x3c000000;
755 u32 inst_ori = 0x60000000; 881 u32 inst_ori = 0x60000000;
756 u32 inst_nop = 0x60000000;
757 u32 inst_sc = 0x44000002; 882 u32 inst_sc = 0x44000002;
758 u32 inst_imm_mask = 0xffff; 883 u32 inst_imm_mask = 0xffff;
759 884
@@ -770,6 +895,9 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
770 pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); 895 pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask);
771 pvinfo->hcall[2] = inst_sc; 896 pvinfo->hcall[2] = inst_sc;
772 pvinfo->hcall[3] = inst_nop; 897 pvinfo->hcall[3] = inst_nop;
898#endif
899
900 pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;
773 901
774 return 0; 902 return 0;
775} 903}
@@ -832,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
832 r = 0; 960 r = 0;
833 break; 961 break;
834 } 962 }
963
964 case KVM_PPC_GET_HTAB_FD: {
965 struct kvm *kvm = filp->private_data;
966 struct kvm_get_htab_fd ghf;
967
968 r = -EFAULT;
969 if (copy_from_user(&ghf, argp, sizeof(ghf)))
970 break;
971 r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
972 break;
973 }
835#endif /* CONFIG_KVM_BOOK3S_64_HV */ 974#endif /* CONFIG_KVM_BOOK3S_64_HV */
836 975
837#ifdef CONFIG_PPC_BOOK3S_64 976#ifdef CONFIG_PPC_BOOK3S_64
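The KVM_CAP_PPC_BOOKE_WATCHDOG capability added above is reported by kvm_dev_ioctl_check_extension() and switched on per vcpu through the enable-cap path. A minimal userspace sketch of turning it on, assuming an already-created vcpu file descriptor vcpu_fd and omitting error handling:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Sketch: ask KVM to virtualize the booke watchdog for one vcpu;
	 * the kernel side then sets vcpu->arch.watchdog_enabled as in the
	 * hunk above. */
	static int enable_booke_watchdog(int vcpu_fd)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_PPC_BOOKE_WATCHDOG,
		};

		return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
	}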
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index ddb6a2149d44..e326489a5420 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -31,6 +31,126 @@ TRACE_EVENT(kvm_ppc_instr,
31 __entry->inst, __entry->pc, __entry->emulate) 31 __entry->inst, __entry->pc, __entry->emulate)
32); 32);
33 33
34#ifdef CONFIG_PPC_BOOK3S
35#define kvm_trace_symbol_exit \
36 {0x100, "SYSTEM_RESET"}, \
37 {0x200, "MACHINE_CHECK"}, \
38 {0x300, "DATA_STORAGE"}, \
39 {0x380, "DATA_SEGMENT"}, \
40 {0x400, "INST_STORAGE"}, \
41 {0x480, "INST_SEGMENT"}, \
42 {0x500, "EXTERNAL"}, \
43 {0x501, "EXTERNAL_LEVEL"}, \
44 {0x502, "EXTERNAL_HV"}, \
45 {0x600, "ALIGNMENT"}, \
46 {0x700, "PROGRAM"}, \
47 {0x800, "FP_UNAVAIL"}, \
48 {0x900, "DECREMENTER"}, \
49 {0x980, "HV_DECREMENTER"}, \
50 {0xc00, "SYSCALL"}, \
51 {0xd00, "TRACE"}, \
52 {0xe00, "H_DATA_STORAGE"}, \
53 {0xe20, "H_INST_STORAGE"}, \
54 {0xe40, "H_EMUL_ASSIST"}, \
55 {0xf00, "PERFMON"}, \
56 {0xf20, "ALTIVEC"}, \
57 {0xf40, "VSX"}
58#else
59#define kvm_trace_symbol_exit \
60 {0, "CRITICAL"}, \
61 {1, "MACHINE_CHECK"}, \
62 {2, "DATA_STORAGE"}, \
63 {3, "INST_STORAGE"}, \
64 {4, "EXTERNAL"}, \
65 {5, "ALIGNMENT"}, \
66 {6, "PROGRAM"}, \
67 {7, "FP_UNAVAIL"}, \
68 {8, "SYSCALL"}, \
69 {9, "AP_UNAVAIL"}, \
70 {10, "DECREMENTER"}, \
71 {11, "FIT"}, \
72 {12, "WATCHDOG"}, \
73 {13, "DTLB_MISS"}, \
74 {14, "ITLB_MISS"}, \
75 {15, "DEBUG"}, \
76 {32, "SPE_UNAVAIL"}, \
77 {33, "SPE_FP_DATA"}, \
78 {34, "SPE_FP_ROUND"}, \
79 {35, "PERFORMANCE_MONITOR"}, \
80 {36, "DOORBELL"}, \
81 {37, "DOORBELL_CRITICAL"}, \
82 {38, "GUEST_DBELL"}, \
83 {39, "GUEST_DBELL_CRIT"}, \
84 {40, "HV_SYSCALL"}, \
85 {41, "HV_PRIV"}
86#endif
87
88TRACE_EVENT(kvm_exit,
89 TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
90 TP_ARGS(exit_nr, vcpu),
91
92 TP_STRUCT__entry(
93 __field( unsigned int, exit_nr )
94 __field( unsigned long, pc )
95 __field( unsigned long, msr )
96 __field( unsigned long, dar )
97#ifdef CONFIG_KVM_BOOK3S_PR
98 __field( unsigned long, srr1 )
99#endif
100 __field( unsigned long, last_inst )
101 ),
102
103 TP_fast_assign(
104#ifdef CONFIG_KVM_BOOK3S_PR
105 struct kvmppc_book3s_shadow_vcpu *svcpu;
106#endif
107 __entry->exit_nr = exit_nr;
108 __entry->pc = kvmppc_get_pc(vcpu);
109 __entry->dar = kvmppc_get_fault_dar(vcpu);
110 __entry->msr = vcpu->arch.shared->msr;
111#ifdef CONFIG_KVM_BOOK3S_PR
112 svcpu = svcpu_get(vcpu);
113 __entry->srr1 = svcpu->shadow_srr1;
114 svcpu_put(svcpu);
115#endif
116 __entry->last_inst = vcpu->arch.last_inst;
117 ),
118
119 TP_printk("exit=%s"
120 " | pc=0x%lx"
121 " | msr=0x%lx"
122 " | dar=0x%lx"
123#ifdef CONFIG_KVM_BOOK3S_PR
124 " | srr1=0x%lx"
125#endif
126 " | last_inst=0x%lx"
127 ,
128 __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
129 __entry->pc,
130 __entry->msr,
131 __entry->dar,
132#ifdef CONFIG_KVM_BOOK3S_PR
133 __entry->srr1,
134#endif
135 __entry->last_inst
136 )
137);
138
139TRACE_EVENT(kvm_unmap_hva,
140 TP_PROTO(unsigned long hva),
141 TP_ARGS(hva),
142
143 TP_STRUCT__entry(
144 __field( unsigned long, hva )
145 ),
146
147 TP_fast_assign(
148 __entry->hva = hva;
149 ),
150
151 TP_printk("unmap hva 0x%lx\n", __entry->hva)
152);
153
34TRACE_EVENT(kvm_stlb_inval, 154TRACE_EVENT(kvm_stlb_inval,
35 TP_PROTO(unsigned int stlb_index), 155 TP_PROTO(unsigned int stlb_index),
36 TP_ARGS(stlb_index), 156 TP_ARGS(stlb_index),
@@ -98,41 +218,31 @@ TRACE_EVENT(kvm_gtlb_write,
98 __entry->word1, __entry->word2) 218 __entry->word1, __entry->word2)
99); 219);
100 220
101 221TRACE_EVENT(kvm_check_requests,
102/************************************************************************* 222 TP_PROTO(struct kvm_vcpu *vcpu),
103 * Book3S trace points * 223 TP_ARGS(vcpu),
104 *************************************************************************/
105
106#ifdef CONFIG_KVM_BOOK3S_PR
107
108TRACE_EVENT(kvm_book3s_exit,
109 TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
110 TP_ARGS(exit_nr, vcpu),
111 224
112 TP_STRUCT__entry( 225 TP_STRUCT__entry(
113 __field( unsigned int, exit_nr ) 226 __field( __u32, cpu_nr )
114 __field( unsigned long, pc ) 227 __field( __u32, requests )
115 __field( unsigned long, msr )
116 __field( unsigned long, dar )
117 __field( unsigned long, srr1 )
118 ), 228 ),
119 229
120 TP_fast_assign( 230 TP_fast_assign(
121 struct kvmppc_book3s_shadow_vcpu *svcpu; 231 __entry->cpu_nr = vcpu->vcpu_id;
122 __entry->exit_nr = exit_nr; 232 __entry->requests = vcpu->requests;
123 __entry->pc = kvmppc_get_pc(vcpu);
124 __entry->dar = kvmppc_get_fault_dar(vcpu);
125 __entry->msr = vcpu->arch.shared->msr;
126 svcpu = svcpu_get(vcpu);
127 __entry->srr1 = svcpu->shadow_srr1;
128 svcpu_put(svcpu);
129 ), 233 ),
130 234
131 TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", 235 TP_printk("vcpu=%x requests=%x",
132 __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar, 236 __entry->cpu_nr, __entry->requests)
133 __entry->srr1)
134); 237);
135 238
239
240/*************************************************************************
241 * Book3S trace points *
242 *************************************************************************/
243
244#ifdef CONFIG_KVM_BOOK3S_PR
245
136TRACE_EVENT(kvm_book3s_reenter, 246TRACE_EVENT(kvm_book3s_reenter,
137 TP_PROTO(int r, struct kvm_vcpu *vcpu), 247 TP_PROTO(int r, struct kvm_vcpu *vcpu),
138 TP_ARGS(r, vcpu), 248 TP_ARGS(r, vcpu),
@@ -395,6 +505,44 @@ TRACE_EVENT(kvm_booke206_gtlb_write,
395 __entry->mas2, __entry->mas7_3) 505 __entry->mas2, __entry->mas7_3)
396); 506);
397 507
508TRACE_EVENT(kvm_booke206_ref_release,
509 TP_PROTO(__u64 pfn, __u32 flags),
510 TP_ARGS(pfn, flags),
511
512 TP_STRUCT__entry(
513 __field( __u64, pfn )
514 __field( __u32, flags )
515 ),
516
517 TP_fast_assign(
518 __entry->pfn = pfn;
519 __entry->flags = flags;
520 ),
521
522 TP_printk("pfn=%llx flags=%x",
523 __entry->pfn, __entry->flags)
524);
525
526TRACE_EVENT(kvm_booke_queue_irqprio,
527 TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
528 TP_ARGS(vcpu, priority),
529
530 TP_STRUCT__entry(
531 __field( __u32, cpu_nr )
532 __field( __u32, priority )
533 __field( unsigned long, pending )
534 ),
535
536 TP_fast_assign(
537 __entry->cpu_nr = vcpu->vcpu_id;
538 __entry->priority = priority;
539 __entry->pending = vcpu->arch.pending_exceptions;
540 ),
541
542 TP_printk("vcpu=%x prio=%x pending=%lx",
543 __entry->cpu_nr, __entry->priority, __entry->pending)
544);
545
398#endif 546#endif
399 547
400#endif /* _TRACE_KVM_H */ 548#endif /* _TRACE_KVM_H */
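Each TRACE_EVENT() above generates a corresponding trace_<name>() helper that the run loops are expected to call; a sketch of typical call sites (the exact placement lives in the booke/book3s exit paths, not in this header):

	/* Sketch: fire the unified exit tracepoint on every guest exit... */
	trace_kvm_exit(exit_nr, vcpu);

	/* ...and the new request tracepoint before servicing vcpu->requests. */
	trace_kvm_check_requests(vcpu);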
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index e7a896acd982..48a920d51489 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -90,6 +90,7 @@ config MPIC
90config PPC_EPAPR_HV_PIC 90config PPC_EPAPR_HV_PIC
91 bool 91 bool
92 default n 92 default n
93 select EPAPR_PARAVIRT
93 94
94config MPIC_WEIRD 95config MPIC_WEIRD
95 bool 96 bool
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 51ffafae561e..63c5f04ea580 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -236,7 +236,6 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
236 u32 intr_index; 236 u32 intr_index;
237 u32 have_shift = 0; 237 u32 have_shift = 0;
238 struct fsl_msi_cascade_data *cascade_data; 238 struct fsl_msi_cascade_data *cascade_data;
239 unsigned int ret;
240 239
241 cascade_data = irq_get_handler_data(irq); 240 cascade_data = irq_get_handler_data(irq);
242 msi_data = cascade_data->msi_data; 241 msi_data = cascade_data->msi_data;
@@ -268,7 +267,9 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
268 case FSL_PIC_IP_IPIC: 267 case FSL_PIC_IP_IPIC:
269 msir_value = fsl_msi_read(msi_data->msi_regs, msir_index * 0x4); 268 msir_value = fsl_msi_read(msi_data->msi_regs, msir_index * 0x4);
270 break; 269 break;
271 case FSL_PIC_IP_VMPIC: 270#ifdef CONFIG_EPAPR_PARAVIRT
271 case FSL_PIC_IP_VMPIC: {
272 unsigned int ret;
272 ret = fh_vmpic_get_msir(virq_to_hw(irq), &msir_value); 273 ret = fh_vmpic_get_msir(virq_to_hw(irq), &msir_value);
273 if (ret) { 274 if (ret) {
274 pr_err("fsl-msi: fh_vmpic_get_msir() failed for " 275 pr_err("fsl-msi: fh_vmpic_get_msir() failed for "
@@ -277,6 +278,8 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
277 } 278 }
278 break; 279 break;
279 } 280 }
281#endif
282 }
280 283
281 while (msir_value) { 284 while (msir_value) {
282 intr_index = ffs(msir_value) - 1; 285 intr_index = ffs(msir_value) - 1;
@@ -508,10 +511,12 @@ static const struct of_device_id fsl_of_msi_ids[] = {
508 .compatible = "fsl,ipic-msi", 511 .compatible = "fsl,ipic-msi",
509 .data = &ipic_msi_feature, 512 .data = &ipic_msi_feature,
510 }, 513 },
514#ifdef CONFIG_EPAPR_PARAVIRT
511 { 515 {
512 .compatible = "fsl,vmpic-msi", 516 .compatible = "fsl,vmpic-msi",
513 .data = &vmpic_msi_feature, 517 .data = &vmpic_msi_feature,
514 }, 518 },
519#endif
515 {} 520 {}
516}; 521};
517 522
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index c449dbd1c938..97118dc3d285 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -253,6 +253,7 @@ struct platform_diu_data_ops diu_ops;
253EXPORT_SYMBOL(diu_ops); 253EXPORT_SYMBOL(diu_ops);
254#endif 254#endif
255 255
256#ifdef CONFIG_EPAPR_PARAVIRT
256/* 257/*
257 * Restart the current partition 258 * Restart the current partition
258 * 259 *
@@ -278,3 +279,4 @@ void fsl_hv_halt(void)
278 pr_info("hv exit\n"); 279 pr_info("hv exit\n");
279 fh_partition_stop(-1); 280 fh_partition_stop(-1);
280} 281}
282#endif
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index ff1e2f8ef94a..c30615e605ac 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -629,10 +629,27 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
629 break; 629 break;
630 case KVM_S390_SIGP_STOP: 630 case KVM_S390_SIGP_STOP:
631 case KVM_S390_RESTART: 631 case KVM_S390_RESTART:
632 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
633 inti->type = s390int->type;
634 break;
632 case KVM_S390_INT_EXTERNAL_CALL: 635 case KVM_S390_INT_EXTERNAL_CALL:
636 if (s390int->parm & 0xffff0000) {
637 kfree(inti);
638 return -EINVAL;
639 }
640 VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u",
641 s390int->parm);
642 inti->type = s390int->type;
643 inti->extcall.code = s390int->parm;
644 break;
633 case KVM_S390_INT_EMERGENCY: 645 case KVM_S390_INT_EMERGENCY:
634 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); 646 if (s390int->parm & 0xffff0000) {
647 kfree(inti);
648 return -EINVAL;
649 }
650 VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", s390int->parm);
635 inti->type = s390int->type; 651 inti->type = s390int->type;
652 inti->emerg.code = s390int->parm;
636 break; 653 break;
637 case KVM_S390_INT_VIRTIO: 654 case KVM_S390_INT_VIRTIO:
638 case KVM_S390_INT_SERVICE: 655 case KVM_S390_INT_SERVICE:
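With the stricter parameter checks above, userspace injection of an external call must keep the source cpu in the low 16 bits of parm. A hedged sketch of the corresponding KVM_S390_INTERRUPT call (vcpu_fd and source_cpu are illustrative):

	/* Sketch: inject an external call interrupt into one vcpu. */
	struct kvm_s390_interrupt irq = {
		.type = KVM_S390_INT_EXTERNAL_CALL,
		.parm = source_cpu,	/* must fit in 16 bits, else -EINVAL */
	};

	ioctl(vcpu_fd, KVM_S390_INTERRUPT, &irq);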
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d91a95568002..c9011bfaabbe 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -355,6 +355,11 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
355 atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 355 atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
356} 356}
357 357
358int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
359{
360 return 0;
361}
362
358int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 363int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
359{ 364{
360 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | 365 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
@@ -993,7 +998,7 @@ static int __init kvm_s390_init(void)
993 } 998 }
994 memcpy(facilities, S390_lowcore.stfle_fac_list, 16); 999 memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
995 facilities[0] &= 0xff00fff3f47c0000ULL; 1000 facilities[0] &= 0xff00fff3f47c0000ULL;
996 facilities[1] &= 0x201c000000000000ULL; 1001 facilities[1] &= 0x001c000000000000ULL;
997 return 0; 1002 return 0;
998} 1003}
999 1004
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3b9ce7..16a57f4ed64d 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -8,6 +8,7 @@
8#define VCLOCK_NONE 0 /* No vDSO clock available. */ 8#define VCLOCK_NONE 0 /* No vDSO clock available. */
9#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ 9#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
10#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ 10#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
11#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
11 12
12struct arch_clocksource_data { 13struct arch_clocksource_data {
13 int vclock_mode; 14 int vclock_mode;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index da40b1e2228e..2d9075e863a0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -202,6 +202,7 @@
202 202
203/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 203/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
204#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 204#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
205#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */
205#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ 206#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
206#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ 207#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */
207#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ 208#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c4c974..a09c28571064 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,6 +19,7 @@
19#include <asm/acpi.h> 19#include <asm/acpi.h>
20#include <asm/apicdef.h> 20#include <asm/apicdef.h>
21#include <asm/page.h> 21#include <asm/page.h>
22#include <asm/pvclock.h>
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23#include <linux/threads.h> 24#include <linux/threads.h>
24#include <asm/kmap_types.h> 25#include <asm/kmap_types.h>
@@ -81,6 +82,10 @@ enum fixed_addresses {
81 VVAR_PAGE, 82 VVAR_PAGE,
82 VSYSCALL_HPET, 83 VSYSCALL_HPET,
83#endif 84#endif
85#ifdef CONFIG_PARAVIRT_CLOCK
86 PVCLOCK_FIXMAP_BEGIN,
87 PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
88#endif
84 FIX_DBGP_BASE, 89 FIX_DBGP_BASE,
85 FIX_EARLYCON_MEM_BASE, 90 FIX_EARLYCON_MEM_BASE,
86#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT 91#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff1703d0b..6080d2694bad 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,9 @@ struct kimage_arch {
163}; 163};
164#endif 164#endif
165 165
166typedef void crash_vmclear_fn(void);
167extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
168
166#endif /* __ASSEMBLY__ */ 169#endif /* __ASSEMBLY__ */
167 170
168#endif /* _ASM_X86_KEXEC_H */ 171#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
new file mode 100644
index 000000000000..a92b1763c419
--- /dev/null
+++ b/arch/x86/include/asm/kvm_guest.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_X86_KVM_GUEST_H
2#define _ASM_X86_KVM_GUEST_H
3
4int kvm_setup_vsyscall_timeinfo(void);
5
6#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2e11f452435..dc87b65e9c3a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
22#include <linux/kvm_para.h> 22#include <linux/kvm_para.h>
23#include <linux/kvm_types.h> 23#include <linux/kvm_types.h>
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/pvclock_gtod.h>
26#include <linux/clocksource.h>
25 27
26#include <asm/pvclock-abi.h> 28#include <asm/pvclock-abi.h>
27#include <asm/desc.h> 29#include <asm/desc.h>
@@ -442,6 +444,7 @@ struct kvm_vcpu_arch {
442 s8 virtual_tsc_shift; 444 s8 virtual_tsc_shift;
443 u32 virtual_tsc_mult; 445 u32 virtual_tsc_mult;
444 u32 virtual_tsc_khz; 446 u32 virtual_tsc_khz;
447 s64 ia32_tsc_adjust_msr;
445 448
446 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ 449 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
447 unsigned nmi_pending; /* NMI queued after currently running handler */ 450 unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -559,6 +562,12 @@ struct kvm_arch {
559 u64 cur_tsc_write; 562 u64 cur_tsc_write;
560 u64 cur_tsc_offset; 563 u64 cur_tsc_offset;
561 u8 cur_tsc_generation; 564 u8 cur_tsc_generation;
565 int nr_vcpus_matched_tsc;
566
567 spinlock_t pvclock_gtod_sync_lock;
568 bool use_master_clock;
569 u64 master_kernel_ns;
570 cycle_t master_cycle_now;
562 571
563 struct kvm_xen_hvm_config xen_hvm_config; 572 struct kvm_xen_hvm_config xen_hvm_config;
564 573
@@ -612,6 +621,12 @@ struct kvm_vcpu_stat {
612 621
613struct x86_instruction_info; 622struct x86_instruction_info;
614 623
624struct msr_data {
625 bool host_initiated;
626 u32 index;
627 u64 data;
628};
629
615struct kvm_x86_ops { 630struct kvm_x86_ops {
616 int (*cpu_has_kvm_support)(void); /* __init */ 631 int (*cpu_has_kvm_support)(void); /* __init */
617 int (*disabled_by_bios)(void); /* __init */ 632 int (*disabled_by_bios)(void); /* __init */
@@ -634,7 +649,7 @@ struct kvm_x86_ops {
634 649
635 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); 650 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
636 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 651 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
637 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 652 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
638 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 653 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
639 void (*get_segment)(struct kvm_vcpu *vcpu, 654 void (*get_segment)(struct kvm_vcpu *vcpu,
640 struct kvm_segment *var, int seg); 655 struct kvm_segment *var, int seg);
@@ -697,10 +712,11 @@ struct kvm_x86_ops {
697 bool (*has_wbinvd_exit)(void); 712 bool (*has_wbinvd_exit)(void);
698 713
699 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); 714 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
715 u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
700 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 716 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
701 717
702 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); 718 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
703 u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); 719 u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
704 720
705 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 721 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
706 722
@@ -785,7 +801,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
785 801
786void kvm_enable_efer_bits(u64); 802void kvm_enable_efer_bits(u64);
787int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 803int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
788int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 804int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
789 805
790struct x86_emulate_ctxt; 806struct x86_emulate_ctxt;
791 807
@@ -812,7 +828,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
812int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); 828int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
813 829
814int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 830int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
815int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 831int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
816 832
817unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); 833unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
818void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); 834void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e400cdb2dd65..6e930b218724 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,7 @@
236#define MSR_IA32_EBL_CR_POWERON 0x0000002a 236#define MSR_IA32_EBL_CR_POWERON 0x0000002a
237#define MSR_EBC_FREQUENCY_ID 0x0000002c 237#define MSR_EBC_FREQUENCY_ID 0x0000002c
238#define MSR_IA32_FEATURE_CONTROL 0x0000003a 238#define MSR_IA32_FEATURE_CONTROL 0x0000003a
239#define MSR_IA32_TSC_ADJUST 0x0000003b
239 240
240#define FEATURE_CONTROL_LOCKED (1<<0) 241#define FEATURE_CONTROL_LOCKED (1<<0)
241#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) 242#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index c59cc97fe6c1..109a9dd5d454 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
6 6
7/* some helper functions for xen and kvm pv clock sources */ 7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); 8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
9void pvclock_set_flags(u8 flags); 10void pvclock_set_flags(u8 flags);
10unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); 11unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
11void pvclock_read_wallclock(struct pvclock_wall_clock *wall, 12void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
@@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
56 return product; 57 return product;
57} 58}
58 59
60static __always_inline
61u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
62{
63 u64 delta = __native_read_tsc() - src->tsc_timestamp;
64 return pvclock_scale_delta(delta, src->tsc_to_system_mul,
65 src->tsc_shift);
66}
67
68static __always_inline
69unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
70 cycle_t *cycles, u8 *flags)
71{
72 unsigned version;
73 cycle_t ret, offset;
74 u8 ret_flags;
75
76 version = src->version;
77 /* Note: emulated platforms which do not advertise SSE2 support
78 * result in kvmclock not using the necessary RDTSC barriers.
79 * Without barriers, it is possible that RDTSC instruction reads from
80 * the time stamp counter outside rdtsc_barrier protected section
81 * below, resulting in violation of monotonicity.
82 */
83 rdtsc_barrier();
84 offset = pvclock_get_nsec_offset(src);
85 ret = src->system_time + offset;
86 ret_flags = src->flags;
87 rdtsc_barrier();
88
89 *cycles = ret;
90 *flags = ret_flags;
91 return version;
92}
93
94struct pvclock_vsyscall_time_info {
95 struct pvclock_vcpu_time_info pvti;
96 u32 migrate_count;
97} __attribute__((__aligned__(SMP_CACHE_BYTES)));
98
99#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
100#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
101
102int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
103 int size);
104struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
105
59#endif /* _ASM_X86_PVCLOCK_H */ 106#endif /* _ASM_X86_PVCLOCK_H */
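__pvclock_read_cycles() only takes a single snapshot; callers loop until the version is even and unchanged, exactly as the pvclock.c rework later in this patch does. A minimal sketch of that read pattern:

	/* Sketch: read a consistent (cycles, flags) pair from one pvti. */
	static cycle_t pvclock_read(struct pvclock_vcpu_time_info *src, u8 *flags)
	{
		unsigned version;
		cycle_t cycles;

		do {
			version = __pvclock_read_cycles(src, &cycles, flags);
			/* An odd version means the hypervisor is mid-update. */
		} while ((src->version & 1) || version != src->version);

		return cycles;
	}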
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 36ec21c36d68..c2d56b34830d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -445,8 +445,7 @@ enum vmcs_field {
445#define VMX_EPTP_WB_BIT (1ull << 14) 445#define VMX_EPTP_WB_BIT (1ull << 14)
446#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 446#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
447#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 447#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
448#define VMX_EPT_AD_BIT (1ull << 21) 448#define VMX_EPT_AD_BIT (1ull << 21)
449#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
450#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 449#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
451#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 450#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
452 451
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index eaea1d31f753..80f80955cfd8 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -33,6 +33,26 @@ extern void map_vsyscall(void);
33 */ 33 */
34extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); 34extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
35 35
36#ifdef CONFIG_X86_64
37
38#define VGETCPU_CPU_MASK 0xfff
39
40static inline unsigned int __getcpu(void)
41{
42 unsigned int p;
43
44 if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
45 /* Load per CPU data from RDTSCP */
46 native_read_tscp(&p);
47 } else {
48 /* Load per CPU data from GDT */
49 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
50 }
51
52 return p;
53}
54#endif /* CONFIG_X86_64 */
55
36#endif /* __KERNEL__ */ 56#endif /* __KERNEL__ */
37 57
38#endif /* _ASM_X86_VSYSCALL_H */ 58#endif /* _ASM_X86_VSYSCALL_H */
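The new __getcpu() helper is what lets vDSO code pick the right per-cpu pvclock entry without a syscall; a sketch of the intended use (pvti_base stands in for however the fixmap-mapped array is reached from the vDSO):

	/* Sketch: locate this cpu's pvclock_vsyscall_time_info entry. */
	unsigned int cpu = __getcpu() & VGETCPU_CPU_MASK;
	const struct pvclock_vsyscall_time_info *pvti = &pvti_base[cpu];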
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d47..74467feb4dc5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/elf.h> 17#include <linux/elf.h>
18#include <linux/elfcore.h> 18#include <linux/elfcore.h>
19#include <linux/module.h>
19 20
20#include <asm/processor.h> 21#include <asm/processor.h>
21#include <asm/hardirq.h> 22#include <asm/hardirq.h>
@@ -30,6 +31,27 @@
30 31
31int in_crash_kexec; 32int in_crash_kexec;
32 33
34/*
 35 * This is used to VMCLEAR all VMCSs loaded on the
 36 * processor. The callback pointer is assigned when the
 37 * kvm_intel module is loaded.
 38 *
 39 * Protected by RCU.
40 */
41crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
42EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
43
44static inline void cpu_crash_vmclear_loaded_vmcss(void)
45{
46 crash_vmclear_fn *do_vmclear_operation = NULL;
47
48 rcu_read_lock();
49 do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
50 if (do_vmclear_operation)
51 do_vmclear_operation();
52 rcu_read_unlock();
53}
54
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 55#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 56
35static void kdump_nmi_callback(int cpu, struct pt_regs *regs) 57static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
46#endif 68#endif
47 crash_save_cpu(regs, cpu); 69 crash_save_cpu(regs, cpu);
48 70
71 /*
72 * VMCLEAR VMCSs loaded on all cpus if needed.
73 */
74 cpu_crash_vmclear_loaded_vmcss();
75
49 /* Disable VMX or SVM if needed. 76 /* Disable VMX or SVM if needed.
50 * 77 *
51 * We need to disable virtualization on all CPUs. 78 * We need to disable virtualization on all CPUs.
@@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
88 115
89 kdump_nmi_shootdown_cpus(); 116 kdump_nmi_shootdown_cpus();
90 117
118 /*
119 * VMCLEAR VMCSs loaded on this cpu if needed.
120 */
121 cpu_crash_vmclear_loaded_vmcss();
122
91 /* Booting kdump kernel with VMX or SVM enabled won't work, 123 /* Booting kdump kernel with VMX or SVM enabled won't work,
92 * because (among other limitations) we can't disable paging 124 * because (among other limitations) we can't disable paging
93 * with the virt flags. 125 * with the virt flags.
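crash.c only consumes crash_vmclear_loaded_vmcss under rcu_read_lock(); the producer (kvm_intel later in this series) is expected to publish and clear the pointer with the usual RCU pairing. A hedged sketch, with vmclear_all_loaded_vmcss() standing in for the real VMX-side callback:

	/* Sketch: publish the callback when the module loads... */
	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
			   vmclear_all_loaded_vmcss);

	/* ...and retract it on unload, waiting for in-flight readers. */
	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
	synchronize_rcu();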
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4180a874c764..08b973f64032 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,6 +42,7 @@
42#include <asm/apic.h> 42#include <asm/apic.h>
43#include <asm/apicdef.h> 43#include <asm/apicdef.h>
44#include <asm/hypervisor.h> 44#include <asm/hypervisor.h>
45#include <asm/kvm_guest.h>
45 46
46static int kvmapf = 1; 47static int kvmapf = 1;
47 48
@@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg)
62 63
63early_param("no-steal-acc", parse_no_stealacc); 64early_param("no-steal-acc", parse_no_stealacc);
64 65
66static int kvmclock_vsyscall = 1;
67static int parse_no_kvmclock_vsyscall(char *arg)
68{
69 kvmclock_vsyscall = 0;
70 return 0;
71}
72
73early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
74
65static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 75static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
66static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); 76static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
67static int has_steal_clock = 0; 77static int has_steal_clock = 0;
@@ -110,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token)
110 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 120 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
111 struct kvm_task_sleep_node n, *e; 121 struct kvm_task_sleep_node n, *e;
112 DEFINE_WAIT(wait); 122 DEFINE_WAIT(wait);
113 int cpu, idle;
114
115 cpu = get_cpu();
116 idle = idle_cpu(cpu);
117 put_cpu();
118 123
119 spin_lock(&b->lock); 124 spin_lock(&b->lock);
120 e = _find_apf_task(b, token); 125 e = _find_apf_task(b, token);
@@ -128,7 +133,7 @@ void kvm_async_pf_task_wait(u32 token)
128 133
129 n.token = token; 134 n.token = token;
130 n.cpu = smp_processor_id(); 135 n.cpu = smp_processor_id();
131 n.halted = idle || preempt_count() > 1; 136 n.halted = is_idle_task(current) || preempt_count() > 1;
132 init_waitqueue_head(&n.wq); 137 init_waitqueue_head(&n.wq);
133 hlist_add_head(&n.link, &b->list); 138 hlist_add_head(&n.link, &b->list);
134 spin_unlock(&b->lock); 139 spin_unlock(&b->lock);
@@ -471,6 +476,9 @@ void __init kvm_guest_init(void)
471 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 476 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
472 apic_set_eoi_write(kvm_guest_apic_eoi_write); 477 apic_set_eoi_write(kvm_guest_apic_eoi_write);
473 478
479 if (kvmclock_vsyscall)
480 kvm_setup_vsyscall_timeinfo();
481
474#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
475 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 483 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
476 register_cpu_notifier(&kvm_cpu_notifier); 484 register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f1b42b3a186c..220a360010f8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/hardirq.h> 25#include <linux/hardirq.h>
26#include <linux/memblock.h>
26 27
27#include <asm/x86_init.h> 28#include <asm/x86_init.h>
28#include <asm/reboot.h> 29#include <asm/reboot.h>
@@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
39early_param("no-kvmclock", parse_no_kvmclock); 40early_param("no-kvmclock", parse_no_kvmclock);
40 41
41/* The hypervisor will put information about time periodically here */ 42/* The hypervisor will put information about time periodically here */
42static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); 43static struct pvclock_vsyscall_time_info *hv_clock;
43static struct pvclock_wall_clock wall_clock; 44static struct pvclock_wall_clock wall_clock;
44 45
45/* 46/*
@@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void)
52 struct pvclock_vcpu_time_info *vcpu_time; 53 struct pvclock_vcpu_time_info *vcpu_time;
53 struct timespec ts; 54 struct timespec ts;
54 int low, high; 55 int low, high;
56 int cpu;
55 57
56 low = (int)__pa_symbol(&wall_clock); 58 low = (int)__pa_symbol(&wall_clock);
57 high = ((u64)__pa_symbol(&wall_clock) >> 32); 59 high = ((u64)__pa_symbol(&wall_clock) >> 32);
58 60
59 native_write_msr(msr_kvm_wall_clock, low, high); 61 native_write_msr(msr_kvm_wall_clock, low, high);
60 62
61 vcpu_time = &get_cpu_var(hv_clock); 63 preempt_disable();
64 cpu = smp_processor_id();
65
66 vcpu_time = &hv_clock[cpu].pvti;
62 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 67 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
63 put_cpu_var(hv_clock); 68
69 preempt_enable();
64 70
65 return ts.tv_sec; 71 return ts.tv_sec;
66} 72}
@@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void)
74{ 80{
75 struct pvclock_vcpu_time_info *src; 81 struct pvclock_vcpu_time_info *src;
76 cycle_t ret; 82 cycle_t ret;
83 int cpu;
77 84
78 preempt_disable_notrace(); 85 preempt_disable_notrace();
79 src = &__get_cpu_var(hv_clock); 86 cpu = smp_processor_id();
87 src = &hv_clock[cpu].pvti;
80 ret = pvclock_clocksource_read(src); 88 ret = pvclock_clocksource_read(src);
81 preempt_enable_notrace(); 89 preempt_enable_notrace();
82 return ret; 90 return ret;
@@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
99static unsigned long kvm_get_tsc_khz(void) 107static unsigned long kvm_get_tsc_khz(void)
100{ 108{
101 struct pvclock_vcpu_time_info *src; 109 struct pvclock_vcpu_time_info *src;
102 src = &per_cpu(hv_clock, 0); 110 int cpu;
103 return pvclock_tsc_khz(src); 111 unsigned long tsc_khz;
112
113 preempt_disable();
114 cpu = smp_processor_id();
115 src = &hv_clock[cpu].pvti;
116 tsc_khz = pvclock_tsc_khz(src);
117 preempt_enable();
118 return tsc_khz;
104} 119}
105 120
106static void kvm_get_preset_lpj(void) 121static void kvm_get_preset_lpj(void)
@@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void)
119{ 134{
120 bool ret = false; 135 bool ret = false;
121 struct pvclock_vcpu_time_info *src; 136 struct pvclock_vcpu_time_info *src;
137 int cpu = smp_processor_id();
122 138
123 src = &__get_cpu_var(hv_clock); 139 if (!hv_clock)
140 return ret;
141
142 src = &hv_clock[cpu].pvti;
124 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { 143 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
125 __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); 144 src->flags &= ~PVCLOCK_GUEST_STOPPED;
126 ret = true; 145 ret = true;
127 } 146 }
128 147
@@ -141,9 +160,10 @@ int kvm_register_clock(char *txt)
141{ 160{
142 int cpu = smp_processor_id(); 161 int cpu = smp_processor_id();
143 int low, high, ret; 162 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
144 164
145 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 165 low = (int)__pa(src) | 1;
146 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 166 high = ((u64)__pa(src) >> 32);
147 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 167 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
148 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
149 cpu, high, low, txt); 169 cpu, high, low, txt);
@@ -197,6 +217,8 @@ static void kvm_shutdown(void)
197 217
198void __init kvmclock_init(void) 218void __init kvmclock_init(void)
199{ 219{
220 unsigned long mem;
221
200 if (!kvm_para_available()) 222 if (!kvm_para_available())
201 return; 223 return;
202 224
@@ -209,8 +231,18 @@ void __init kvmclock_init(void)
209 printk(KERN_INFO "kvm-clock: Using msrs %x and %x", 231 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
210 msr_kvm_system_time, msr_kvm_wall_clock); 232 msr_kvm_system_time, msr_kvm_wall_clock);
211 233
212 if (kvm_register_clock("boot clock")) 234 mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
235 PAGE_SIZE);
236 if (!mem)
237 return;
238 hv_clock = __va(mem);
239
240 if (kvm_register_clock("boot clock")) {
241 hv_clock = NULL;
242 memblock_free(mem,
243 sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
213 return; 244 return;
245 }
214 pv_time_ops.sched_clock = kvm_clock_read; 246 pv_time_ops.sched_clock = kvm_clock_read;
215 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 247 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
216 x86_platform.get_wallclock = kvm_get_wallclock; 248 x86_platform.get_wallclock = kvm_get_wallclock;
@@ -233,3 +265,37 @@ void __init kvmclock_init(void)
233 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) 265 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
234 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); 266 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
235} 267}
268
269int __init kvm_setup_vsyscall_timeinfo(void)
270{
271#ifdef CONFIG_X86_64
272 int cpu;
273 int ret;
274 u8 flags;
275 struct pvclock_vcpu_time_info *vcpu_time;
276 unsigned int size;
277
278 size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
279
280 preempt_disable();
281 cpu = smp_processor_id();
282
283 vcpu_time = &hv_clock[cpu].pvti;
284 flags = pvclock_read_flags(vcpu_time);
285
286 if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
287 preempt_enable();
288 return 1;
289 }
290
291 if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
292 preempt_enable();
293 return ret;
294 }
295
296 preempt_enable();
297
298 kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
299#endif
300 return 0;
301}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc6..85c39590c1a4 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
17 17
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/notifier.h>
21#include <linux/sched.h>
22#include <linux/gfp.h>
23#include <linux/bootmem.h>
24#include <asm/fixmap.h>
20#include <asm/pvclock.h> 25#include <asm/pvclock.h>
21 26
22/*
23 * These are perodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34 u8 flags;
35};
36
37static u8 valid_flags __read_mostly = 0; 27static u8 valid_flags __read_mostly = 0;
38 28
39void pvclock_set_flags(u8 flags) 29void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
41 valid_flags = flags; 31 valid_flags = flags;
42} 32}
43 33
44static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
45{
46 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
47 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
48 shadow->tsc_shift);
49}
50
51/*
52 * Reads a consistent set of time-base values from hypervisor,
53 * into a shadow data area.
54 */
55static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
56 struct pvclock_vcpu_time_info *src)
57{
58 do {
59 dst->version = src->version;
60 rmb(); /* fetch version before data */
61 dst->tsc_timestamp = src->tsc_timestamp;
62 dst->system_timestamp = src->system_time;
63 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
64 dst->tsc_shift = src->tsc_shift;
65 dst->flags = src->flags;
66 rmb(); /* test version after fetching data */
67 } while ((src->version & 1) || (dst->version != src->version));
68
69 return dst->version;
70}
71
72unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) 34unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
73{ 35{
74 u64 pv_tsc_khz = 1000000ULL << 32; 36 u64 pv_tsc_khz = 1000000ULL << 32;
@@ -88,23 +50,32 @@ void pvclock_resume(void)
88 atomic64_set(&last_value, 0); 50 atomic64_set(&last_value, 0);
89} 51}
90 52
53u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
54{
55 unsigned version;
56 cycle_t ret;
57 u8 flags;
58
59 do {
60 version = __pvclock_read_cycles(src, &ret, &flags);
61 } while ((src->version & 1) || version != src->version);
62
63 return flags & valid_flags;
64}
65
91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 66cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
92{ 67{
93 struct pvclock_shadow_time shadow;
94 unsigned version; 68 unsigned version;
95 cycle_t ret, offset; 69 cycle_t ret;
96 u64 last; 70 u64 last;
71 u8 flags;
97 72
98 do { 73 do {
99 version = pvclock_get_time_values(&shadow, src); 74 version = __pvclock_read_cycles(src, &ret, &flags);
100 barrier(); 75 } while ((src->version & 1) || version != src->version);
101 offset = pvclock_get_nsec_offset(&shadow);
102 ret = shadow.system_timestamp + offset;
103 barrier();
104 } while (version != src->version);
105 76
106 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && 77 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
107 (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) 78 (flags & PVCLOCK_TSC_STABLE_BIT))
108 return ret; 79 return ret;
109 80
110 /* 81 /*
@@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
156 127
157 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
158} 129}
130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/*
172 * Initialize the generic pvclock vsyscall state. This allocates
173 * one or more pages for the per-vcpu pvclock information and
174 * sets up a fixmap mapping for them.
175 */
176
177int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
178 int size)
179{
180 int idx;
181
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa_symbol(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR);
190 }
191
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0;
196}
197#endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ec79e773342e..a20ecb5b6cbf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
320 if (index == 0) { 320 if (index == 0) {
321 entry->ebx &= kvm_supported_word9_x86_features; 321 entry->ebx &= kvm_supported_word9_x86_features;
322 cpuid_mask(&entry->ebx, 9); 322 cpuid_mask(&entry->ebx, 9);
 323 /* TSC_ADJUST is emulated */
324 entry->ebx |= F(TSC_ADJUST);
323 } else 325 } else
324 entry->ebx = 0; 326 entry->ebx = 0;
325 entry->eax = 0; 327 entry->eax = 0;
@@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
659 } else 661 } else
660 *eax = *ebx = *ecx = *edx = 0; 662 *eax = *ebx = *ecx = *edx = 0;
661} 663}
664EXPORT_SYMBOL_GPL(kvm_cpuid);
662 665
663void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 666void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
664{ 667{
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 58fc51488828..b7fd07984888 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
31 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 31 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
32} 32}
33 33
34static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
35{
36 struct kvm_cpuid_entry2 *best;
37
38 best = kvm_find_cpuid_entry(vcpu, 7, 0);
39 return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
40}
41
34static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) 42static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
35{ 43{
36 struct kvm_cpuid_entry2 *best; 44 struct kvm_cpuid_entry2 *best;
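guest_cpuid_has_tsc_adjust() pairs with the MSR_IA32_TSC_ADJUST definition and the new ia32_tsc_adjust_msr field: writes to the MSR should only be honoured when the guest was given the CPUID bit, and guest-initiated writes should move the TSC offset by the delta. A sketch of that handling (adjust_tsc_offset_guest() is illustrative; the real logic lives in the x86.c MSR path):

	/* Sketch: service a write to MSR_IA32_TSC_ADJUST. */
	static void set_tsc_adjust(struct kvm_vcpu *vcpu, struct msr_data *msr)
	{
		if (!guest_cpuid_has_tsc_adjust(vcpu))
			return;

		if (!msr->host_initiated) {
			s64 adj = (s64)msr->data - vcpu->arch.ia32_tsc_adjust_msr;
			adjust_tsc_offset_guest(vcpu, adj);	/* illustrative helper */
		}
		vcpu->arch.ia32_tsc_adjust_msr = msr->data;
	}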
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bba39bfa1c4b..a27e76371108 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -676,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
676 addr.seg); 676 addr.seg);
677 if (!usable) 677 if (!usable)
678 goto bad; 678 goto bad;
679 /* code segment or read-only data segment */ 679 /* code segment in protected mode or read-only data segment */
680 if (((desc.type & 8) || !(desc.type & 2)) && write) 680 if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
681 || !(desc.type & 2)) && write)
681 goto bad; 682 goto bad;
682 /* unreadable code segment */ 683 /* unreadable code segment */
683 if (!fetch && (desc.type & 8) && !(desc.type & 2)) 684 if (!fetch && (desc.type & 8) && !(desc.type & 2))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43e9fadca5d0..9392f527f107 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
1011 local_irq_save(flags); 1011 local_irq_save(flags);
1012 1012
1013 now = apic->lapic_timer.timer.base->get_time(); 1013 now = apic->lapic_timer.timer.base->get_time();
1014 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 1014 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
1015 if (likely(tscdeadline > guest_tsc)) { 1015 if (likely(tscdeadline > guest_tsc)) {
1016 ns = (tscdeadline - guest_tsc) * 1000000ULL; 1016 ns = (tscdeadline - guest_tsc) * 1000000ULL;
1017 do_div(ns, this_tsc_khz); 1017 do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf958..01d7c2ad05f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2382 || (!vcpu->arch.mmu.direct_map && write_fault 2382 || (!vcpu->arch.mmu.direct_map && write_fault
2383 && !is_write_protection(vcpu) && !user_fault)) { 2383 && !is_write_protection(vcpu) && !user_fault)) {
2384 2384
2385 /*
2386 * There are two cases:
2387 * - another vcpu creates a new sp in the window between
2388 * mapping_level() and acquiring the mmu-lock.
2389 * - the new sp is created by the vcpu itself (page-fault
2390 * path) when the guest uses the target gfn as its own
2391 * page table.
2392 * Both cases can be handled by letting the guest retry the
2393 * access: it will refault, and then we can establish the
2394 * mapping with a small page.
2395 */
2385 if (level > PT_PAGE_TABLE_LEVEL && 2396 if (level > PT_PAGE_TABLE_LEVEL &&
2386 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2397 has_wrprotected_page(vcpu->kvm, gfn, level))
2387 ret = 1;
2388 drop_spte(vcpu->kvm, sptep);
2389 goto done; 2398 goto done;
2390 }
2391 2399
2392 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 2400 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2393 2401
@@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2505 mmu_free_roots(vcpu); 2513 mmu_free_roots(vcpu);
2506} 2514}
2507 2515
2516static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2517{
2518 int bit7;
2519
2520 bit7 = (gpte >> 7) & 1;
2521 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2522}
2523
2508static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2524static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2509 bool no_dirty_log) 2525 bool no_dirty_log)
2510{ 2526{
@@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2517 return gfn_to_pfn_memslot_atomic(slot, gfn); 2533 return gfn_to_pfn_memslot_atomic(slot, gfn);
2518} 2534}
2519 2535
2536static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
2537 struct kvm_mmu_page *sp, u64 *spte,
2538 u64 gpte)
2539{
2540 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
2541 goto no_present;
2542
2543 if (!is_present_gpte(gpte))
2544 goto no_present;
2545
2546 if (!(gpte & PT_ACCESSED_MASK))
2547 goto no_present;
2548
2549 return false;
2550
2551no_present:
2552 drop_spte(vcpu->kvm, spte);
2553 return true;
2554}
2555
2520static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2556static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2521 struct kvm_mmu_page *sp, 2557 struct kvm_mmu_page *sp,
2522 u64 *start, u64 *end) 2558 u64 *start, u64 *end)
@@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2671 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done 2707 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2672 * here. 2708 * here.
2673 */ 2709 */
2674 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && 2710 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2675 level == PT_PAGE_TABLE_LEVEL && 2711 level == PT_PAGE_TABLE_LEVEL &&
2676 PageTransCompound(pfn_to_page(pfn)) && 2712 PageTransCompound(pfn_to_page(pfn)) &&
2677 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { 2713 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2699 } 2735 }
2700} 2736}
2701 2737
2702static bool mmu_invalid_pfn(pfn_t pfn)
2703{
2704 return unlikely(is_invalid_pfn(pfn));
2705}
2706
2707static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, 2738static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2708 pfn_t pfn, unsigned access, int *ret_val) 2739 pfn_t pfn, unsigned access, int *ret_val)
2709{ 2740{
2710 bool ret = true; 2741 bool ret = true;
2711 2742
2712 /* The pfn is invalid, report the error! */ 2743 /* The pfn is invalid, report the error! */
2713 if (unlikely(is_invalid_pfn(pfn))) { 2744 if (unlikely(is_error_pfn(pfn))) {
2714 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); 2745 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
2715 goto exit; 2746 goto exit;
2716 } 2747 }
@@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2862 return r; 2893 return r;
2863 2894
2864 spin_lock(&vcpu->kvm->mmu_lock); 2895 spin_lock(&vcpu->kvm->mmu_lock);
2865 if (mmu_notifier_retry(vcpu, mmu_seq)) 2896 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
2866 goto out_unlock; 2897 goto out_unlock;
2867 kvm_mmu_free_some_pages(vcpu); 2898 kvm_mmu_free_some_pages(vcpu);
2868 if (likely(!force_pt_level)) 2899 if (likely(!force_pt_level))
@@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3331 return r; 3362 return r;
3332 3363
3333 spin_lock(&vcpu->kvm->mmu_lock); 3364 spin_lock(&vcpu->kvm->mmu_lock);
3334 if (mmu_notifier_retry(vcpu, mmu_seq)) 3365 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3335 goto out_unlock; 3366 goto out_unlock;
3336 kvm_mmu_free_some_pages(vcpu); 3367 kvm_mmu_free_some_pages(vcpu);
3337 if (likely(!force_pt_level)) 3368 if (likely(!force_pt_level))
@@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
3399 nonpaging_free(vcpu); 3430 nonpaging_free(vcpu);
3400} 3431}
3401 3432
3402static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3403{
3404 int bit7;
3405
3406 bit7 = (gpte >> 7) & 1;
3407 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
3408}
3409
3410static inline void protect_clean_gpte(unsigned *access, unsigned gpte) 3433static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3411{ 3434{
3412 unsigned mask; 3435 unsigned mask;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 714e2c01a6fe..891eb6d93b8b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
305 addr, access); 305 addr, access);
306} 306}
307 307
308static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, 308static bool
309 struct kvm_mmu_page *sp, u64 *spte, 309FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
310 pt_element_t gpte) 310 u64 *spte, pt_element_t gpte, bool no_dirty_log)
311{ 311{
312 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
313 goto no_present;
314
315 if (!is_present_gpte(gpte))
316 goto no_present;
317
318 if (!(gpte & PT_ACCESSED_MASK))
319 goto no_present;
320
321 return false;
322
323no_present:
324 drop_spte(vcpu->kvm, spte);
325 return true;
326}
327
328static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
329 u64 *spte, const void *pte)
330{
331 pt_element_t gpte;
332 unsigned pte_access; 312 unsigned pte_access;
313 gfn_t gfn;
333 pfn_t pfn; 314 pfn_t pfn;
334 315
335 gpte = *(const pt_element_t *)pte; 316 if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
336 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 317 return false;
337 return;
338 318
339 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 319 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
320
321 gfn = gpte_to_gfn(gpte);
340 pte_access = sp->role.access & gpte_access(vcpu, gpte); 322 pte_access = sp->role.access & gpte_access(vcpu, gpte);
341 protect_clean_gpte(&pte_access, gpte); 323 protect_clean_gpte(&pte_access, gpte);
342 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 324 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
343 if (mmu_invalid_pfn(pfn)) 325 no_dirty_log && (pte_access & ACC_WRITE_MASK));
344 return; 326 if (is_error_pfn(pfn))
327 return false;
345 328
346 /* 329 /*
347 * we call mmu_set_spte() with host_writable = true because that 330 * we call mmu_set_spte() with host_writable = true because
348 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 331 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
349 */ 332 */
350 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 333 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
351 NULL, PT_PAGE_TABLE_LEVEL, 334 NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
352 gpte_to_gfn(gpte), pfn, true, true); 335
336 return true;
337}
338
339static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
340 u64 *spte, const void *pte)
341{
342 pt_element_t gpte = *(const pt_element_t *)pte;
343
344 FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
353} 345}
354 346
355static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 347static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
395 spte = sp->spt + i; 387 spte = sp->spt + i;
396 388
397 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 389 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
398 pt_element_t gpte;
399 unsigned pte_access;
400 gfn_t gfn;
401 pfn_t pfn;
402
403 if (spte == sptep) 390 if (spte == sptep)
404 continue; 391 continue;
405 392
406 if (is_shadow_present_pte(*spte)) 393 if (is_shadow_present_pte(*spte))
407 continue; 394 continue;
408 395
409 gpte = gptep[i]; 396 if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
410
411 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
412 continue;
413
414 pte_access = sp->role.access & gpte_access(vcpu, gpte);
415 protect_clean_gpte(&pte_access, gpte);
416 gfn = gpte_to_gfn(gpte);
417 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
418 pte_access & ACC_WRITE_MASK);
419 if (mmu_invalid_pfn(pfn))
420 break; 397 break;
421
422 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
423 NULL, PT_PAGE_TABLE_LEVEL, gfn,
424 pfn, true, true);
425 } 398 }
426} 399}
427 400
428/* 401/*
429 * Fetch a shadow pte for a specific level in the paging hierarchy. 402 * Fetch a shadow pte for a specific level in the paging hierarchy.
403 * If the guest tries to write a write-protected page, we need to
404 * emulate this operation; return 1 to indicate that case.
430 */ 405 */
431static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 406static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
432 struct guest_walker *gw, 407 struct guest_walker *gw,
433 int user_fault, int write_fault, int hlevel, 408 int user_fault, int write_fault, int hlevel,
434 int *emulate, pfn_t pfn, bool map_writable, 409 pfn_t pfn, bool map_writable, bool prefault)
435 bool prefault)
436{ 410{
437 unsigned access = gw->pt_access;
438 struct kvm_mmu_page *sp = NULL; 411 struct kvm_mmu_page *sp = NULL;
439 int top_level;
440 unsigned direct_access;
441 struct kvm_shadow_walk_iterator it; 412 struct kvm_shadow_walk_iterator it;
413 unsigned direct_access, access = gw->pt_access;
414 int top_level, emulate = 0;
442 415
443 if (!is_present_gpte(gw->ptes[gw->level - 1])) 416 if (!is_present_gpte(gw->ptes[gw->level - 1]))
444 return NULL; 417 return 0;
445 418
446 direct_access = gw->pte_access; 419 direct_access = gw->pte_access;
447 420
@@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
505 478
506 clear_sp_write_flooding_count(it.sptep); 479 clear_sp_write_flooding_count(it.sptep);
507 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, 480 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
508 user_fault, write_fault, emulate, it.level, 481 user_fault, write_fault, &emulate, it.level,
509 gw->gfn, pfn, prefault, map_writable); 482 gw->gfn, pfn, prefault, map_writable);
510 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 483 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
511 484
512 return it.sptep; 485 return emulate;
513 486
514out_gpte_changed: 487out_gpte_changed:
515 if (sp) 488 if (sp)
516 kvm_mmu_put_page(sp, it.sptep); 489 kvm_mmu_put_page(sp, it.sptep);
517 kvm_release_pfn_clean(pfn); 490 kvm_release_pfn_clean(pfn);
518 return NULL; 491 return 0;
519} 492}
520 493
521/* 494/*
@@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
538 int write_fault = error_code & PFERR_WRITE_MASK; 511 int write_fault = error_code & PFERR_WRITE_MASK;
539 int user_fault = error_code & PFERR_USER_MASK; 512 int user_fault = error_code & PFERR_USER_MASK;
540 struct guest_walker walker; 513 struct guest_walker walker;
541 u64 *sptep;
542 int emulate = 0;
543 int r; 514 int r;
544 pfn_t pfn; 515 pfn_t pfn;
545 int level = PT_PAGE_TABLE_LEVEL; 516 int level = PT_PAGE_TABLE_LEVEL;
@@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
594 return r; 565 return r;
595 566
596 spin_lock(&vcpu->kvm->mmu_lock); 567 spin_lock(&vcpu->kvm->mmu_lock);
597 if (mmu_notifier_retry(vcpu, mmu_seq)) 568 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
598 goto out_unlock; 569 goto out_unlock;
599 570
600 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 571 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
601 kvm_mmu_free_some_pages(vcpu); 572 kvm_mmu_free_some_pages(vcpu);
602 if (!force_pt_level) 573 if (!force_pt_level)
603 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 574 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
604 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 575 r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
605 level, &emulate, pfn, map_writable, prefault); 576 level, pfn, map_writable, prefault);
606 (void)sptep;
607 pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
608 sptep, *sptep, emulate);
609
610 ++vcpu->stat.pf_fixed; 577 ++vcpu->stat.pf_fixed;
611 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 578 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
612 spin_unlock(&vcpu->kvm->mmu_lock); 579 spin_unlock(&vcpu->kvm->mmu_lock);
613 580
614 return emulate; 581 return r;
615 582
616out_unlock: 583out_unlock:
617 spin_unlock(&vcpu->kvm->mmu_lock); 584 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
757 sizeof(pt_element_t))) 724 sizeof(pt_element_t)))
758 return -EINVAL; 725 return -EINVAL;
759 726
760 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 727 if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
761 vcpu->kvm->tlbs_dirty++; 728 vcpu->kvm->tlbs_dirty++;
762 continue; 729 continue;
763 } 730 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d017df3899ef..d29d3cd1c156 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -20,6 +20,7 @@
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h" 22#include "x86.h"
23#include "cpuid.h"
23 24
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mod_devicetable.h> 26#include <linux/mod_devicetable.h>
@@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage)
630 return -EBUSY; 631 return -EBUSY;
631 632
632 if (!has_svm()) { 633 if (!has_svm()) {
633 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", 634 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
634 me);
635 return -EINVAL; 635 return -EINVAL;
636 } 636 }
637 sd = per_cpu(svm_data, me); 637 sd = per_cpu(svm_data, me);
638
639 if (!sd) { 638 if (!sd) {
640 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", 639 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
641 me);
642 return -EINVAL; 640 return -EINVAL;
643 } 641 }
644 642
@@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1012 svm->tsc_ratio = ratio; 1010 svm->tsc_ratio = ratio;
1013} 1011}
1014 1012
1013static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1014{
1015 struct vcpu_svm *svm = to_svm(vcpu);
1016
1017 return svm->vmcb->control.tsc_offset;
1018}
1019
1015static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1020static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1016{ 1021{
1017 struct vcpu_svm *svm = to_svm(vcpu); 1022 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm)
1189static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 1194static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1190{ 1195{
1191 struct vcpu_svm *svm = to_svm(vcpu); 1196 struct vcpu_svm *svm = to_svm(vcpu);
1197 u32 dummy;
1198 u32 eax = 1;
1192 1199
1193 init_vmcb(svm); 1200 init_vmcb(svm);
1194 1201
@@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1197 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 1204 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
1198 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 1205 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
1199 } 1206 }
1200 vcpu->arch.regs_avail = ~0; 1207
1201 vcpu->arch.regs_dirty = ~0; 1208 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1209 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1202 1210
1203 return 0; 1211 return 0;
1204} 1212}
@@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1254 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1262 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1255 svm->asid_generation = 0; 1263 svm->asid_generation = 0;
1256 init_vmcb(svm); 1264 init_vmcb(svm);
1257 kvm_write_tsc(&svm->vcpu, 0);
1258
1259 err = fx_init(&svm->vcpu);
1260 if (err)
1261 goto free_page4;
1262 1265
1263 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 1266 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1264 if (kvm_vcpu_is_bsp(&svm->vcpu)) 1267 if (kvm_vcpu_is_bsp(&svm->vcpu))
@@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1268 1271
1269 return &svm->vcpu; 1272 return &svm->vcpu;
1270 1273
1271free_page4:
1272 __free_page(hsave_page);
1273free_page3: 1274free_page3:
1274 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 1275 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1275free_page2: 1276free_page2:
@@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm)
3008 return 0; 3009 return 0;
3009} 3010}
3010 3011
3011u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) 3012u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3012{ 3013{
3013 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); 3014 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3014 return vmcb->control.tsc_offset + 3015 return vmcb->control.tsc_offset +
3015 svm_scale_tsc(vcpu, native_read_tsc()); 3016 svm_scale_tsc(vcpu, host_tsc);
3016} 3017}
3017 3018
3018static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 3019static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
@@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3131 return 0; 3132 return 0;
3132} 3133}
3133 3134
3134static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 3135static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3135{ 3136{
3136 struct vcpu_svm *svm = to_svm(vcpu); 3137 struct vcpu_svm *svm = to_svm(vcpu);
3137 3138
3139 u32 ecx = msr->index;
3140 u64 data = msr->data;
3138 switch (ecx) { 3141 switch (ecx) {
3139 case MSR_IA32_TSC: 3142 case MSR_IA32_TSC:
3140 kvm_write_tsc(vcpu, data); 3143 kvm_write_tsc(vcpu, msr);
3141 break; 3144 break;
3142 case MSR_STAR: 3145 case MSR_STAR:
3143 svm->vmcb->save.star = data; 3146 svm->vmcb->save.star = data;
@@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3192 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3195 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3193 break; 3196 break;
3194 default: 3197 default:
3195 return kvm_set_msr_common(vcpu, ecx, data); 3198 return kvm_set_msr_common(vcpu, msr);
3196 } 3199 }
3197 return 0; 3200 return 0;
3198} 3201}
3199 3202
3200static int wrmsr_interception(struct vcpu_svm *svm) 3203static int wrmsr_interception(struct vcpu_svm *svm)
3201{ 3204{
3205 struct msr_data msr;
3202 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3206 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3203 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 3207 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
3204 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3208 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3205 3209
3210 msr.data = data;
3211 msr.index = ecx;
3212 msr.host_initiated = false;
3206 3213
3207 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3214 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3208 if (svm_set_msr(&svm->vcpu, ecx, data)) { 3215 if (svm_set_msr(&svm->vcpu, &msr)) {
3209 trace_kvm_msr_write_ex(ecx, data); 3216 trace_kvm_msr_write_ex(ecx, data);
3210 kvm_inject_gp(&svm->vcpu, 0); 3217 kvm_inject_gp(&svm->vcpu, 0);
3211 } else { 3218 } else {
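
The hunks above convert svm_set_msr() and kvm_set_msr_common() from a bare (index, data) pair to a struct msr_data, so that the host_initiated flag travels with every MSR write. The structure itself is declared in arch/x86/include/asm/kvm_host.h (listed in the diffstat but not quoted in this section); judging from how wrmsr_interception() and do_set_msr() fill it in, it presumably looks like this minimal sketch:

	struct msr_data {
		bool host_initiated;	/* true for userspace (ioctl) writes, false for guest WRMSR */
		u32 index;		/* MSR number (ECX on a guest WRMSR) */
		u64 data;		/* value being written (EDX:EAX on a guest WRMSR) */
	};

Guest-triggered writes set host_initiated = false, which is what later lets kvm_write_tsc() and the MSR_IA32_TSC_ADJUST handling in x86.c tell a guest write apart from a value restored by userspace.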
@@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4302 .has_wbinvd_exit = svm_has_wbinvd_exit, 4309 .has_wbinvd_exit = svm_has_wbinvd_exit,
4303 4310
4304 .set_tsc_khz = svm_set_tsc_khz, 4311 .set_tsc_khz = svm_set_tsc_khz,
4312 .read_tsc_offset = svm_read_tsc_offset,
4305 .write_tsc_offset = svm_write_tsc_offset, 4313 .write_tsc_offset = svm_write_tsc_offset,
4306 .adjust_tsc_offset = svm_adjust_tsc_offset, 4314 .adjust_tsc_offset = svm_adjust_tsc_offset,
4307 .compute_tsc_offset = svm_compute_tsc_offset, 4315 .compute_tsc_offset = svm_compute_tsc_offset,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dccb..fe5e00ed7036 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
4#include <linux/tracepoint.h> 4#include <linux/tracepoint.h>
5#include <asm/vmx.h> 5#include <asm/vmx.h>
6#include <asm/svm.h> 6#include <asm/svm.h>
7#include <asm/clocksource.h>
7 8
8#undef TRACE_SYSTEM 9#undef TRACE_SYSTEM
9#define TRACE_SYSTEM kvm 10#define TRACE_SYSTEM kvm
@@ -754,6 +755,68 @@ TRACE_EVENT(
754 __entry->write ? "Write" : "Read", 755 __entry->write ? "Write" : "Read",
755 __entry->gpa_match ? "GPA" : "GVA") 756 __entry->gpa_match ? "GPA" : "GVA")
756); 757);
758
759#ifdef CONFIG_X86_64
760
761#define host_clocks \
762 {VCLOCK_NONE, "none"}, \
763 {VCLOCK_TSC, "tsc"}, \
764 {VCLOCK_HPET, "hpet"} \
765
766TRACE_EVENT(kvm_update_master_clock,
767 TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
768 TP_ARGS(use_master_clock, host_clock, offset_matched),
769
770 TP_STRUCT__entry(
771 __field( bool, use_master_clock )
772 __field( unsigned int, host_clock )
773 __field( bool, offset_matched )
774 ),
775
776 TP_fast_assign(
777 __entry->use_master_clock = use_master_clock;
778 __entry->host_clock = host_clock;
779 __entry->offset_matched = offset_matched;
780 ),
781
782 TP_printk("masterclock %d hostclock %s offsetmatched %u",
783 __entry->use_master_clock,
784 __print_symbolic(__entry->host_clock, host_clocks),
785 __entry->offset_matched)
786);
787
788TRACE_EVENT(kvm_track_tsc,
789 TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
790 unsigned int online_vcpus, bool use_master_clock,
791 unsigned int host_clock),
792 TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
793 host_clock),
794
795 TP_STRUCT__entry(
796 __field( unsigned int, vcpu_id )
797 __field( unsigned int, nr_vcpus_matched_tsc )
798 __field( unsigned int, online_vcpus )
799 __field( bool, use_master_clock )
800 __field( unsigned int, host_clock )
801 ),
802
803 TP_fast_assign(
804 __entry->vcpu_id = vcpu_id;
805 __entry->nr_vcpus_matched_tsc = nr_matched;
806 __entry->online_vcpus = online_vcpus;
807 __entry->use_master_clock = use_master_clock;
808 __entry->host_clock = host_clock;
809 ),
810
811 TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
812 " hostclock %s",
813 __entry->vcpu_id, __entry->use_master_clock,
814 __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
815 __print_symbolic(__entry->host_clock, host_clocks))
816);
817
818#endif /* CONFIG_X86_64 */
819
757#endif /* _TRACE_KVM_H */ 820#endif /* _TRACE_KVM_H */
758 821
759#undef TRACE_INCLUDE_PATH 822#undef TRACE_INCLUDE_PATH
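
Once applied, the two tracepoints added above can be enabled through the usual ftrace event interface, e.g. echo 1 > /sys/kernel/debug/tracing/events/kvm/kvm_track_tsc/enable (and likewise for kvm_update_master_clock, assuming the standard debugfs mount point), which makes it possible to watch, per TSC write, whether the vcpus' TSCs matched and whether the master clock was engaged.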
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc6..9120ae1901e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
42#include <asm/i387.h> 42#include <asm/i387.h>
43#include <asm/xcr.h> 43#include <asm/xcr.h>
44#include <asm/perf_event.h> 44#include <asm/perf_event.h>
45#include <asm/kexec.h>
45 46
46#include "trace.h" 47#include "trace.h"
47 48
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
802 return vmx_capability.ept & VMX_EPT_AD_BIT; 803 return vmx_capability.ept & VMX_EPT_AD_BIT;
803} 804}
804 805
805static inline bool cpu_has_vmx_invept_individual_addr(void)
806{
807 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
808}
809
810static inline bool cpu_has_vmx_invept_context(void) 806static inline bool cpu_has_vmx_invept_context(void)
811{ 807{
812 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; 808 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
992 vmcs, phys_addr); 988 vmcs, phys_addr);
993} 989}
994 990
991#ifdef CONFIG_KEXEC
992/*
993 * This bitmap indicates, per cpu, whether the vmclear
994 * operation is enabled. All cpus are disabled by
995 * default.
996 */
997static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
998
999static inline void crash_enable_local_vmclear(int cpu)
1000{
1001 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1002}
1003
1004static inline void crash_disable_local_vmclear(int cpu)
1005{
1006 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1007}
1008
1009static inline int crash_local_vmclear_enabled(int cpu)
1010{
1011 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1012}
1013
1014static void crash_vmclear_local_loaded_vmcss(void)
1015{
1016 int cpu = raw_smp_processor_id();
1017 struct loaded_vmcs *v;
1018
1019 if (!crash_local_vmclear_enabled(cpu))
1020 return;
1021
1022 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1023 loaded_vmcss_on_cpu_link)
1024 vmcs_clear(v->vmcs);
1025}
1026#else
1027static inline void crash_enable_local_vmclear(int cpu) { }
1028static inline void crash_disable_local_vmclear(int cpu) { }
1029#endif /* CONFIG_KEXEC */
1030
995static void __loaded_vmcs_clear(void *arg) 1031static void __loaded_vmcs_clear(void *arg)
996{ 1032{
997 struct loaded_vmcs *loaded_vmcs = arg; 1033 struct loaded_vmcs *loaded_vmcs = arg;
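
The CONFIG_KEXEC block above only provides the per-cpu gating and the VMCLEAR walker; the hook-up to the crash path happens at the bottom of this file, where vmx_init()/vmx_exit() publish and withdraw crash_vmclear_local_loaded_vmcss through the RCU-protected pointer crash_vmclear_loaded_vmcss. The consumer lives in arch/x86/kernel/crash.c (in the diffstat, not quoted in this section) and presumably dereferences that pointer under rcu_read_lock(), roughly along these lines (a sketch only; the crash_vmclear_fn typedef is assumed to come from asm/kexec.h):

	/* Assumed shape of the kdump-side caller; the names follow the
	 * vmx_init()/vmx_exit() hunks later in this diff, the body does not. */
	static void cpu_crash_vmclear_loaded_vmcss(void)
	{
		crash_vmclear_fn *do_vmclear;

		rcu_read_lock();
		do_vmclear = rcu_dereference(crash_vmclear_loaded_vmcss);
		if (do_vmclear)
			do_vmclear();	/* VMCLEARs every VMCS loaded on this cpu */
		rcu_read_unlock();
	}

The synchronize_rcu() added to vmx_exit() below ensures that a crash handler which already fetched the old pointer has finished before the kvm_intel module text goes away.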
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
1001 return; /* vcpu migration can race with cpu offline */ 1037 return; /* vcpu migration can race with cpu offline */
1002 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 1038 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1003 per_cpu(current_vmcs, cpu) = NULL; 1039 per_cpu(current_vmcs, cpu) = NULL;
1040 crash_disable_local_vmclear(cpu);
1004 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 1041 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1042
1043 /*
1044 * Ensure that the update of loaded_vmcs->loaded_vmcss_on_cpu_link
1045 * happens before loaded_vmcs->cpu is set to -1 in
1046 * loaded_vmcs_init. Otherwise another cpu could see cpu == -1
1047 * first and add the vmcs to the percpu list before it is deleted.
1048 */
1049 smp_wmb();
1050
1005 loaded_vmcs_init(loaded_vmcs); 1051 loaded_vmcs_init(loaded_vmcs);
1052 crash_enable_local_vmclear(cpu);
1006} 1053}
1007 1054
1008static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 1055static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1009{ 1056{
1010 if (loaded_vmcs->cpu != -1) 1057 int cpu = loaded_vmcs->cpu;
1011 smp_call_function_single( 1058
1012 loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); 1059 if (cpu != -1)
1060 smp_call_function_single(cpu,
1061 __loaded_vmcs_clear, loaded_vmcs, 1);
1013} 1062}
1014 1063
1015static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 1064static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
1051 } 1100 }
1052} 1101}
1053 1102
1054static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
1055{
1056 if (enable_ept) {
1057 if (cpu_has_vmx_invept_individual_addr())
1058 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
1059 eptp, gpa);
1060 else
1061 ept_sync_context(eptp);
1062 }
1063}
1064
1065static __always_inline unsigned long vmcs_readl(unsigned long field) 1103static __always_inline unsigned long vmcs_readl(unsigned long field)
1066{ 1104{
1067 unsigned long value; 1105 unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1535 1573
1536 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1574 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1537 local_irq_disable(); 1575 local_irq_disable();
1576 crash_disable_local_vmclear(cpu);
1577
1578 /*
1579 * The read of loaded_vmcs->cpu must happen before fetching
1580 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1581 * See the comments in __loaded_vmcs_clear().
1582 */
1583 smp_rmb();
1584
1538 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1585 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1539 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1586 &per_cpu(loaded_vmcss_on_cpu, cpu));
1587 crash_enable_local_vmclear(cpu);
1540 local_irq_enable(); 1588 local_irq_enable();
1541 1589
1542 /* 1590 /*
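
The smp_rmb() added here pairs with the smp_wmb() added in __loaded_vmcs_clear() above. Stripped of the VMX specifics, the pattern is the usual publish/observe pairing; the following is an illustrative sketch only (not code from this diff, and it omits the <linux/list.h> and barrier includes a real build would need):

	struct entry {
		int cpu;			/* -1 while not loaded on any cpu */
		struct list_head link;		/* membership in a per-cpu list */
	};

	static void clear_on_old_cpu(struct entry *e)	/* cf. __loaded_vmcs_clear() */
	{
		list_del(&e->link);
		smp_wmb();		/* make the unlink visible before cpu = -1 */
		e->cpu = -1;
	}

	static void load_on_new_cpu(struct entry *e, int cpu,
				    struct list_head *percpu_list)	/* cf. vmx_vcpu_load() */
	{
		if (e->cpu != cpu) {
			smp_rmb();	/* order the cpu test before using the list */
			list_add(&e->link, percpu_list);
			e->cpu = cpu;
		}
	}

Without the pairing, the loading cpu could observe cpu == -1 (and so decide to re-add the entry) while the list_del() issued on the old cpu is not yet visible to it, leaving the per-cpu list corrupted.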
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
1839 * Like guest_read_tsc, but always returns L1's notion of the timestamp 1887 * Like guest_read_tsc, but always returns L1's notion of the timestamp
1840 * counter, even if a nested guest (L2) is currently running. 1888 * counter, even if a nested guest (L2) is currently running.
1841 */ 1889 */
1842u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) 1890u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1843{ 1891{
1844 u64 host_tsc, tsc_offset; 1892 u64 tsc_offset;
1845 1893
1846 rdtscll(host_tsc);
1847 tsc_offset = is_guest_mode(vcpu) ? 1894 tsc_offset = is_guest_mode(vcpu) ?
1848 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 1895 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
1849 vmcs_read64(TSC_OFFSET); 1896 vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1866 WARN(1, "user requested TSC rate below hardware speed\n"); 1913 WARN(1, "user requested TSC rate below hardware speed\n");
1867} 1914}
1868 1915
1916static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
1917{
1918 return vmcs_read64(TSC_OFFSET);
1919}
1920
1869/* 1921/*
1870 * writes 'offset' into guest's timestamp counter offset register 1922 * writes 'offset' into guest's timestamp counter offset register
1871 */ 1923 */
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2202 * Returns 0 on success, non-0 otherwise. 2254 * Returns 0 on success, non-0 otherwise.
2203 * Assumes vcpu_load() was already called. 2255 * Assumes vcpu_load() was already called.
2204 */ 2256 */
2205static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2257static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2206{ 2258{
2207 struct vcpu_vmx *vmx = to_vmx(vcpu); 2259 struct vcpu_vmx *vmx = to_vmx(vcpu);
2208 struct shared_msr_entry *msr; 2260 struct shared_msr_entry *msr;
2209 int ret = 0; 2261 int ret = 0;
2262 u32 msr_index = msr_info->index;
2263 u64 data = msr_info->data;
2210 2264
2211 switch (msr_index) { 2265 switch (msr_index) {
2212 case MSR_EFER: 2266 case MSR_EFER:
2213 ret = kvm_set_msr_common(vcpu, msr_index, data); 2267 ret = kvm_set_msr_common(vcpu, msr_info);
2214 break; 2268 break;
2215#ifdef CONFIG_X86_64 2269#ifdef CONFIG_X86_64
2216 case MSR_FS_BASE: 2270 case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2236 vmcs_writel(GUEST_SYSENTER_ESP, data); 2290 vmcs_writel(GUEST_SYSENTER_ESP, data);
2237 break; 2291 break;
2238 case MSR_IA32_TSC: 2292 case MSR_IA32_TSC:
2239 kvm_write_tsc(vcpu, data); 2293 kvm_write_tsc(vcpu, msr_info);
2240 break; 2294 break;
2241 case MSR_IA32_CR_PAT: 2295 case MSR_IA32_CR_PAT:
2242 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2296 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2244 vcpu->arch.pat = data; 2298 vcpu->arch.pat = data;
2245 break; 2299 break;
2246 } 2300 }
2247 ret = kvm_set_msr_common(vcpu, msr_index, data); 2301 ret = kvm_set_msr_common(vcpu, msr_info);
2302 break;
2303 case MSR_IA32_TSC_ADJUST:
2304 ret = kvm_set_msr_common(vcpu, msr_info);
2248 break; 2305 break;
2249 case MSR_TSC_AUX: 2306 case MSR_TSC_AUX:
2250 if (!vmx->rdtscp_enabled) 2307 if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2267 } 2324 }
2268 break; 2325 break;
2269 } 2326 }
2270 ret = kvm_set_msr_common(vcpu, msr_index, data); 2327 ret = kvm_set_msr_common(vcpu, msr_info);
2271 } 2328 }
2272 2329
2273 return ret; 2330 return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
2341 return -EBUSY; 2398 return -EBUSY;
2342 2399
2343 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2400 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2401
2402 /*
2403 * Now we can enable the vmclear operation in kdump
2404 * since the loaded_vmcss_on_cpu list on this cpu
2405 * has been initialized.
2406 *
2407 * Though the cpu is not in VMX operation yet, there
2408 * is no harm in enabling the vmclear operation here
2409 * because the loaded_vmcss_on_cpu list is empty.
2410 */
2411 crash_enable_local_vmclear(cpu);
2412
2344 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2413 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2345 2414
2346 test_bits = FEATURE_CONTROL_LOCKED; 2415 test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
2697 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { 2766 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
2698 tmp.base = vmcs_readl(sf->base); 2767 tmp.base = vmcs_readl(sf->base);
2699 tmp.selector = vmcs_read16(sf->selector); 2768 tmp.selector = vmcs_read16(sf->selector);
2769 tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
2700 tmp.s = 1; 2770 tmp.s = 1;
2701 } 2771 }
2702 vmx_set_segment(vcpu, &tmp, seg); 2772 vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3246 * unrestricted guest like Westmere to older hosts that don't have 3316 * unrestricted guest like Westmere to older hosts that don't have
3247 * unrestricted guest like Nehalem. 3317 * unrestricted guest like Nehalem.
3248 */ 3318 */
3249 if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { 3319 if (vmx->rmode.vm86_active) {
3250 switch (seg) { 3320 switch (seg) {
3251 case VCPU_SREG_CS: 3321 case VCPU_SREG_CS:
3252 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 3322 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3897 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3967 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
3898 set_cr4_guest_host_mask(vmx); 3968 set_cr4_guest_host_mask(vmx);
3899 3969
3900 kvm_write_tsc(&vmx->vcpu, 0);
3901
3902 return 0; 3970 return 0;
3903} 3971}
3904 3972
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3908 u64 msr; 3976 u64 msr;
3909 int ret; 3977 int ret;
3910 3978
3911 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3912
3913 vmx->rmode.vm86_active = 0; 3979 vmx->rmode.vm86_active = 0;
3914 3980
3915 vmx->soft_vnmi_blocked = 0; 3981 vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3921 msr |= MSR_IA32_APICBASE_BSP; 3987 msr |= MSR_IA32_APICBASE_BSP;
3922 kvm_set_apic_base(&vmx->vcpu, msr); 3988 kvm_set_apic_base(&vmx->vcpu, msr);
3923 3989
3924 ret = fx_init(&vmx->vcpu);
3925 if (ret != 0)
3926 goto out;
3927
3928 vmx_segment_cache_clear(vmx); 3990 vmx_segment_cache_clear(vmx);
3929 3991
3930 seg_setup(VCPU_SREG_CS); 3992 seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3965 kvm_rip_write(vcpu, 0xfff0); 4027 kvm_rip_write(vcpu, 0xfff0);
3966 else 4028 else
3967 kvm_rip_write(vcpu, 0); 4029 kvm_rip_write(vcpu, 0);
3968 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
3969 4030
3970 vmcs_writel(GUEST_GDTR_BASE, 0); 4031 vmcs_writel(GUEST_GDTR_BASE, 0);
3971 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4032 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4015 /* HACK: Don't enable emulation on guest boot/reset */ 4076 /* HACK: Don't enable emulation on guest boot/reset */
4016 vmx->emulation_required = 0; 4077 vmx->emulation_required = 0;
4017 4078
4018out:
4019 return ret; 4079 return ret;
4020} 4080}
4021 4081
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4287 if (is_machine_check(intr_info)) 4347 if (is_machine_check(intr_info))
4288 return handle_machine_check(vcpu); 4348 return handle_machine_check(vcpu);
4289 4349
4290 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4291 !is_page_fault(intr_info)) {
4292 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4293 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4294 vcpu->run->internal.ndata = 2;
4295 vcpu->run->internal.data[0] = vect_info;
4296 vcpu->run->internal.data[1] = intr_info;
4297 return 0;
4298 }
4299
4300 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 4350 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
4301 return 1; /* already handled by vmx_vcpu_run() */ 4351 return 1; /* already handled by vmx_vcpu_run() */
4302 4352
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4315 error_code = 0; 4365 error_code = 0;
4316 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 4366 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4317 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4367 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4368
4369 /*
4370 * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing
4371 * MMIO, so it is better to report an internal error.
4372 * See the comments in vmx_handle_exit.
4373 */
4374 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4375 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4376 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4377 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4378 vcpu->run->internal.ndata = 2;
4379 vcpu->run->internal.data[0] = vect_info;
4380 vcpu->run->internal.data[1] = intr_info;
4381 return 0;
4382 }
4383
4318 if (is_page_fault(intr_info)) { 4384 if (is_page_fault(intr_info)) {
4319 /* EPT won't cause page fault directly */ 4385 /* EPT won't cause page fault directly */
4320 BUG_ON(enable_ept); 4386 BUG_ON(enable_ept);
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
4626 4692
4627static int handle_wrmsr(struct kvm_vcpu *vcpu) 4693static int handle_wrmsr(struct kvm_vcpu *vcpu)
4628{ 4694{
4695 struct msr_data msr;
4629 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4696 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
4630 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 4697 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
4631 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 4698 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
4632 4699
4633 if (vmx_set_msr(vcpu, ecx, data) != 0) { 4700 msr.data = data;
4701 msr.index = ecx;
4702 msr.host_initiated = false;
4703 if (vmx_set_msr(vcpu, &msr) != 0) {
4634 trace_kvm_msr_write_ex(ecx, data); 4704 trace_kvm_msr_write_ex(ecx, data);
4635 kvm_inject_gp(vcpu, 0); 4705 kvm_inject_gp(vcpu, 0);
4636 return 1; 4706 return 1;
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4827 4897
4828 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4898 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4829 4899
4830 if (exit_qualification & (1 << 6)) {
4831 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
4832 return -EINVAL;
4833 }
4834
4835 gla_validity = (exit_qualification >> 7) & 0x3; 4900 gla_validity = (exit_qualification >> 7) & 0x3;
4836 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { 4901 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
4837 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 4902 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
5979 return 0; 6044 return 0;
5980 } 6045 }
5981 6046
6047 /*
6048 * Note:
6049 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
6050 * event delivery, since that indicates the guest is accessing MMIO.
6051 * The vm-exit would be triggered again after returning to the
6052 * guest, causing an infinite loop.
6053 */
5982 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6054 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
5983 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 6055 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
5984 exit_reason != EXIT_REASON_EPT_VIOLATION && 6056 exit_reason != EXIT_REASON_EPT_VIOLATION &&
5985 exit_reason != EXIT_REASON_TASK_SWITCH)) 6057 exit_reason != EXIT_REASON_TASK_SWITCH)) {
5986 printk(KERN_WARNING "%s: unexpected, valid vectoring info " 6058 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5987 "(0x%x) and exit reason is 0x%x\n", 6059 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
5988 __func__, vectoring_info, exit_reason); 6060 vcpu->run->internal.ndata = 2;
6061 vcpu->run->internal.data[0] = vectoring_info;
6062 vcpu->run->internal.data[1] = exit_reason;
6063 return 0;
6064 }
5989 6065
5990 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6066 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
5991 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6067 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7309 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 7385 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
7310 7386
7311 .set_tsc_khz = vmx_set_tsc_khz, 7387 .set_tsc_khz = vmx_set_tsc_khz,
7388 .read_tsc_offset = vmx_read_tsc_offset,
7312 .write_tsc_offset = vmx_write_tsc_offset, 7389 .write_tsc_offset = vmx_write_tsc_offset,
7313 .adjust_tsc_offset = vmx_adjust_tsc_offset, 7390 .adjust_tsc_offset = vmx_adjust_tsc_offset,
7314 .compute_tsc_offset = vmx_compute_tsc_offset, 7391 .compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
7367 if (r) 7444 if (r)
7368 goto out3; 7445 goto out3;
7369 7446
7447#ifdef CONFIG_KEXEC
7448 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
7449 crash_vmclear_local_loaded_vmcss);
7450#endif
7451
7370 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 7452 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
7371 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 7453 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
7372 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 7454 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
7404 free_page((unsigned long)vmx_io_bitmap_b); 7486 free_page((unsigned long)vmx_io_bitmap_b);
7405 free_page((unsigned long)vmx_io_bitmap_a); 7487 free_page((unsigned long)vmx_io_bitmap_a);
7406 7488
7489#ifdef CONFIG_KEXEC
7490 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
7491 synchronize_rcu();
7492#endif
7493
7407 kvm_exit(); 7494 kvm_exit();
7408} 7495}
7409 7496
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f7641756be2..76f54461f7cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,8 @@
46#include <linux/uaccess.h> 46#include <linux/uaccess.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/pci.h> 48#include <linux/pci.h>
49#include <linux/timekeeper_internal.h>
50#include <linux/pvclock_gtod.h>
49#include <trace/events/kvm.h> 51#include <trace/events/kvm.h>
50 52
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
158 160
159u64 __read_mostly host_xcr0; 161u64 __read_mostly host_xcr0;
160 162
161int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 163static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
164
165static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
162 166
163static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 167static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
164{ 168{
@@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
633 } 637 }
634 638
635 if (is_long_mode(vcpu)) { 639 if (is_long_mode(vcpu)) {
636 if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { 640 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
637 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 641 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
638 return 1; 642 return 1;
639 } else 643 } else
@@ -827,6 +831,7 @@ static u32 msrs_to_save[] = {
827static unsigned num_msrs_to_save; 831static unsigned num_msrs_to_save;
828 832
829static const u32 emulated_msrs[] = { 833static const u32 emulated_msrs[] = {
834 MSR_IA32_TSC_ADJUST,
830 MSR_IA32_TSCDEADLINE, 835 MSR_IA32_TSCDEADLINE,
831 MSR_IA32_MISC_ENABLE, 836 MSR_IA32_MISC_ENABLE,
832 MSR_IA32_MCG_STATUS, 837 MSR_IA32_MCG_STATUS,
@@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
886 * Returns 0 on success, non-0 otherwise. 891 * Returns 0 on success, non-0 otherwise.
887 * Assumes vcpu_load() was already called. 892 * Assumes vcpu_load() was already called.
888 */ 893 */
889int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 894int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
890{ 895{
891 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 896 return kvm_x86_ops->set_msr(vcpu, msr);
892} 897}
893 898
894/* 899/*
@@ -896,9 +901,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
896 */ 901 */
897static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 902static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
898{ 903{
899 return kvm_set_msr(vcpu, index, *data); 904 struct msr_data msr;
905
906 msr.data = *data;
907 msr.index = index;
908 msr.host_initiated = true;
909 return kvm_set_msr(vcpu, &msr);
900} 910}
901 911
912#ifdef CONFIG_X86_64
913struct pvclock_gtod_data {
914 seqcount_t seq;
915
916 struct { /* extract of a clocksource struct */
917 int vclock_mode;
918 cycle_t cycle_last;
919 cycle_t mask;
920 u32 mult;
921 u32 shift;
922 } clock;
923
924 /* open coded 'struct timespec' */
925 u64 monotonic_time_snsec;
926 time_t monotonic_time_sec;
927};
928
929static struct pvclock_gtod_data pvclock_gtod_data;
930
931static void update_pvclock_gtod(struct timekeeper *tk)
932{
933 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
934
935 write_seqcount_begin(&vdata->seq);
936
937 /* copy pvclock gtod data */
938 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
939 vdata->clock.cycle_last = tk->clock->cycle_last;
940 vdata->clock.mask = tk->clock->mask;
941 vdata->clock.mult = tk->mult;
942 vdata->clock.shift = tk->shift;
943
944 vdata->monotonic_time_sec = tk->xtime_sec
945 + tk->wall_to_monotonic.tv_sec;
946 vdata->monotonic_time_snsec = tk->xtime_nsec
947 + (tk->wall_to_monotonic.tv_nsec
948 << tk->shift);
949 while (vdata->monotonic_time_snsec >=
950 (((u64)NSEC_PER_SEC) << tk->shift)) {
951 vdata->monotonic_time_snsec -=
952 ((u64)NSEC_PER_SEC) << tk->shift;
953 vdata->monotonic_time_sec++;
954 }
955
956 write_seqcount_end(&vdata->seq);
957}
958#endif
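
update_pvclock_gtod() above is the writer side of a seqcount-protected snapshot; the readers are vgettsc()/do_monotonic() further down in this file. What this hunk does not show is how the function gets invoked: the series adds a pvclock_gtod notifier chain (see include/linux/pvclock_gtod.h in the diffstat) that the timekeeping core fires whenever the timekeeper is updated. The registration presumably looks roughly like the sketch below; the callback name is illustrative, and the real notifier also requests a master clock update, which is omitted here:

	static int pvclock_gtod_notify(struct notifier_block *nb,
				       unsigned long unused, void *priv)
	{
		struct timekeeper *tk = priv;	/* the timekeeping core passes its timekeeper */

		update_pvclock_gtod(tk);
		return 0;
	}

	static struct notifier_block pvclock_gtod_notifier = {
		.notifier_call = pvclock_gtod_notify,
	};

	/* registered once at init time, e.g. from kvm_arch_init():
	 *	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
	 */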
959
960
902static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 961static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
903{ 962{
904 int version; 963 int version;
@@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void)
995 return timespec_to_ns(&ts); 1054 return timespec_to_ns(&ts);
996} 1055}
997 1056
1057#ifdef CONFIG_X86_64
1058static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1059#endif
1060
998static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1061static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
999unsigned long max_tsc_khz; 1062unsigned long max_tsc_khz;
1000 1063
@@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1046 return tsc; 1109 return tsc;
1047} 1110}
1048 1111
1049void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) 1112void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1113{
1114#ifdef CONFIG_X86_64
1115 bool vcpus_matched;
1116 bool do_request = false;
1117 struct kvm_arch *ka = &vcpu->kvm->arch;
1118 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1119
1120 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1121 atomic_read(&vcpu->kvm->online_vcpus));
1122
1123 if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
1124 if (!ka->use_master_clock)
1125 do_request = 1;
1126
1127 if (!vcpus_matched && ka->use_master_clock)
1128 do_request = 1;
1129
1130 if (do_request)
1131 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1132
1133 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1134 atomic_read(&vcpu->kvm->online_vcpus),
1135 ka->use_master_clock, gtod->clock.vclock_mode);
1136#endif
1137}
1138
1139static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1140{
1141 u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1142 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1143}
1144
1145void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1050{ 1146{
1051 struct kvm *kvm = vcpu->kvm; 1147 struct kvm *kvm = vcpu->kvm;
1052 u64 offset, ns, elapsed; 1148 u64 offset, ns, elapsed;
1053 unsigned long flags; 1149 unsigned long flags;
1054 s64 usdiff; 1150 s64 usdiff;
1151 bool matched;
1152 u64 data = msr->data;
1055 1153
1056 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1154 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1057 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1155 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1094 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1192 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1095 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1193 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1096 } 1194 }
1195 matched = true;
1097 } else { 1196 } else {
1098 /* 1197 /*
1099 * We split periods of matched TSC writes into generations. 1198 * We split periods of matched TSC writes into generations.
@@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1108 kvm->arch.cur_tsc_nsec = ns; 1207 kvm->arch.cur_tsc_nsec = ns;
1109 kvm->arch.cur_tsc_write = data; 1208 kvm->arch.cur_tsc_write = data;
1110 kvm->arch.cur_tsc_offset = offset; 1209 kvm->arch.cur_tsc_offset = offset;
1210 matched = false;
1111 pr_debug("kvm: new tsc generation %u, clock %llu\n", 1211 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1112 kvm->arch.cur_tsc_generation, data); 1212 kvm->arch.cur_tsc_generation, data);
1113 } 1213 }
@@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1129 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1229 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1130 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1230 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1131 1231
1232 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1233 update_ia32_tsc_adjust_msr(vcpu, offset);
1132 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1234 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1133 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1235 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1236
1237 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1238 if (matched)
1239 kvm->arch.nr_vcpus_matched_tsc++;
1240 else
1241 kvm->arch.nr_vcpus_matched_tsc = 0;
1242
1243 kvm_track_tsc_matching(vcpu);
1244 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1134} 1245}
1135 1246
1136EXPORT_SYMBOL_GPL(kvm_write_tsc); 1247EXPORT_SYMBOL_GPL(kvm_write_tsc);
1137 1248
1249#ifdef CONFIG_X86_64
1250
1251static cycle_t read_tsc(void)
1252{
1253 cycle_t ret;
1254 u64 last;
1255
1256 /*
1257 * Empirically, a fence (of a type that depends on the CPU)
1258 * before rdtsc is enough to ensure that rdtsc is ordered
1259 * with respect to loads. The various CPU manuals are unclear
1260 * as to whether rdtsc can be reordered with later loads,
1261 * but no one has ever seen it happen.
1262 */
1263 rdtsc_barrier();
1264 ret = (cycle_t)vget_cycles();
1265
1266 last = pvclock_gtod_data.clock.cycle_last;
1267
1268 if (likely(ret >= last))
1269 return ret;
1270
1271 /*
1272 * GCC likes to generate cmov here, but this branch is extremely
1273 * predictable (it's just a function of time and the likely is
1274 * very likely) and there's a data dependence, so force GCC
1275 * to generate a branch instead. I don't barrier() because
1276 * we don't actually need a barrier, and if this function
1277 * ever gets inlined it will generate worse code.
1278 */
1279 asm volatile ("");
1280 return last;
1281}
1282
1283static inline u64 vgettsc(cycle_t *cycle_now)
1284{
1285 long v;
1286 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1287
1288 *cycle_now = read_tsc();
1289
1290 v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1291 return v * gtod->clock.mult;
1292}
1293
1294static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
1295{
1296 unsigned long seq;
1297 u64 ns;
1298 int mode;
1299 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1300
1301 ts->tv_nsec = 0;
1302 do {
1303 seq = read_seqcount_begin(&gtod->seq);
1304 mode = gtod->clock.vclock_mode;
1305 ts->tv_sec = gtod->monotonic_time_sec;
1306 ns = gtod->monotonic_time_snsec;
1307 ns += vgettsc(cycle_now);
1308 ns >>= gtod->clock.shift;
1309 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1310 timespec_add_ns(ts, ns);
1311
1312 return mode;
1313}
1314
1315/* returns true if host is using tsc clocksource */
1316static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1317{
1318 struct timespec ts;
1319
1320 /* checked again under seqlock below */
1321 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1322 return false;
1323
1324 if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
1325 return false;
1326
1327 monotonic_to_bootbased(&ts);
1328 *kernel_ns = timespec_to_ns(&ts);
1329
1330 return true;
1331}
1332#endif
1333
1334/*
1335 *
1336 * Assuming a stable TSC across physical CPUs, and a stable TSC
1337 * across virtual CPUs, the following situation is possible.
1338 * Each numbered line represents an event visible to both
1339 * CPUs at the next numbered event.
1340 *
1341 * "timespecX" represents host monotonic time. "tscX" represents
1342 * RDTSC value.
1343 *
1344 * VCPU0 on CPU0 | VCPU1 on CPU1
1345 *
1346 * 1. read timespec0,tsc0
1347 * 2. | timespec1 = timespec0 + N
1348 * | tsc1 = tsc0 + M
1349 * 3. transition to guest | transition to guest
1350 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1351 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
1352 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1353 *
1354 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1355 *
1356 * - ret0 < ret1
1357 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1358 * ...
1359 * - 0 < N - M => M < N
1360 *
1361 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1362 * always the case (the difference between two distinct xtime instances
1363 * might be smaller than the difference between corresponding TSC reads,
1364 * when updating guest vcpus' pvclock areas).
1365 *
1366 * To avoid that problem, do not allow visibility of distinct
1367 * system_timestamp/tsc_timestamp values simultaneously: use a master
1368 * copy of host monotonic time values. Update that master copy
1369 * in lockstep.
1370 *
1371 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1372 *
1373 */
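
A concrete instance of the failure mode described above, with numbers chosen purely for illustration: say the TSC runs at 1 GHz, VCPU0 samples timespec0 = 1000 ns with tsc0 = 1000, and by the time VCPU1 takes its sample the host clock has advanced by only N = 10 ns while the TSC has advanced by M = 25 cycles (25 ns), e.g. because xtime moves in coarser steps. If both guests then execute rdtsc = 1100, VCPU0 returns ret0 = 1000 + (1100 - 1000) = 1100 ns while VCPU1 returns ret1 = 1010 + (1100 - 1025) = 1085 ns, i.e. time appears to jump backwards across vcpus. Publishing one master <kernel_ns, tsc> pair for all vcpus, as pvclock_update_vm_gtod_copy() below arranges, removes that possibility.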
1374
1375static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1376{
1377#ifdef CONFIG_X86_64
1378 struct kvm_arch *ka = &kvm->arch;
1379 int vclock_mode;
1380 bool host_tsc_clocksource, vcpus_matched;
1381
1382 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1383 atomic_read(&kvm->online_vcpus));
1384
1385 /*
1386 * If the host uses TSC clock, then passthrough TSC as stable
1387 * to the guest.
1388 */
1389 host_tsc_clocksource = kvm_get_time_and_clockread(
1390 &ka->master_kernel_ns,
1391 &ka->master_cycle_now);
1392
1393 ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
1394
1395 if (ka->use_master_clock)
1396 atomic_set(&kvm_guest_has_master_clock, 1);
1397
1398 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1399 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1400 vcpus_matched);
1401#endif
1402}
1403
1138static int kvm_guest_time_update(struct kvm_vcpu *v) 1404static int kvm_guest_time_update(struct kvm_vcpu *v)
1139{ 1405{
1140 unsigned long flags; 1406 unsigned long flags, this_tsc_khz;
1141 struct kvm_vcpu_arch *vcpu = &v->arch; 1407 struct kvm_vcpu_arch *vcpu = &v->arch;
1408 struct kvm_arch *ka = &v->kvm->arch;
1142 void *shared_kaddr; 1409 void *shared_kaddr;
1143 unsigned long this_tsc_khz;
1144 s64 kernel_ns, max_kernel_ns; 1410 s64 kernel_ns, max_kernel_ns;
1145 u64 tsc_timestamp; 1411 u64 tsc_timestamp, host_tsc;
1412 struct pvclock_vcpu_time_info *guest_hv_clock;
1146 u8 pvclock_flags; 1413 u8 pvclock_flags;
1414 bool use_master_clock;
1415
1416 kernel_ns = 0;
1417 host_tsc = 0;
1147 1418
1148 /* Keep irq disabled to prevent changes to the clock */ 1419 /* Keep irq disabled to prevent changes to the clock */
1149 local_irq_save(flags); 1420 local_irq_save(flags);
1150 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1151 kernel_ns = get_kernel_ns();
1152 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1421 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1153 if (unlikely(this_tsc_khz == 0)) { 1422 if (unlikely(this_tsc_khz == 0)) {
1154 local_irq_restore(flags); 1423 local_irq_restore(flags);
@@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1157 } 1426 }
1158 1427
1159 /* 1428 /*
1429 * If the host uses TSC clock, then passthrough TSC as stable
1430 * to the guest.
1431 */
1432 spin_lock(&ka->pvclock_gtod_sync_lock);
1433 use_master_clock = ka->use_master_clock;
1434 if (use_master_clock) {
1435 host_tsc = ka->master_cycle_now;
1436 kernel_ns = ka->master_kernel_ns;
1437 }
1438 spin_unlock(&ka->pvclock_gtod_sync_lock);
1439 if (!use_master_clock) {
1440 host_tsc = native_read_tsc();
1441 kernel_ns = get_kernel_ns();
1442 }
1443
1444 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1445
1446 /*
1160 * We may have to catch up the TSC to match elapsed wall clock 1447 * We may have to catch up the TSC to match elapsed wall clock
1161 * time for two reasons, even if kvmclock is used. 1448 * time for two reasons, even if kvmclock is used.
1162 * 1) CPU could have been running below the maximum TSC rate 1449 * 1) CPU could have been running below the maximum TSC rate
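
In the hunk above, kvm_x86_ops->read_l1_tsc() now takes the host TSC sample as an argument instead of issuing its own rdtsc (see the matching svm_read_l1_tsc()/vmx_read_l1_tsc() changes earlier in this diff). The reason is that, when the master clock is in use, kernel_ns and tsc_timestamp must be derived from the single coherent <master_kernel_ns, master_cycle_now> pair; letting each backend read the TSC again would reintroduce exactly the cross-vcpu skew the master copy is meant to eliminate.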
@@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1217 vcpu->hw_tsc_khz = this_tsc_khz; 1504 vcpu->hw_tsc_khz = this_tsc_khz;
1218 } 1505 }
1219 1506
1220 if (max_kernel_ns > kernel_ns) 1507 /* with a master <monotonic time, tsc value> tuple,
1221 kernel_ns = max_kernel_ns; 1508 * pvclock clock reads always increase at the (scaled) rate
1222 1509 * of guest TSC - no need to deal with sampling errors.
1510 */
1511 if (!use_master_clock) {
1512 if (max_kernel_ns > kernel_ns)
1513 kernel_ns = max_kernel_ns;
1514 }
1223 /* With all the info we got, fill in the values */ 1515 /* With all the info we got, fill in the values */
1224 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1516 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1225 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1517 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1226 vcpu->last_kernel_ns = kernel_ns; 1518 vcpu->last_kernel_ns = kernel_ns;
1227 vcpu->last_guest_tsc = tsc_timestamp; 1519 vcpu->last_guest_tsc = tsc_timestamp;
1228 1520
1229 pvclock_flags = 0;
1230 if (vcpu->pvclock_set_guest_stopped_request) {
1231 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1232 vcpu->pvclock_set_guest_stopped_request = false;
1233 }
1234
1235 vcpu->hv_clock.flags = pvclock_flags;
1236
1237 /* 1521 /*
1238 * The interface expects us to write an even number signaling that the 1522 * The interface expects us to write an even number signaling that the
1239 * update is finished. Since the guest won't see the intermediate 1523 * update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1243 1527
1244 shared_kaddr = kmap_atomic(vcpu->time_page); 1528 shared_kaddr = kmap_atomic(vcpu->time_page);
1245 1529
1530 guest_hv_clock = shared_kaddr + vcpu->time_offset;
1531
1532 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1533 pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
1534
1535 if (vcpu->pvclock_set_guest_stopped_request) {
1536 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1537 vcpu->pvclock_set_guest_stopped_request = false;
1538 }
1539
1540 /* If the host uses TSC clocksource, then it is stable */
1541 if (use_master_clock)
1542 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1543
1544 vcpu->hv_clock.flags = pvclock_flags;
1545
1246 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 1546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
1247 sizeof(vcpu->hv_clock)); 1547 sizeof(vcpu->hv_clock));
1248 1548
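The flags are now composed after the guest's time page is mapped, so an unconsumed PVCLOCK_GUEST_STOPPED bit in the guest copy survives the rewrite, and PVCLOCK_TSC_STABLE_BIT is advertised whenever the master clock is in use. What the stable bit buys the guest is the right to skip the cross-vCPU monotonicity fixup; a guest-side sketch of a reader, assuming the standard pvclock_vcpu_time_info layout (this is not the actual arch/x86/kernel/pvclock.c code):

static u64 pvclock_read_sketch(const struct pvclock_vcpu_time_info *src)
{
	u32 version;
	u64 delta, ns;
	u8 flags;

	do {
		version = src->version;	/* odd means an update is in flight */
		rmb();
		delta = native_read_tsc() - src->tsc_timestamp;
		ns = src->system_time +
		     pvclock_scale_delta(delta, src->tsc_to_system_mul,
					 src->tsc_shift);
		flags = src->flags;
		rmb();
	} while ((src->version & 1) || version != src->version);

	if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
		/* Without the stable bit the caller still has to apply the
		 * global last-value clamp to stay monotonic across vCPUs;
		 * omitted here.
		 */
	}
	return ns;
}
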
@@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
1572 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 1872 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1573} 1873}
1574 1874
1575int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1875int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1576{ 1876{
1577 bool pr = false; 1877 bool pr = false;
1878 u32 msr = msr_info->index;
1879 u64 data = msr_info->data;
1578 1880
1579 switch (msr) { 1881 switch (msr) {
1580 case MSR_EFER: 1882 case MSR_EFER:
@@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1625 case MSR_IA32_TSCDEADLINE: 1927 case MSR_IA32_TSCDEADLINE:
1626 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1928 kvm_set_lapic_tscdeadline_msr(vcpu, data);
1627 break; 1929 break;
1930 case MSR_IA32_TSC_ADJUST:
1931 if (guest_cpuid_has_tsc_adjust(vcpu)) {
1932 if (!msr_info->host_initiated) {
1933 u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
1934 kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
1935 }
1936 vcpu->arch.ia32_tsc_adjust_msr = data;
1937 }
1938 break;
1628 case MSR_IA32_MISC_ENABLE: 1939 case MSR_IA32_MISC_ENABLE:
1629 vcpu->arch.ia32_misc_enable_msr = data; 1940 vcpu->arch.ia32_misc_enable_msr = data;
1630 break; 1941 break;
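The new MSR_IA32_TSC_ADJUST case above separates guest writes from host-initiated ones: a guest WRMSR is architecturally defined to move the TSC by the delta between the new and old ADJUST values, so that delta is folded into the hardware TSC offset, while a host-side restore (for example during migration) must only record the value without perturbing the offset again. The same logic with the two paths spelled out:

static void set_tsc_adjust_sketch(struct kvm_vcpu *vcpu,
				  struct msr_data *msr_info)
{
	u64 data = msr_info->data;

	if (!guest_cpuid_has_tsc_adjust(vcpu))
		return;				/* MSR not exposed to this guest */

	if (!msr_info->host_initiated) {
		/* guest write: shift the TSC by (new - old) ADJUST value */
		u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;

		kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
	}
	/* a host restore only records the value; no double adjustment */
	vcpu->arch.ia32_tsc_adjust_msr = data;
}
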
@@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1984 case MSR_IA32_TSCDEADLINE: 2295 case MSR_IA32_TSCDEADLINE:
1985 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2296 data = kvm_get_lapic_tscdeadline_msr(vcpu);
1986 break; 2297 break;
2298 case MSR_IA32_TSC_ADJUST:
2299 data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2300 break;
1987 case MSR_IA32_MISC_ENABLE: 2301 case MSR_IA32_MISC_ENABLE:
1988 data = vcpu->arch.ia32_misc_enable_msr; 2302 data = vcpu->arch.ia32_misc_enable_msr;
1989 break; 2303 break;
@@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2342 kvm_x86_ops->write_tsc_offset(vcpu, offset); 2656 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2343 vcpu->arch.tsc_catchup = 1; 2657 vcpu->arch.tsc_catchup = 1;
2344 } 2658 }
2345 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2659 /*
2660 * On a host with synchronized TSC, there is no need to update
2661 * kvmclock on vcpu->cpu migration
2662 */
2663 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2346 if (vcpu->cpu != cpu) 2665 if (vcpu->cpu != cpu)
2347 kvm_migrate_timers(vcpu); 2666 kvm_migrate_timers(vcpu);
2348 vcpu->cpu = cpu; 2667 vcpu->cpu = cpu;
@@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2691 if (!vcpu->arch.apic) 3010 if (!vcpu->arch.apic)
2692 goto out; 3011 goto out;
2693 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 3012 u.lapic = memdup_user(argp, sizeof(*u.lapic));
2694 if (IS_ERR(u.lapic)) { 3013 if (IS_ERR(u.lapic))
2695 r = PTR_ERR(u.lapic); 3014 return PTR_ERR(u.lapic);
2696 goto out;
2697 }
2698 3015
2699 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 3016 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
2700 if (r)
2701 goto out;
2702 r = 0;
2703 break; 3017 break;
2704 } 3018 }
2705 case KVM_INTERRUPT: { 3019 case KVM_INTERRUPT: {
@@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2709 if (copy_from_user(&irq, argp, sizeof irq)) 3023 if (copy_from_user(&irq, argp, sizeof irq))
2710 goto out; 3024 goto out;
2711 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 3025 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2712 if (r)
2713 goto out;
2714 r = 0;
2715 break; 3026 break;
2716 } 3027 }
2717 case KVM_NMI: { 3028 case KVM_NMI: {
2718 r = kvm_vcpu_ioctl_nmi(vcpu); 3029 r = kvm_vcpu_ioctl_nmi(vcpu);
2719 if (r)
2720 goto out;
2721 r = 0;
2722 break; 3030 break;
2723 } 3031 }
2724 case KVM_SET_CPUID: { 3032 case KVM_SET_CPUID: {
@@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2729 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3037 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2730 goto out; 3038 goto out;
2731 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3039 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2732 if (r)
2733 goto out;
2734 break; 3040 break;
2735 } 3041 }
2736 case KVM_SET_CPUID2: { 3042 case KVM_SET_CPUID2: {
@@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2742 goto out; 3048 goto out;
2743 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 3049 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
2744 cpuid_arg->entries); 3050 cpuid_arg->entries);
2745 if (r)
2746 goto out;
2747 break; 3051 break;
2748 } 3052 }
2749 case KVM_GET_CPUID2: { 3053 case KVM_GET_CPUID2: {
@@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2875 } 3179 }
2876 case KVM_SET_XSAVE: { 3180 case KVM_SET_XSAVE: {
2877 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 3181 u.xsave = memdup_user(argp, sizeof(*u.xsave));
2878 if (IS_ERR(u.xsave)) { 3182 if (IS_ERR(u.xsave))
2879 r = PTR_ERR(u.xsave); 3183 return PTR_ERR(u.xsave);
2880 goto out;
2881 }
2882 3184
2883 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 3185 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2884 break; 3186 break;
@@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2900 } 3202 }
2901 case KVM_SET_XCRS: { 3203 case KVM_SET_XCRS: {
2902 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 3204 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
2903 if (IS_ERR(u.xcrs)) { 3205 if (IS_ERR(u.xcrs))
2904 r = PTR_ERR(u.xcrs); 3206 return PTR_ERR(u.xcrs);
2905 goto out;
2906 }
2907 3207
2908 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3208 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2909 break; 3209 break;
@@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2951 int ret; 3251 int ret;
2952 3252
2953 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 3253 if (addr > (unsigned int)(-3 * PAGE_SIZE))
2954 return -1; 3254 return -EINVAL;
2955 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 3255 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
2956 return ret; 3256 return ret;
2957} 3257}
@@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3212 switch (ioctl) { 3512 switch (ioctl) {
3213 case KVM_SET_TSS_ADDR: 3513 case KVM_SET_TSS_ADDR:
3214 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 3514 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3215 if (r < 0)
3216 goto out;
3217 break; 3515 break;
3218 case KVM_SET_IDENTITY_MAP_ADDR: { 3516 case KVM_SET_IDENTITY_MAP_ADDR: {
3219 u64 ident_addr; 3517 u64 ident_addr;
@@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3222 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 3520 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3223 goto out; 3521 goto out;
3224 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 3522 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3225 if (r < 0)
3226 goto out;
3227 break; 3523 break;
3228 } 3524 }
3229 case KVM_SET_NR_MMU_PAGES: 3525 case KVM_SET_NR_MMU_PAGES:
3230 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 3526 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3231 if (r)
3232 goto out;
3233 break; 3527 break;
3234 case KVM_GET_NR_MMU_PAGES: 3528 case KVM_GET_NR_MMU_PAGES:
3235 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 3529 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3320 r = 0; 3614 r = 0;
3321 get_irqchip_out: 3615 get_irqchip_out:
3322 kfree(chip); 3616 kfree(chip);
3323 if (r)
3324 goto out;
3325 break; 3617 break;
3326 } 3618 }
3327 case KVM_SET_IRQCHIP: { 3619 case KVM_SET_IRQCHIP: {
@@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3343 r = 0; 3635 r = 0;
3344 set_irqchip_out: 3636 set_irqchip_out:
3345 kfree(chip); 3637 kfree(chip);
3346 if (r)
3347 goto out;
3348 break; 3638 break;
3349 } 3639 }
3350 case KVM_GET_PIT: { 3640 case KVM_GET_PIT: {
@@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3371 if (!kvm->arch.vpit) 3661 if (!kvm->arch.vpit)
3372 goto out; 3662 goto out;
3373 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 3663 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3374 if (r)
3375 goto out;
3376 r = 0;
3377 break; 3664 break;
3378 } 3665 }
3379 case KVM_GET_PIT2: { 3666 case KVM_GET_PIT2: {
@@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3397 if (!kvm->arch.vpit) 3684 if (!kvm->arch.vpit)
3398 goto out; 3685 goto out;
3399 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 3686 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3400 if (r)
3401 goto out;
3402 r = 0;
3403 break; 3687 break;
3404 } 3688 }
3405 case KVM_REINJECT_CONTROL: { 3689 case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3408 if (copy_from_user(&control, argp, sizeof(control))) 3692 if (copy_from_user(&control, argp, sizeof(control)))
3409 goto out; 3693 goto out;
3410 r = kvm_vm_ioctl_reinject(kvm, &control); 3694 r = kvm_vm_ioctl_reinject(kvm, &control);
3411 if (r)
3412 goto out;
3413 r = 0;
3414 break; 3695 break;
3415 } 3696 }
3416 case KVM_XEN_HVM_CONFIG: { 3697 case KVM_XEN_HVM_CONFIG: {
@@ -4273,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4273static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 4554static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4274 u32 msr_index, u64 data) 4555 u32 msr_index, u64 data)
4275{ 4556{
4276 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); 4557 struct msr_data msr;
4558
4559 msr.data = data;
4560 msr.index = msr_index;
4561 msr.host_initiated = false;
4562 return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4277} 4563}
4278 4564
4279static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, 4565static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4495,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4495 * instruction -> ... 4781 * instruction -> ...
4496 */ 4782 */
4497 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4783 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4498 if (!is_error_pfn(pfn)) { 4784 if (!is_error_noslot_pfn(pfn)) {
4499 kvm_release_pfn_clean(pfn); 4785 kvm_release_pfn_clean(pfn);
4500 return true; 4786 return true;
4501 } 4787 }
@@ -4881,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void)
4881 kvm_mmu_set_mmio_spte_mask(mask); 5167 kvm_mmu_set_mmio_spte_mask(mask);
4882} 5168}
4883 5169
5170#ifdef CONFIG_X86_64
5171static void pvclock_gtod_update_fn(struct work_struct *work)
5172{
5173 struct kvm *kvm;
5174
5175 struct kvm_vcpu *vcpu;
5176 int i;
5177
5178 raw_spin_lock(&kvm_lock);
5179 list_for_each_entry(kvm, &vm_list, vm_list)
5180 kvm_for_each_vcpu(i, vcpu, kvm)
5181 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
5182 atomic_set(&kvm_guest_has_master_clock, 0);
5183 raw_spin_unlock(&kvm_lock);
5184}
5185
5186static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
5187
5188/*
5189 * Notification about pvclock gtod data update.
5190 */
5191static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
5192 void *priv)
5193{
5194 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
5195 struct timekeeper *tk = priv;
5196
5197 update_pvclock_gtod(tk);
5198
5199 /* disable master clock if host does not trust, or does not
5200 * use, TSC clocksource
5201 */
5202 if (gtod->clock.vclock_mode != VCLOCK_TSC &&
5203 atomic_read(&kvm_guest_has_master_clock) != 0)
5204 queue_work(system_long_wq, &pvclock_gtod_work);
5205
5206 return 0;
5207}
5208
5209static struct notifier_block pvclock_gtod_notifier = {
5210 .notifier_call = pvclock_gtod_notify,
5211};
5212#endif
5213
4884int kvm_arch_init(void *opaque) 5214int kvm_arch_init(void *opaque)
4885{ 5215{
4886 int r; 5216 int r;
@@ -4922,6 +5252,10 @@ int kvm_arch_init(void *opaque)
4922 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 5252 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4923 5253
4924 kvm_lapic_init(); 5254 kvm_lapic_init();
5255#ifdef CONFIG_X86_64
5256 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
5257#endif
5258
4925 return 0; 5259 return 0;
4926 5260
4927out: 5261out:
@@ -4936,6 +5270,9 @@ void kvm_arch_exit(void)
4936 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 5270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4937 CPUFREQ_TRANSITION_NOTIFIER); 5271 CPUFREQ_TRANSITION_NOTIFIER);
4938 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); 5272 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5273#ifdef CONFIG_X86_64
5274 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
5275#endif
4939 kvm_x86_ops = NULL; 5276 kvm_x86_ops = NULL;
4940 kvm_mmu_module_exit(); 5277 kvm_mmu_module_exit();
4941} 5278}
@@ -5059,7 +5396,7 @@ out:
5059} 5396}
5060EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5397EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
5061 5398
5062int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) 5399static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5063{ 5400{
5064 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 5401 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5065 char instruction[3]; 5402 char instruction[3];
@@ -5235,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
5235 kvm_make_request(KVM_REQ_EVENT, vcpu); 5572 kvm_make_request(KVM_REQ_EVENT, vcpu);
5236} 5573}
5237 5574
5575static void kvm_gen_update_masterclock(struct kvm *kvm)
5576{
5577#ifdef CONFIG_X86_64
5578 int i;
5579 struct kvm_vcpu *vcpu;
5580 struct kvm_arch *ka = &kvm->arch;
5581
5582 spin_lock(&ka->pvclock_gtod_sync_lock);
5583 kvm_make_mclock_inprogress_request(kvm);
5584 /* no guest entries from this point */
5585 pvclock_update_vm_gtod_copy(kvm);
5586
5587 kvm_for_each_vcpu(i, vcpu, kvm)
5588 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5589
5590 /* guest entries allowed */
5591 kvm_for_each_vcpu(i, vcpu, kvm)
5592 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5593
5594 spin_unlock(&ka->pvclock_gtod_sync_lock);
5595#endif
5596}
5597
5238static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5598static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5239{ 5599{
5240 int r; 5600 int r;
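kvm_gen_update_masterclock() above relies on KVM_REQ_MCLOCK_INPROGRESS having no handler: kvm_make_mclock_inprogress_request() kicks every vCPU out of guest mode, and while the bit stays set the entry path keeps cancelling guest entry, so no vCPU can run against a half-rewritten master copy. Condensed (the pvclock_gtod_sync_lock held around the whole sequence is omitted here):

	kvm_make_mclock_inprogress_request(kvm);	/* kick vCPUs, hold them out */
	pvclock_update_vm_gtod_copy(kvm);		/* rewrite master_kernel_ns/cycle_now */

	kvm_for_each_vcpu(i, vcpu, kvm)
		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);	/* refresh pvclock pages */

	kvm_for_each_vcpu(i, vcpu, kvm)
		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); /* allow entry again */
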
@@ -5247,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5247 kvm_mmu_unload(vcpu); 5607 kvm_mmu_unload(vcpu);
5248 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5608 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
5249 __kvm_migrate_timers(vcpu); 5609 __kvm_migrate_timers(vcpu);
5610 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5611 kvm_gen_update_masterclock(vcpu->kvm);
5250 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5612 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5251 r = kvm_guest_time_update(vcpu); 5613 r = kvm_guest_time_update(vcpu);
5252 if (unlikely(r)) 5614 if (unlikely(r))
@@ -5362,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5362 if (hw_breakpoint_active()) 5724 if (hw_breakpoint_active())
5363 hw_breakpoint_restore(); 5725 hw_breakpoint_restore();
5364 5726
5365 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 5727 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
5728 native_read_tsc());
5366 5729
5367 vcpu->mode = OUTSIDE_GUEST_MODE; 5730 vcpu->mode = OUTSIDE_GUEST_MODE;
5368 smp_wmb(); 5731 smp_wmb();
@@ -5419,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5419 pr_debug("vcpu %d received sipi with vector # %x\n", 5782 pr_debug("vcpu %d received sipi with vector # %x\n",
5420 vcpu->vcpu_id, vcpu->arch.sipi_vector); 5783 vcpu->vcpu_id, vcpu->arch.sipi_vector);
5421 kvm_lapic_reset(vcpu); 5784 kvm_lapic_reset(vcpu);
5422 r = kvm_arch_vcpu_reset(vcpu); 5785 r = kvm_vcpu_reset(vcpu);
5423 if (r) 5786 if (r)
5424 return r; 5787 return r;
5425 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5788 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6047,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6047 r = vcpu_load(vcpu); 6410 r = vcpu_load(vcpu);
6048 if (r) 6411 if (r)
6049 return r; 6412 return r;
6050 r = kvm_arch_vcpu_reset(vcpu); 6413 r = kvm_vcpu_reset(vcpu);
6051 if (r == 0) 6414 if (r == 0)
6052 r = kvm_mmu_setup(vcpu); 6415 r = kvm_mmu_setup(vcpu);
6053 vcpu_put(vcpu); 6416 vcpu_put(vcpu);
@@ -6055,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6055 return r; 6418 return r;
6056} 6419}
6057 6420
6421int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
6422{
6423 int r;
6424 struct msr_data msr;
6425
6426 r = vcpu_load(vcpu);
6427 if (r)
6428 return r;
6429 msr.data = 0x0;
6430 msr.index = MSR_IA32_TSC;
6431 msr.host_initiated = true;
6432 kvm_write_tsc(vcpu, &msr);
6433 vcpu_put(vcpu);
6434
6435 return r;
6436}
6437
6058void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 6438void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6059{ 6439{
6060 int r; 6440 int r;
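kvm_arch_vcpu_postcreate() above runs after the new vCPU is visible and kvm->lock has been dropped (see the kvm_vm_ioctl_create_vcpu() hunk further down); seeding the TSC with a host-initiated write routes it through the common kvm_write_tsc() matching logic without being mistaken for a guest-driven adjustment. The shape of such a write with the struct msr_data introduced by this series:

	struct msr_data msr = {
		.index		= MSR_IA32_TSC,
		.data		= 0,
		.host_initiated	= true,	/* not a guest WRMSR */
	};

	kvm_write_tsc(vcpu, &msr);
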
@@ -6069,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6069 kvm_x86_ops->vcpu_free(vcpu); 6449 kvm_x86_ops->vcpu_free(vcpu);
6070} 6450}
6071 6451
6072int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 6452static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
6073{ 6453{
6074 atomic_set(&vcpu->arch.nmi_queued, 0); 6454 atomic_set(&vcpu->arch.nmi_queued, 0);
6075 vcpu->arch.nmi_pending = 0; 6455 vcpu->arch.nmi_pending = 0;
@@ -6092,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6092 6472
6093 kvm_pmu_reset(vcpu); 6473 kvm_pmu_reset(vcpu);
6094 6474
6475 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
6476 vcpu->arch.regs_avail = ~0;
6477 vcpu->arch.regs_dirty = ~0;
6478
6095 return kvm_x86_ops->vcpu_reset(vcpu); 6479 return kvm_x86_ops->vcpu_reset(vcpu);
6096} 6480}
6097 6481
@@ -6168,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage)
6168 kvm_for_each_vcpu(i, vcpu, kvm) { 6552 kvm_for_each_vcpu(i, vcpu, kvm) {
6169 vcpu->arch.tsc_offset_adjustment += delta_cyc; 6553 vcpu->arch.tsc_offset_adjustment += delta_cyc;
6170 vcpu->arch.last_host_tsc = local_tsc; 6554 vcpu->arch.last_host_tsc = local_tsc;
6555 set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
6556 &vcpu->requests);
6171 } 6557 }
6172 6558
6173 /* 6559 /*
@@ -6258,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6258 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 6644 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
6259 goto fail_free_mce_banks; 6645 goto fail_free_mce_banks;
6260 6646
6647 r = fx_init(vcpu);
6648 if (r)
6649 goto fail_free_wbinvd_dirty_mask;
6650
6651 vcpu->arch.ia32_tsc_adjust_msr = 0x0;
6261 kvm_async_pf_hash_reset(vcpu); 6652 kvm_async_pf_hash_reset(vcpu);
6262 kvm_pmu_init(vcpu); 6653 kvm_pmu_init(vcpu);
6263 6654
6264 return 0; 6655 return 0;
6656fail_free_wbinvd_dirty_mask:
6657 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6265fail_free_mce_banks: 6658fail_free_mce_banks:
6266 kfree(vcpu->arch.mce_banks); 6659 kfree(vcpu->arch.mce_banks);
6267fail_free_lapic: 6660fail_free_lapic:
@@ -6305,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6305 6698
6306 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 6699 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6307 mutex_init(&kvm->arch.apic_map_lock); 6700 mutex_init(&kvm->arch.apic_map_lock);
6701 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
6702
6703 pvclock_update_vm_gtod_copy(kvm);
6308 6704
6309 return 0; 6705 return 0;
6310} 6706}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2b5219c12ac8..e224f7a671b6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
112void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 112void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
113int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 113int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
114 114
115void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); 115void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
116 116
117int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, 117int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
118 gva_t addr, void *val, unsigned int bytes, 118 gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c373421a..205ad328aa52 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,7 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/unistd.h> 23#include <asm/unistd.h>
24#include <asm/io.h> 24#include <asm/io.h>
25#include <asm/pvclock.h>
25 26
26#define gtod (&VVAR(vsyscall_gtod_data)) 27#define gtod (&VVAR(vsyscall_gtod_data))
27 28
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void)
62 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); 63 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
63} 64}
64 65
66#ifdef CONFIG_PARAVIRT_CLOCK
67
68static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
69{
70 const struct pvclock_vsyscall_time_info *pvti_base;
71 int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
72 int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
73
74 BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
75
76 pvti_base = (struct pvclock_vsyscall_time_info *)
77 __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
78
79 return &pvti_base[offset];
80}
81
82static notrace cycle_t vread_pvclock(int *mode)
83{
84 const struct pvclock_vsyscall_time_info *pvti;
85 cycle_t ret;
86 u64 last;
87 u32 version;
88 u32 migrate_count;
89 u8 flags;
90 unsigned cpu, cpu1;
91
92
93 /*
94 * When looping to get a consistent (time-info, tsc) pair, we
95 * also need to deal with the possibility we can switch vcpus,
96 * so make sure we always re-fetch time-info for the current vcpu.
97 */
98 do {
99 cpu = __getcpu() & VGETCPU_CPU_MASK;
100 /* TODO: We can put vcpu id into higher bits of pvti.version.
101 * This will save a couple of cycles by getting rid of
102 * __getcpu() calls (Gleb).
103 */
104
105 pvti = get_pvti(cpu);
106
107 migrate_count = pvti->migrate_count;
108
109 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
110
111 /*
112 * Test we're still on the cpu as well as the version.
113 * We could have been migrated just after the first
114 * vgetcpu but before fetching the version, so we
115 * wouldn't notice a version change.
116 */
117 cpu1 = __getcpu() & VGETCPU_CPU_MASK;
118 } while (unlikely(cpu != cpu1 ||
119 (pvti->pvti.version & 1) ||
120 pvti->pvti.version != version ||
121 pvti->migrate_count != migrate_count));
122
123 if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
124 *mode = VCLOCK_NONE;
125
126 /* refer to tsc.c read_tsc() comment for rationale */
127 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
128
129 if (likely(ret >= last))
130 return ret;
131
132 return last;
133}
134#endif
135
65notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) 136notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
66{ 137{
67 long ret; 138 long ret;
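vread_pvclock() above has to defend against two things at once: the host updating the per-vCPU time info (odd or changed version) and the task migrating between the two __getcpu() reads (changed cpu or migrate_count); if the host ever withdraws PVCLOCK_TSC_STABLE_BIT the caller is pushed to the VCLOCK_NONE syscall fallback, and the final cycle_last clamp mirrors read_tsc(). The version half of that protocol as a small self-contained userspace toy (the struct and helper names are illustrative only):

#include <stdatomic.h>
#include <stdint.h>

/* Writers bump version to odd, update the payload, then bump it to even;
 * readers retry until they observe an even, unchanged version.
 */
struct snapshot {
	_Atomic uint32_t version;
	uint64_t tsc_timestamp;
	uint64_t system_time;
};

static uint64_t read_snapshot(const struct snapshot *s, uint64_t tsc_now)
{
	uint32_t ver;
	uint64_t base, stamp;

	do {
		ver = atomic_load_explicit(&s->version, memory_order_acquire);
		base = s->system_time;
		stamp = s->tsc_timestamp;
	} while ((ver & 1) ||
		 ver != atomic_load_explicit(&s->version, memory_order_acquire));

	return base + (tsc_now - stamp);	/* unscaled, for illustration */
}
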
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
80} 151}
81 152
82 153
83notrace static inline u64 vgetsns(void) 154notrace static inline u64 vgetsns(int *mode)
84{ 155{
85 long v; 156 long v;
86 cycles_t cycles; 157 cycles_t cycles;
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void)
88 cycles = vread_tsc(); 159 cycles = vread_tsc();
89 else if (gtod->clock.vclock_mode == VCLOCK_HPET) 160 else if (gtod->clock.vclock_mode == VCLOCK_HPET)
90 cycles = vread_hpet(); 161 cycles = vread_hpet();
162#ifdef CONFIG_PARAVIRT_CLOCK
163 else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK)
164 cycles = vread_pvclock(mode);
165#endif
91 else 166 else
92 return 0; 167 return 0;
93 v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; 168 v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
107 mode = gtod->clock.vclock_mode; 182 mode = gtod->clock.vclock_mode;
108 ts->tv_sec = gtod->wall_time_sec; 183 ts->tv_sec = gtod->wall_time_sec;
109 ns = gtod->wall_time_snsec; 184 ns = gtod->wall_time_snsec;
110 ns += vgetsns(); 185 ns += vgetsns(&mode);
111 ns >>= gtod->clock.shift; 186 ns >>= gtod->clock.shift;
112 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 187 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
113 188
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts)
127 mode = gtod->clock.vclock_mode; 202 mode = gtod->clock.vclock_mode;
128 ts->tv_sec = gtod->monotonic_time_sec; 203 ts->tv_sec = gtod->monotonic_time_sec;
129 ns = gtod->monotonic_time_snsec; 204 ns = gtod->monotonic_time_snsec;
130 ns += vgetsns(); 205 ns += vgetsns(&mode);
131 ns >>= gtod->clock.shift; 206 ns >>= gtod->clock.shift;
132 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 207 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
133 timespec_add_ns(ts, ns); 208 timespec_add_ns(ts, ns);
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 5463ad558573..2f94b039e55b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
17{ 17{
18 unsigned int p; 18 unsigned int p;
19 19
20 if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { 20 p = __getcpu();
21 /* Load per CPU data from RDTSCP */ 21
22 native_read_tscp(&p);
23 } else {
24 /* Load per CPU data from GDT */
25 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
26 }
27 if (cpu) 22 if (cpu)
28 *cpu = p & 0xfff; 23 *cpu = p & VGETCPU_CPU_MASK;
29 if (node) 24 if (node)
30 *node = p >> 12; 25 *node = p >> 12;
31 return 0; 26 return 0;
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index d8e05eeab232..0ecf22b6a38e 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -357,6 +357,7 @@ config TRACE_SINK
357config PPC_EPAPR_HV_BYTECHAN 357config PPC_EPAPR_HV_BYTECHAN
358 tristate "ePAPR hypervisor byte channel driver" 358 tristate "ePAPR hypervisor byte channel driver"
359 depends on PPC 359 depends on PPC
360 select EPAPR_PARAVIRT
360 help 361 help
361 This driver creates /dev entries for each ePAPR hypervisor byte 362 This driver creates /dev entries for each ePAPR hypervisor byte
362 channel, thereby allowing applications to communicate with byte 363 channel, thereby allowing applications to communicate with byte
diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig
index 2dcdbc9364d8..99ebdde590f8 100644
--- a/drivers/virt/Kconfig
+++ b/drivers/virt/Kconfig
@@ -15,6 +15,7 @@ if VIRT_DRIVERS
15config FSL_HV_MANAGER 15config FSL_HV_MANAGER
16 tristate "Freescale hypervisor management driver" 16 tristate "Freescale hypervisor management driver"
17 depends on FSL_SOC 17 depends on FSL_SOC
18 select EPAPR_PARAVIRT
18 help 19 help
19 The Freescale hypervisor management driver provides several services 20 The Freescale hypervisor management driver provides several services
20 to drivers and applications related to the Freescale hypervisor: 21 to drivers and applications related to the Freescale hypervisor:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d5cddd8dcc5c..2c497ab0d03d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -47,28 +47,40 @@
47 47
48/* 48/*
49 * For the normal pfn, the highest 12 bits should be zero, 49 * For the normal pfn, the highest 12 bits should be zero,
50 * so we can mask these bits to indicate the error. 50 * so we can mask bits 52-62 to indicate the error pfn,
51 * and bit 63 to indicate the noslot pfn.
51 */ 52 */
52#define KVM_PFN_ERR_MASK (0xfffULL << 52) 53#define KVM_PFN_ERR_MASK (0x7ffULL << 52)
54#define KVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52)
55#define KVM_PFN_NOSLOT (0x1ULL << 63)
53 56
54#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) 57#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK)
55#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) 58#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
56#define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2) 59#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
57#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3)
58 60
61/*
62 * error pfns indicate that the gfn is in a slot but failed to
63 * translate to a pfn on the host.
64 */
59static inline bool is_error_pfn(pfn_t pfn) 65static inline bool is_error_pfn(pfn_t pfn)
60{ 66{
61 return !!(pfn & KVM_PFN_ERR_MASK); 67 return !!(pfn & KVM_PFN_ERR_MASK);
62} 68}
63 69
64static inline bool is_noslot_pfn(pfn_t pfn) 70/*
71 * error_noslot pfns indicate that the gfn cannot be
72 * translated to a pfn - either it is not in any slot or the
73 * translation failed.
74 */
75static inline bool is_error_noslot_pfn(pfn_t pfn)
65{ 76{
66 return pfn == KVM_PFN_ERR_BAD; 77 return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK);
67} 78}
68 79
69static inline bool is_invalid_pfn(pfn_t pfn) 80/* noslot pfn indicates that the gfn is not in slot. */
81static inline bool is_noslot_pfn(pfn_t pfn)
70{ 82{
71 return !is_noslot_pfn(pfn) && is_error_pfn(pfn); 83 return pfn == KVM_PFN_NOSLOT;
72} 84}
73 85
74#define KVM_HVA_ERR_BAD (PAGE_OFFSET) 86#define KVM_HVA_ERR_BAD (PAGE_OFFSET)
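With the reworked encoding, bit 63 alone means "no memslot for this gfn" (KVM_PFN_NOSLOT) while bits 52-62 mean "in a slot but the translation failed" (KVM_PFN_ERR_MASK), and is_error_noslot_pfn() covers both classes with the single 12-bit mask. A worked example of how the three predicates classify a few values:

#include <linux/kvm_host.h>

static void __maybe_unused pfn_layout_example(void)
{
	BUILD_BUG_ON(KVM_PFN_ERR_MASK != (0x7ffULL << 52));	/* bits 52-62 */
	BUILD_BUG_ON(KVM_PFN_NOSLOT   != (0x1ULL  << 63));	/* bit 63 only */

	/* translation failed inside a valid slot */
	WARN_ON(!is_error_pfn(KVM_PFN_ERR_FAULT));
	WARN_ON(!is_error_noslot_pfn(KVM_PFN_ERR_FAULT));
	WARN_ON(is_noslot_pfn(KVM_PFN_ERR_FAULT));

	/* gfn not covered by any memslot */
	WARN_ON(is_error_pfn(KVM_PFN_NOSLOT));
	WARN_ON(!is_noslot_pfn(KVM_PFN_NOSLOT));
	WARN_ON(!is_error_noslot_pfn(KVM_PFN_NOSLOT));

	/* an ordinary translated pfn trips none of the predicates */
	WARN_ON(is_error_noslot_pfn(0x12345));
}
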
@@ -107,6 +119,9 @@ static inline bool is_error_page(struct page *page)
107#define KVM_REQ_IMMEDIATE_EXIT 15 119#define KVM_REQ_IMMEDIATE_EXIT 15
108#define KVM_REQ_PMU 16 120#define KVM_REQ_PMU 16
109#define KVM_REQ_PMI 17 121#define KVM_REQ_PMI 17
122#define KVM_REQ_WATCHDOG 18
123#define KVM_REQ_MASTERCLOCK_UPDATE 19
124#define KVM_REQ_MCLOCK_INPROGRESS 20
110 125
111#define KVM_USERSPACE_IRQ_SOURCE_ID 0 126#define KVM_USERSPACE_IRQ_SOURCE_ID 0
112#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 127#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -516,6 +531,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
516 531
517void kvm_flush_remote_tlbs(struct kvm *kvm); 532void kvm_flush_remote_tlbs(struct kvm *kvm);
518void kvm_reload_remote_mmus(struct kvm *kvm); 533void kvm_reload_remote_mmus(struct kvm *kvm);
534void kvm_make_mclock_inprogress_request(struct kvm *kvm);
519 535
520long kvm_arch_dev_ioctl(struct file *filp, 536long kvm_arch_dev_ioctl(struct file *filp,
521 unsigned int ioctl, unsigned long arg); 537 unsigned int ioctl, unsigned long arg);
@@ -569,9 +585,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
569void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); 585void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
570struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); 586struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
571int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); 587int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
588int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
572void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); 589void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
573 590
574int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
575int kvm_arch_hardware_enable(void *garbage); 591int kvm_arch_hardware_enable(void *garbage);
576void kvm_arch_hardware_disable(void *garbage); 592void kvm_arch_hardware_disable(void *garbage);
577int kvm_arch_hardware_setup(void); 593int kvm_arch_hardware_setup(void);
@@ -666,6 +682,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
666 unsigned long *deliver_bitmask); 682 unsigned long *deliver_bitmask);
667#endif 683#endif
668int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); 684int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
685int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
669int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, 686int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
670 int irq_source_id, int level); 687 int irq_source_id, int level);
671void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 688void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
@@ -838,9 +855,9 @@ extern struct kvm_stats_debugfs_item debugfs_entries[];
838extern struct dentry *kvm_debugfs_dir; 855extern struct dentry *kvm_debugfs_dir;
839 856
840#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 857#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
841static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) 858static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
842{ 859{
843 if (unlikely(vcpu->kvm->mmu_notifier_count)) 860 if (unlikely(kvm->mmu_notifier_count))
844 return 1; 861 return 1;
845 /* 862 /*
846 * Ensure the read of mmu_notifier_count happens before the read 863 * Ensure the read of mmu_notifier_count happens before the read
@@ -853,7 +870,7 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
853 * can't rely on kvm->mmu_lock to keep things ordered. 870 * can't rely on kvm->mmu_lock to keep things ordered.
854 */ 871 */
855 smp_rmb(); 872 smp_rmb();
856 if (vcpu->kvm->mmu_notifier_seq != mmu_seq) 873 if (kvm->mmu_notifier_seq != mmu_seq)
857 return 1; 874 return 1;
858 return 0; 875 return 0;
859} 876}
@@ -881,10 +898,20 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
881#ifdef CONFIG_HAVE_KVM_EVENTFD 898#ifdef CONFIG_HAVE_KVM_EVENTFD
882 899
883void kvm_eventfd_init(struct kvm *kvm); 900void kvm_eventfd_init(struct kvm *kvm);
901int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
902
903#ifdef CONFIG_HAVE_KVM_IRQCHIP
884int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); 904int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
885void kvm_irqfd_release(struct kvm *kvm); 905void kvm_irqfd_release(struct kvm *kvm);
886void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); 906void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
887int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); 907#else
908static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
909{
910 return -EINVAL;
911}
912
913static inline void kvm_irqfd_release(struct kvm *kvm) {}
914#endif
888 915
889#else 916#else
890 917
diff --git a/include/linux/pvclock_gtod.h b/include/linux/pvclock_gtod.h
new file mode 100644
index 000000000000..0ca75825b60d
--- /dev/null
+++ b/include/linux/pvclock_gtod.h
@@ -0,0 +1,9 @@
1#ifndef _PVCLOCK_GTOD_H
2#define _PVCLOCK_GTOD_H
3
4#include <linux/notifier.h>
5
6extern int pvclock_gtod_register_notifier(struct notifier_block *nb);
7extern int pvclock_gtod_unregister_notifier(struct notifier_block *nb);
8
9#endif /* _PVCLOCK_GTOD_H */
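A hypothetical consumer of the new header, for orientation (names here are illustrative, not part of the patch): the callback receives the struct timekeeper as the void *priv argument, which is what x86's pvclock_gtod_notify() copies its clocksource data from.

#include <linux/module.h>
#include <linux/pvclock_gtod.h>

static int example_gtod_notify(struct notifier_block *nb, unsigned long unused,
			       void *priv)
{
	/* priv is the timekeeper; a real listener copies the clocksource
	 * fields it needs here, under the timekeeper's write seqlock.
	 */
	return 0;
}

static struct notifier_block example_gtod_nb = {
	.notifier_call = example_gtod_notify,
};

static int __init example_init(void)
{
	return pvclock_gtod_register_notifier(&example_gtod_nb);
}

static void __exit example_exit(void)
{
	pvclock_gtod_unregister_notifier(&example_gtod_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
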
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 651b51a36711..2c2f3072beef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -107,6 +107,14 @@ extern unsigned long this_cpu_load(void);
107extern void calc_global_load(unsigned long ticks); 107extern void calc_global_load(unsigned long ticks);
108extern void update_cpu_load_nohz(void); 108extern void update_cpu_load_nohz(void);
109 109
110/* Notifier for when a task gets migrated to a new CPU */
111struct task_migration_notifier {
112 struct task_struct *task;
113 int from_cpu;
114 int to_cpu;
115};
116extern void register_task_migration_notifier(struct notifier_block *n);
117
110extern unsigned long get_parent_ip(unsigned long addr); 118extern unsigned long get_parent_ip(unsigned long addr);
111 119
112extern void dump_cpu_task(int cpu); 120extern void dump_cpu_task(int cpu);
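register_task_migration_notifier() hooks an atomic notifier chain that set_task_cpu() fires (see the kernel/sched/core.c hunk below) with the struct above as payload; in this series the listener is the pvclock vsyscall code, which bumps the migrate_count consulted by vread_pvclock(). A hypothetical built-in listener, just to show the calling convention (the chain runs in atomic context, so the callback must not sleep, and the register function is not exported to modules):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/sched.h>

static int example_task_migrate(struct notifier_block *nb, unsigned long unused,
				void *data)
{
	struct task_migration_notifier *tmn = data;

	trace_printk("task %d: cpu%d -> cpu%d\n",
		     task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_OK;
}

static struct notifier_block example_migrate_nb = {
	.notifier_call = example_task_migrate,
};

static int __init example_migrate_init(void)
{
	register_task_migration_notifier(&example_migrate_nb);
	return 0;
}
core_initcall(example_migrate_init);
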
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 0a6d6ba44c85..e6e5d4b13708 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -167,10 +167,15 @@ struct kvm_pit_config {
167#define KVM_EXIT_OSI 18 167#define KVM_EXIT_OSI 18
168#define KVM_EXIT_PAPR_HCALL 19 168#define KVM_EXIT_PAPR_HCALL 19
169#define KVM_EXIT_S390_UCONTROL 20 169#define KVM_EXIT_S390_UCONTROL 20
170#define KVM_EXIT_WATCHDOG 21
170 171
171/* For KVM_EXIT_INTERNAL_ERROR */ 172/* For KVM_EXIT_INTERNAL_ERROR */
172#define KVM_INTERNAL_ERROR_EMULATION 1 173/* Emulate instruction failed. */
173#define KVM_INTERNAL_ERROR_SIMUL_EX 2 174#define KVM_INTERNAL_ERROR_EMULATION 1
175/* Encounter unexpected simultaneous exceptions. */
176#define KVM_INTERNAL_ERROR_SIMUL_EX 2
177/* Encounter unexpected vm-exit due to delivery event. */
178#define KVM_INTERNAL_ERROR_DELIVERY_EV 3
174 179
175/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 180/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
176struct kvm_run { 181struct kvm_run {
@@ -477,6 +482,8 @@ struct kvm_ppc_smmu_info {
477 struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; 482 struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
478}; 483};
479 484
485#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0)
486
480#define KVMIO 0xAE 487#define KVMIO 0xAE
481 488
482/* machine type bits, to be used as argument to KVM_CREATE_VM */ 489/* machine type bits, to be used as argument to KVM_CREATE_VM */
@@ -626,6 +633,8 @@ struct kvm_ppc_smmu_info {
626#define KVM_CAP_READONLY_MEM 81 633#define KVM_CAP_READONLY_MEM 81
627#endif 634#endif
628#define KVM_CAP_IRQFD_RESAMPLE 82 635#define KVM_CAP_IRQFD_RESAMPLE 82
636#define KVM_CAP_PPC_BOOKE_WATCHDOG 83
637#define KVM_CAP_PPC_HTAB_FD 84
629 638
630#ifdef KVM_CAP_IRQ_ROUTING 639#ifdef KVM_CAP_IRQ_ROUTING
631 640
@@ -848,6 +857,11 @@ struct kvm_s390_ucas_mapping {
848#define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) 857#define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info)
849/* Available with KVM_CAP_PPC_ALLOC_HTAB */ 858/* Available with KVM_CAP_PPC_ALLOC_HTAB */
850#define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) 859#define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32)
860#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce)
861/* Available with KVM_CAP_RMA */
862#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
863/* Available with KVM_CAP_PPC_HTAB_FD */
864#define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd)
851 865
852/* 866/*
853 * ioctls for vcpu fds 867 * ioctls for vcpu fds
@@ -911,9 +925,6 @@ struct kvm_s390_ucas_mapping {
911/* Available with KVM_CAP_XCRS */ 925/* Available with KVM_CAP_XCRS */
912#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) 926#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
913#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) 927#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
914#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce)
915/* Available with KVM_CAP_RMA */
916#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
917/* Available with KVM_CAP_SW_TLB */ 928/* Available with KVM_CAP_SW_TLB */
918#define KVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb) 929#define KVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb)
919/* Available with KVM_CAP_ONE_REG */ 930/* Available with KVM_CAP_ONE_REG */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6271b89f87ac..0533496b6228 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -923,6 +923,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
923 rq->skip_clock_update = 1; 923 rq->skip_clock_update = 1;
924} 924}
925 925
926static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
927
928void register_task_migration_notifier(struct notifier_block *n)
929{
930 atomic_notifier_chain_register(&task_migration_notifier, n);
931}
932
926#ifdef CONFIG_SMP 933#ifdef CONFIG_SMP
927void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 934void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
928{ 935{
@@ -953,10 +960,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
953 trace_sched_migrate_task(p, new_cpu); 960 trace_sched_migrate_task(p, new_cpu);
954 961
955 if (task_cpu(p) != new_cpu) { 962 if (task_cpu(p) != new_cpu) {
963 struct task_migration_notifier tmn;
964
956 if (p->sched_class->migrate_task_rq) 965 if (p->sched_class->migrate_task_rq)
957 p->sched_class->migrate_task_rq(p, new_cpu); 966 p->sched_class->migrate_task_rq(p, new_cpu);
958 p->se.nr_migrations++; 967 p->se.nr_migrations++;
959 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 968 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
969
970 tmn.task = p;
971 tmn.from_cpu = task_cpu(p);
972 tmn.to_cpu = new_cpu;
973
974 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
960 } 975 }
961 976
962 __set_task_cpu(p, new_cpu); 977 __set_task_cpu(p, new_cpu);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4c7de02eacdc..cbc6acb0db3f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -21,6 +21,7 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h>
24 25
25 26
26static struct timekeeper timekeeper; 27static struct timekeeper timekeeper;
@@ -174,6 +175,54 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
174 return nsec + arch_gettimeoffset(); 175 return nsec + arch_gettimeoffset();
175} 176}
176 177
178static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
179
180static void update_pvclock_gtod(struct timekeeper *tk)
181{
182 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
183}
184
185/**
186 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
187 *
188 * Must hold write on timekeeper.lock
189 */
190int pvclock_gtod_register_notifier(struct notifier_block *nb)
191{
192 struct timekeeper *tk = &timekeeper;
193 unsigned long flags;
194 int ret;
195
196 write_seqlock_irqsave(&tk->lock, flags);
197 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
198 /* update timekeeping data */
199 update_pvclock_gtod(tk);
200 write_sequnlock_irqrestore(&tk->lock, flags);
201
202 return ret;
203}
204EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
205
206/**
207 * pvclock_gtod_unregister_notifier - unregister a pvclock
208 * timedata update listener
209 *
210 * Must hold write on timekeeper.lock
211 */
212int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
213{
214 struct timekeeper *tk = &timekeeper;
215 unsigned long flags;
216 int ret;
217
218 write_seqlock_irqsave(&tk->lock, flags);
219 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
220 write_sequnlock_irqrestore(&tk->lock, flags);
221
222 return ret;
223}
224EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
225
177/* must hold write on timekeeper.lock */ 226/* must hold write on timekeeper.lock */
178static void timekeeping_update(struct timekeeper *tk, bool clearntp) 227static void timekeeping_update(struct timekeeper *tk, bool clearntp)
179{ 228{
@@ -182,6 +231,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
182 ntp_clear(); 231 ntp_clear();
183 } 232 }
184 update_vsyscall(tk); 233 update_vsyscall(tk);
234 update_pvclock_gtod(tk);
185} 235}
186 236
187/** 237/**
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 23a41a9f8db9..3642239252b0 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -105,6 +105,15 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
105} 105}
106 106
107#ifdef __KVM_HAVE_MSI 107#ifdef __KVM_HAVE_MSI
108static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
109{
110 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
111 int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
112 assigned_dev->irq_source_id,
113 assigned_dev->guest_irq, 1);
114 return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
115}
116
108static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) 117static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
109{ 118{
110 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 119 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -117,6 +126,23 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
117#endif 126#endif
118 127
119#ifdef __KVM_HAVE_MSIX 128#ifdef __KVM_HAVE_MSIX
129static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
130{
131 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
132 int index = find_index_from_host_irq(assigned_dev, irq);
133 u32 vector;
134 int ret = 0;
135
136 if (index >= 0) {
137 vector = assigned_dev->guest_msix_entries[index].vector;
138 ret = kvm_set_irq_inatomic(assigned_dev->kvm,
139 assigned_dev->irq_source_id,
140 vector, 1);
141 }
142
143 return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
144}
145
120static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) 146static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
121{ 147{
122 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 148 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -334,11 +360,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
334} 360}
335 361
336#ifdef __KVM_HAVE_MSI 362#ifdef __KVM_HAVE_MSI
337static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
338{
339 return IRQ_WAKE_THREAD;
340}
341
342static int assigned_device_enable_host_msi(struct kvm *kvm, 363static int assigned_device_enable_host_msi(struct kvm *kvm,
343 struct kvm_assigned_dev_kernel *dev) 364 struct kvm_assigned_dev_kernel *dev)
344{ 365{
@@ -363,11 +384,6 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
363#endif 384#endif
364 385
365#ifdef __KVM_HAVE_MSIX 386#ifdef __KVM_HAVE_MSIX
366static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
367{
368 return IRQ_WAKE_THREAD;
369}
370
371static int assigned_device_enable_host_msix(struct kvm *kvm, 387static int assigned_device_enable_host_msix(struct kvm *kvm,
372 struct kvm_assigned_dev_kernel *dev) 388 struct kvm_assigned_dev_kernel *dev)
373{ 389{
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 9718e98d6d2a..b6eea5cc7b34 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -35,6 +35,7 @@
35 35
36#include "iodev.h" 36#include "iodev.h"
37 37
38#ifdef __KVM_HAVE_IOAPIC
38/* 39/*
39 * -------------------------------------------------------------------- 40 * --------------------------------------------------------------------
40 * irqfd: Allows an fd to be used to inject an interrupt to the guest 41 * irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -332,7 +333,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
332 mutex_lock(&kvm->irqfds.resampler_lock); 333 mutex_lock(&kvm->irqfds.resampler_lock);
333 334
334 list_for_each_entry(resampler, 335 list_for_each_entry(resampler,
335 &kvm->irqfds.resampler_list, list) { 336 &kvm->irqfds.resampler_list, link) {
336 if (resampler->notifier.gsi == irqfd->gsi) { 337 if (resampler->notifier.gsi == irqfd->gsi) {
337 irqfd->resampler = resampler; 338 irqfd->resampler = resampler;
338 break; 339 break;
@@ -425,17 +426,21 @@ fail:
425 kfree(irqfd); 426 kfree(irqfd);
426 return ret; 427 return ret;
427} 428}
429#endif
428 430
429void 431void
430kvm_eventfd_init(struct kvm *kvm) 432kvm_eventfd_init(struct kvm *kvm)
431{ 433{
434#ifdef __KVM_HAVE_IOAPIC
432 spin_lock_init(&kvm->irqfds.lock); 435 spin_lock_init(&kvm->irqfds.lock);
433 INIT_LIST_HEAD(&kvm->irqfds.items); 436 INIT_LIST_HEAD(&kvm->irqfds.items);
434 INIT_LIST_HEAD(&kvm->irqfds.resampler_list); 437 INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
435 mutex_init(&kvm->irqfds.resampler_lock); 438 mutex_init(&kvm->irqfds.resampler_lock);
439#endif
436 INIT_LIST_HEAD(&kvm->ioeventfds); 440 INIT_LIST_HEAD(&kvm->ioeventfds);
437} 441}
438 442
443#ifdef __KVM_HAVE_IOAPIC
439/* 444/*
440 * shutdown any irqfd's that match fd+gsi 445 * shutdown any irqfd's that match fd+gsi
441 */ 446 */
@@ -555,6 +560,7 @@ static void __exit irqfd_module_exit(void)
555 560
556module_init(irqfd_module_init); 561module_init(irqfd_module_init);
557module_exit(irqfd_module_exit); 562module_exit(irqfd_module_exit);
563#endif
558 564
559/* 565/*
560 * -------------------------------------------------------------------- 566 * --------------------------------------------------------------------
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 037cb6730e68..4a340cb23013 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -52,7 +52,7 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
52 end_gfn = gfn + (size >> PAGE_SHIFT); 52 end_gfn = gfn + (size >> PAGE_SHIFT);
53 gfn += 1; 53 gfn += 1;
54 54
55 if (is_error_pfn(pfn)) 55 if (is_error_noslot_pfn(pfn))
56 return pfn; 56 return pfn;
57 57
58 while (gfn < end_gfn) 58 while (gfn < end_gfn)
@@ -106,7 +106,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
106 * important because we unmap and unpin in 4kb steps later. 106 * important because we unmap and unpin in 4kb steps later.
107 */ 107 */
108 pfn = kvm_pin_pages(slot, gfn, page_size); 108 pfn = kvm_pin_pages(slot, gfn, page_size);
109 if (is_error_pfn(pfn)) { 109 if (is_error_noslot_pfn(pfn)) {
110 gfn += 1; 110 gfn += 1;
111 continue; 111 continue;
112 } 112 }
@@ -168,11 +168,7 @@ int kvm_assign_device(struct kvm *kvm,
168 168
169 r = iommu_attach_device(domain, &pdev->dev); 169 r = iommu_attach_device(domain, &pdev->dev);
170 if (r) { 170 if (r) {
171 printk(KERN_ERR "assign device %x:%x:%x.%x failed", 171 dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
172 pci_domain_nr(pdev->bus),
173 pdev->bus->number,
174 PCI_SLOT(pdev->devfn),
175 PCI_FUNC(pdev->devfn));
176 return r; 172 return r;
177 } 173 }
178 174
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 2eb58af7ee99..656fa455e154 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -102,6 +102,23 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
102 return r; 102 return r;
103} 103}
104 104
105static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
106 struct kvm_lapic_irq *irq)
107{
108 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
109
110 irq->dest_id = (e->msi.address_lo &
111 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
112 irq->vector = (e->msi.data &
113 MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
114 irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
115 irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
116 irq->delivery_mode = e->msi.data & 0x700;
117 irq->level = 1;
118 irq->shorthand = 0;
119 /* TODO Deal with RH bit of MSI message address */
120}
121
105int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 122int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
106 struct kvm *kvm, int irq_source_id, int level) 123 struct kvm *kvm, int irq_source_id, int level)
107{ 124{
@@ -110,22 +127,26 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
110 if (!level) 127 if (!level)
111 return -1; 128 return -1;
112 129
113 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); 130 kvm_set_msi_irq(e, &irq);
114 131
115 irq.dest_id = (e->msi.address_lo &
116 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
117 irq.vector = (e->msi.data &
118 MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
119 irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
120 irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
121 irq.delivery_mode = e->msi.data & 0x700;
122 irq.level = 1;
123 irq.shorthand = 0;
124
125 /* TODO Deal with RH bit of MSI message address */
126 return kvm_irq_delivery_to_apic(kvm, NULL, &irq); 132 return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
127} 133}
128 134
135
136static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
137 struct kvm *kvm)
138{
139 struct kvm_lapic_irq irq;
140 int r;
141
142 kvm_set_msi_irq(e, &irq);
143
144 if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r))
145 return r;
146 else
147 return -EWOULDBLOCK;
148}
149
129int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) 150int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
130{ 151{
131 struct kvm_kernel_irq_routing_entry route; 152 struct kvm_kernel_irq_routing_entry route;
@@ -178,6 +199,44 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
178 return ret; 199 return ret;
179} 200}
180 201
202/*
203 * Deliver an IRQ in an atomic context if we can, or return a failure,
204 * so the caller can retry from process context.
205 * Return value:
206 * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
207 * Other values - No need to retry.
208 */
209int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
210{
211 struct kvm_kernel_irq_routing_entry *e;
212 int ret = -EINVAL;
213 struct kvm_irq_routing_table *irq_rt;
214 struct hlist_node *n;
215
216 trace_kvm_set_irq(irq, level, irq_source_id);
217
218 /*
219 * Injection into either PIC or IOAPIC might need to scan all CPUs,
220 * which would need to be retried from thread context; when same GSI
221 * is connected to both PIC and IOAPIC, we'd have to report a
222 * partial failure here.
223 * Since there's no easy way to do this, we only support injecting MSI
224 * which is limited to 1:1 GSI mapping.
225 */
226 rcu_read_lock();
227 irq_rt = rcu_dereference(kvm->irq_routing);
228 if (irq < irq_rt->nr_rt_entries)
229 hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
230 if (likely(e->type == KVM_IRQ_ROUTING_MSI))
231 ret = kvm_set_msi_inatomic(e, kvm);
232 else
233 ret = -EWOULDBLOCK;
234 break;
235 }
236 rcu_read_unlock();
237 return ret;
238}
239
181void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) 240void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
182{ 241{
183 struct kvm_irq_ack_notifier *kian; 242 struct kvm_irq_ack_notifier *kian;
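The intended caller of kvm_set_irq_inatomic() is a hard-irq handler, as in the assigned-device hunk earlier in this patch: only the lockless MSI path is attempted, and -EWOULDBLOCK tells the handler to defer to its threaded half, which can use the full kvm_set_irq() path. The calling convention, condensed (kvm, irq_source_id and guest_gsi stand for the handler's own state):

	/* in a hard-irq (non-threaded) interrupt handler */
	int ret = kvm_set_irq_inatomic(kvm, irq_source_id, guest_gsi, 1);

	if (unlikely(ret == -EWOULDBLOCK))
		return IRQ_WAKE_THREAD;		/* retry from the threaded handler */
	return IRQ_HANDLED;			/* delivered, or routing error */
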
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index be70035fd42a..1cd693a76a51 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -212,6 +212,11 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
213} 213}
214 214
215void kvm_make_mclock_inprogress_request(struct kvm *kvm)
216{
217 make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
218}
219
215int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 220int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
216{ 221{
217 struct page *page; 222 struct page *page;
@@ -709,8 +714,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
709 int r; 714 int r;
710 gfn_t base_gfn; 715 gfn_t base_gfn;
711 unsigned long npages; 716 unsigned long npages;
712 unsigned long i; 717 struct kvm_memory_slot *memslot, *slot;
713 struct kvm_memory_slot *memslot;
714 struct kvm_memory_slot old, new; 718 struct kvm_memory_slot old, new;
715 struct kvm_memslots *slots, *old_memslots; 719 struct kvm_memslots *slots, *old_memslots;
716 720
@@ -761,13 +765,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
         /* Check for overlaps */
         r = -EEXIST;
-        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-                struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
-
-                if (s == memslot || !s->npages)
+        kvm_for_each_memslot(slot, kvm->memslots) {
+                if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot)
                         continue;
-                if (!((base_gfn + npages <= s->base_gfn) ||
-                      (base_gfn >= s->base_gfn + s->npages)))
+                if (!((base_gfn + npages <= slot->base_gfn) ||
+                      (base_gfn >= slot->base_gfn + slot->npages)))
                         goto out_free;
         }
 
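
The rewritten overlap check above is the standard half-open interval test: the ranges [base_gfn, base_gfn + npages) and [slot->base_gfn, slot->base_gfn + slot->npages) intersect unless one ends at or before the other begins. A standalone illustration with a hypothetical helper (not part of the patch):

#include <stdbool.h>

typedef unsigned long long gfn_t;       /* stand-in for the kernel's gfn_t */

/* Hypothetical helper: true when [a, a + a_n) and [b, b + b_n) share a gfn --
 * the same condition the patch tests in __kvm_set_memory_region(). */
static bool gfn_ranges_overlap(gfn_t a, unsigned long a_n,
                               gfn_t b, unsigned long b_n)
{
        return !(a + a_n <= b || a >= b + b_n);
}
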
@@ -1208,7 +1210,7 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
                 return KVM_PFN_ERR_RO_FAULT;
 
         if (kvm_is_error_hva(addr))
-                return KVM_PFN_ERR_BAD;
+                return KVM_PFN_NOSLOT;
 
         /* Do not map writable pfn in the readonly memslot. */
         if (writable && memslot_is_readonly(slot)) {
@@ -1290,7 +1292,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
 static struct page *kvm_pfn_to_page(pfn_t pfn)
 {
-        if (is_error_pfn(pfn))
+        if (is_error_noslot_pfn(pfn))
                 return KVM_ERR_PTR_BAD_PAGE;
 
         if (kvm_is_mmio_pfn(pfn)) {
@@ -1322,7 +1324,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-        if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
+        if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
                 put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -1848,6 +1850,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
         atomic_inc(&kvm->online_vcpus);
 
         mutex_unlock(&kvm->lock);
+        kvm_arch_vcpu_postcreate(vcpu);
         return r;
 
 unlock_vcpu_destroy:
@@ -1929,10 +1932,6 @@ out_free1:
                         goto out;
                 }
                 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
-                if (r)
-                        goto out_free2;
-                r = 0;
-out_free2:
                 kfree(kvm_regs);
                 break;
         }
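
This hunk and the ones that follow drop the same redundant tail: r already holds the handler's return value when control reaches break, and the r = 0; after if (r) goto out; only executes when r is already zero, so each case can simply fall through to the common out: return. A hypothetical reduction of the pattern (demo function name assumed; the arch helper and its signature are real):

#include <linux/kvm_host.h>

/* Hypothetical sketch: the handler's return value propagates as-is,
 * with no "if (r) goto out; r = 0;" boilerplate.
 */
static long demo_set_mpstate(struct kvm_vcpu *vcpu,
                             struct kvm_mp_state *mp_state)
{
        return kvm_arch_vcpu_ioctl_set_mpstate(vcpu, mp_state);
}
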
@@ -1954,12 +1953,10 @@ out_free2:
                 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
                 if (IS_ERR(kvm_sregs)) {
                         r = PTR_ERR(kvm_sregs);
+                        kvm_sregs = NULL;
                         goto out;
                 }
                 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
-                if (r)
-                        goto out;
-                r = 0;
                 break;
         }
         case KVM_GET_MP_STATE: {
@@ -1981,9 +1978,6 @@ out_free2:
                 if (copy_from_user(&mp_state, argp, sizeof mp_state))
                         goto out;
                 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
-                if (r)
-                        goto out;
-                r = 0;
                 break;
         }
         case KVM_TRANSLATE: {
@@ -2008,9 +2002,6 @@ out_free2:
                 if (copy_from_user(&dbg, argp, sizeof dbg))
                         goto out;
                 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
-                if (r)
-                        goto out;
-                r = 0;
                 break;
         }
         case KVM_SET_SIGNAL_MASK: {
@@ -2054,12 +2045,10 @@ out_free2:
                 fpu = memdup_user(argp, sizeof(*fpu));
                 if (IS_ERR(fpu)) {
                         r = PTR_ERR(fpu);
+                        fpu = NULL;
                         goto out;
                 }
                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
-                if (r)
-                        goto out;
-                r = 0;
                 break;
         }
         default:
@@ -2129,8 +2118,6 @@ static long kvm_vm_ioctl(struct file *filp,
         switch (ioctl) {
         case KVM_CREATE_VCPU:
                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
-                if (r < 0)
-                        goto out;
                 break;
         case KVM_SET_USER_MEMORY_REGION: {
                 struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -2141,8 +2128,6 @@ static long kvm_vm_ioctl(struct file *filp,
                         goto out;
 
                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
-                if (r)
-                        goto out;
                 break;
         }
         case KVM_GET_DIRTY_LOG: {
@@ -2152,8 +2137,6 @@ static long kvm_vm_ioctl(struct file *filp,
                 if (copy_from_user(&log, argp, sizeof log))
                         goto out;
                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
-                if (r)
-                        goto out;
                 break;
         }
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2163,9 +2146,6 @@ static long kvm_vm_ioctl(struct file *filp,
                 if (copy_from_user(&zone, argp, sizeof zone))
                         goto out;
                 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
-                if (r)
-                        goto out;
-                r = 0;
                 break;
         }
         case KVM_UNREGISTER_COALESCED_MMIO: {
@@ -2174,9 +2154,6 @@ static long kvm_vm_ioctl(struct file *filp,
                 if (copy_from_user(&zone, argp, sizeof zone))
                         goto out;
                 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
-                if (r)
-                        goto out;
-                r = 0;
                 break;
         }
 #endif
@@ -2285,8 +2262,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,
                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
 
                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
-                if (r)
-                        goto out;
                 break;
         }
         default: