author     Linus Torvalds <torvalds@linux-foundation.org>    2012-12-13 18:31:08 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-12-13 18:31:08 -0500
commit     66cdd0ceaf65a18996f561b770eedde1d123b019 (patch)
tree       4892eaa422d366fce5d1e866ff1fe0988af95569 /arch
parent     896ea17d3da5f44b2625c9cda9874d7dfe447393 (diff)
parent     58b7825bc324da55415034a9f6ca5d716b8fd898 (diff)
Merge tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Marcelo Tosatti:
 "Considerable KVM/PPC work, x86 kvmclock vsyscall support,
  IA32_TSC_ADJUST MSR emulation, amongst others."

Fix up trivial conflict in kernel/sched/core.c due to cross-cpu
migration notifier added next to rq migration call-back.

* tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (156 commits)
  KVM: emulator: fix real mode segment checks in address linearization
  VMX: remove unneeded enable_unrestricted_guest check
  KVM: VMX: fix DPL during entry to protected mode
  x86/kexec: crash_vmclear_local_vmcss needs __rcu
  kvm: Fix irqfd resampler list walk
  KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump
  x86/kexec: VMCLEAR VMCSs loaded on all cpus if necessary
  KVM: MMU: optimize for set_spte
  KVM: PPC: booke: Get/set guest EPCR register using ONE_REG interface
  KVM: PPC: bookehv: Add EPCR support in mtspr/mfspr emulation
  KVM: PPC: bookehv: Add guest computation mode for irq delivery
  KVM: PPC: Make EPCR a valid field for booke64 and bookehv
  KVM: PPC: booke: Extend MAS2 EPN mask for 64-bit
  KVM: PPC: e500: Mask MAS2 EPN high 32-bits in 32/64 tlbwe emulation
  KVM: PPC: Mask ea's high 32-bits in 32/64 instr emulation
  KVM: PPC: e500: Add emulation helper for getting instruction ea
  KVM: PPC: bookehv64: Add support for interrupt handling
  KVM: PPC: bookehv: Remove GET_VCPU macro from exception handler
  KVM: PPC: booke: Fix get_tb() compile error on 64-bit
  KVM: PPC: e500: Silence bogus GCC warning in tlb code
  ...
Diffstat (limited to 'arch')
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 7
-rw-r--r--  arch/powerpc/include/asm/Kbuild | 1
-rw-r--r--  arch/powerpc/include/asm/epapr_hcalls.h | 83
-rw-r--r--  arch/powerpc/include/asm/fsl_hcalls.h | 36
-rw-r--r--  arch/powerpc/include/asm/kvm_asm.h | 1
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 12
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 33
-rw-r--r--  arch/powerpc/include/asm/kvm_booke_hv_asm.h | 29
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 68
-rw-r--r--  arch/powerpc/include/asm/kvm_para.h | 15
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 87
-rw-r--r--  arch/powerpc/include/asm/mmu-book3e.h | 2
-rw-r--r--  arch/powerpc/include/asm/mmu-hash64.h | 10
-rw-r--r--  arch/powerpc/include/asm/reg.h | 1
-rw-r--r--  arch/powerpc/include/asm/reg_booke.h | 7
-rw-r--r--  arch/powerpc/include/asm/smp.h | 8
-rw-r--r--  arch/powerpc/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/powerpc/include/uapi/asm/epapr_hcalls.h | 98
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 86
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm_para.h | 7
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 4
-rw-r--r--  arch/powerpc/kernel/epapr_hcalls.S | 28
-rw-r--r--  arch/powerpc/kernel/epapr_paravirt.c | 11
-rw-r--r--  arch/powerpc/kernel/kvm.c | 2
-rw-r--r--  arch/powerpc/kernel/ppc_ksyms.c | 5
-rw-r--r--  arch/powerpc/kernel/smp.c | 46
-rw-r--r--  arch/powerpc/kvm/44x.c | 1
-rw-r--r--  arch/powerpc/kvm/44x_emulate.c | 112
-rw-r--r--  arch/powerpc/kvm/Kconfig | 4
-rw-r--r--  arch/powerpc/kvm/Makefile | 5
-rw-r--r--  arch/powerpc/kvm/book3s.c | 125
-rw-r--r--  arch/powerpc/kvm/book3s_32_mmu_host.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_host.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 474
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 16
-rw-r--r--  arch/powerpc/kvm/book3s_exports.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 655
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_hv_ras.c | 144
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 143
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 142
-rw-r--r--  arch/powerpc/kvm/book3s_mmu_hpte.c | 5
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 294
-rw-r--r--  arch/powerpc/kvm/book3s_rmhandlers.S | 18
-rw-r--r--  arch/powerpc/kvm/booke.c | 346
-rw-r--r--  arch/powerpc/kvm/booke.h | 1
-rw-r--r--  arch/powerpc/kvm/booke_emulate.c | 36
-rw-r--r--  arch/powerpc/kvm/bookehv_interrupts.S | 145
-rw-r--r--  arch/powerpc/kvm/e500.h | 11
-rw-r--r--  arch/powerpc/kvm/e500_emulate.c | 14
-rw-r--r--  arch/powerpc/kvm/e500_tlb.c | 132
-rw-r--r--  arch/powerpc/kvm/emulate.c | 221
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 187
-rw-r--r--  arch/powerpc/kvm/trace.h | 200
-rw-r--r--  arch/powerpc/platforms/Kconfig | 1
-rw-r--r--  arch/powerpc/sysdev/fsl_msi.c | 9
-rw-r--r--  arch/powerpc/sysdev/fsl_soc.c | 2
-rw-r--r--  arch/s390/kvm/interrupt.c | 19
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 7
-rw-r--r--  arch/x86/include/asm/clocksource.h | 1
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 1
-rw-r--r--  arch/x86/include/asm/fixmap.h | 5
-rw-r--r--  arch/x86/include/asm/kexec.h | 3
-rw-r--r--  arch/x86/include/asm/kvm_guest.h | 6
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 24
-rw-r--r--  arch/x86/include/asm/msr-index.h | 1
-rw-r--r--  arch/x86/include/asm/pvclock.h | 47
-rw-r--r--  arch/x86/include/asm/vmx.h | 3
-rw-r--r--  arch/x86/include/asm/vsyscall.h | 20
-rw-r--r--  arch/x86/kernel/crash.c | 32
-rw-r--r--  arch/x86/kernel/kvm.c | 20
-rw-r--r--  arch/x86/kernel/kvmclock.c | 88
-rw-r--r--  arch/x86/kernel/pvclock.c | 143
-rw-r--r--  arch/x86/kvm/cpuid.c | 3
-rw-r--r--  arch/x86/kvm/cpuid.h | 8
-rw-r--r--  arch/x86/kvm/emulate.c | 5
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  arch/x86/kvm/mmu.c | 65
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 115
-rw-r--r--  arch/x86/kvm/svm.c | 48
-rw-r--r--  arch/x86/kvm/trace.h | 63
-rw-r--r--  arch/x86/kvm/vmx.c | 203
-rw-r--r--  arch/x86/kvm/x86.c | 548
-rw-r--r--  arch/x86/kvm/x86.h | 2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 81
-rw-r--r--  arch/x86/vdso/vgetcpu.c | 11
86 files changed, 4525 insertions, 1193 deletions
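
One of the new KVM/PPC userspace interfaces in this pull is the guest hashed page table (HPT) file descriptor described in the arch/powerpc/include/uapi/asm/kvm.h hunk further down (struct kvm_get_htab_fd, struct kvm_get_htab_header and the KVM_GET_HTAB_* flags). The following is a minimal sketch, not part of the patch series, of how a userspace tool might walk that stream. It assumes the updated uapi headers providing KVM_PPC_GET_HTAB_FD, and a VM file descriptor already obtained via KVM_CREATE_VM; termination and error handling are simplified.

/*
 * Illustrative only: read the HPT stream exposed by the new
 * KVM_PPC_GET_HTAB_FD vm ioctl.  Each read() returns a series of
 * records: a kvm_get_htab_header followed by n_valid HPTEs of
 * 16 bytes each; the n_invalid HPTEs that follow are implied and
 * not present in the stream (see the comment in the kvm.h hunk).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>  /* assumed: KVM_PPC_GET_HTAB_FD, kvm_get_htab_fd, kvm_get_htab_header */

static int dump_htab(int vmfd)
{
	struct kvm_get_htab_fd ghf = {
		.flags = 0,          /* or KVM_GET_HTAB_BOLTED_ONLY */
		.start_index = 0,    /* start from the first HPTE */
	};
	unsigned char buf[65536];
	ssize_t len;
	int htabfd;

	/* Ask KVM for a file descriptor that streams the guest HPT. */
	htabfd = ioctl(vmfd, KVM_PPC_GET_HTAB_FD, &ghf);
	if (htabfd < 0)
		return -1;

	while ((len = read(htabfd, buf, sizeof(buf))) > 0) {
		unsigned char *p = buf;

		while (p + sizeof(struct kvm_get_htab_header) <= buf + len) {
			struct kvm_get_htab_header hdr;

			memcpy(&hdr, p, sizeof(hdr));
			p += sizeof(hdr);
			printf("index %u: %u valid, %u invalid HPTEs\n",
			       hdr.index, hdr.n_valid, hdr.n_invalid);
			p += (size_t)hdr.n_valid * 16;  /* skip the HPTE data */
		}
	}

	close(htabfd);
	return 0;
}

The same record layout is used when writing to the descriptor (with KVM_GET_HTAB_WRITE set in flags), which is what makes the interface usable for HPT save/restore during migration.
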
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0a88cb5d316d..bd1c51555038 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1330,6 +1330,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
1330 return 0; 1330 return 0;
1331} 1331}
1332 1332
1333int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
1334{
1335 return 0;
1336}
1337
1333int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 1338int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
1334{ 1339{
1335 return -EINVAL; 1340 return -EINVAL;
@@ -1362,11 +1367,9 @@ static void kvm_release_vm_pages(struct kvm *kvm)
1362 struct kvm_memslots *slots; 1367 struct kvm_memslots *slots;
1363 struct kvm_memory_slot *memslot; 1368 struct kvm_memory_slot *memslot;
1364 int j; 1369 int j;
1365 unsigned long base_gfn;
1366 1370
1367 slots = kvm_memslots(kvm); 1371 slots = kvm_memslots(kvm);
1368 kvm_for_each_memslot(memslot, slots) { 1372 kvm_for_each_memslot(memslot, slots) {
1369 base_gfn = memslot->base_gfn;
1370 for (j = 0; j < memslot->npages; j++) { 1373 for (j = 0; j < memslot->npages; j++) {
1371 if (memslot->rmap[j]) 1374 if (memslot->rmap[j])
1372 put_page((struct page *)memslot->rmap[j]); 1375 put_page((struct page *)memslot->rmap[j]);
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 2d62b484b3fc..650757c300db 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -1,5 +1,4 @@
1 1
2
3generic-y += clkdev.h 2generic-y += clkdev.h
4generic-y += rwsem.h 3generic-y += rwsem.h
5generic-y += trace_clock.h 4generic-y += trace_clock.h
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index bf2c06c33871..d3d634274d2c 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -50,64 +50,13 @@
50#ifndef _EPAPR_HCALLS_H 50#ifndef _EPAPR_HCALLS_H
51#define _EPAPR_HCALLS_H 51#define _EPAPR_HCALLS_H
52 52
53#include <uapi/asm/epapr_hcalls.h>
54
55#ifndef __ASSEMBLY__
53#include <linux/types.h> 56#include <linux/types.h>
54#include <linux/errno.h> 57#include <linux/errno.h>
55#include <asm/byteorder.h> 58#include <asm/byteorder.h>
56 59
57#define EV_BYTE_CHANNEL_SEND 1
58#define EV_BYTE_CHANNEL_RECEIVE 2
59#define EV_BYTE_CHANNEL_POLL 3
60#define EV_INT_SET_CONFIG 4
61#define EV_INT_GET_CONFIG 5
62#define EV_INT_SET_MASK 6
63#define EV_INT_GET_MASK 7
64#define EV_INT_IACK 9
65#define EV_INT_EOI 10
66#define EV_INT_SEND_IPI 11
67#define EV_INT_SET_TASK_PRIORITY 12
68#define EV_INT_GET_TASK_PRIORITY 13
69#define EV_DOORBELL_SEND 14
70#define EV_MSGSND 15
71#define EV_IDLE 16
72
73/* vendor ID: epapr */
74#define EV_LOCAL_VENDOR_ID 0 /* for private use */
75#define EV_EPAPR_VENDOR_ID 1
76#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */
77#define EV_IBM_VENDOR_ID 3 /* IBM */
78#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */
79#define EV_ENEA_VENDOR_ID 5 /* Enea */
80#define EV_WR_VENDOR_ID 6 /* Wind River Systems */
81#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */
82#define EV_KVM_VENDOR_ID 42 /* KVM */
83
84/* The max number of bytes that a byte channel can send or receive per call */
85#define EV_BYTE_CHANNEL_MAX_BYTES 16
86
87
88#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
89#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
90
91/* epapr error codes */
92#define EV_EPERM 1 /* Operation not permitted */
93#define EV_ENOENT 2 /* Entry Not Found */
94#define EV_EIO 3 /* I/O error occured */
95#define EV_EAGAIN 4 /* The operation had insufficient
96 * resources to complete and should be
97 * retried
98 */
99#define EV_ENOMEM 5 /* There was insufficient memory to
100 * complete the operation */
101#define EV_EFAULT 6 /* Bad guest address */
102#define EV_ENODEV 7 /* No such device */
103#define EV_EINVAL 8 /* An argument supplied to the hcall
104 was out of range or invalid */
105#define EV_INTERNAL 9 /* An internal error occured */
106#define EV_CONFIG 10 /* A configuration error was detected */
107#define EV_INVALID_STATE 11 /* The object is in an invalid state */
108#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
109#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */
110
111/* 60/*
112 * Hypercall register clobber list 61 * Hypercall register clobber list
113 * 62 *
@@ -193,7 +142,7 @@ static inline unsigned int ev_int_set_config(unsigned int interrupt,
193 r5 = priority; 142 r5 = priority;
194 r6 = destination; 143 r6 = destination;
195 144
196 __asm__ __volatile__ ("sc 1" 145 asm volatile("bl epapr_hypercall_start"
197 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6) 146 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
198 : : EV_HCALL_CLOBBERS4 147 : : EV_HCALL_CLOBBERS4
199 ); 148 );
@@ -222,7 +171,7 @@ static inline unsigned int ev_int_get_config(unsigned int interrupt,
222 r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG); 171 r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG);
223 r3 = interrupt; 172 r3 = interrupt;
224 173
225 __asm__ __volatile__ ("sc 1" 174 asm volatile("bl epapr_hypercall_start"
226 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6) 175 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6)
227 : : EV_HCALL_CLOBBERS4 176 : : EV_HCALL_CLOBBERS4
228 ); 177 );
@@ -252,7 +201,7 @@ static inline unsigned int ev_int_set_mask(unsigned int interrupt,
252 r3 = interrupt; 201 r3 = interrupt;
253 r4 = mask; 202 r4 = mask;
254 203
255 __asm__ __volatile__ ("sc 1" 204 asm volatile("bl epapr_hypercall_start"
256 : "+r" (r11), "+r" (r3), "+r" (r4) 205 : "+r" (r11), "+r" (r3), "+r" (r4)
257 : : EV_HCALL_CLOBBERS2 206 : : EV_HCALL_CLOBBERS2
258 ); 207 );
@@ -277,7 +226,7 @@ static inline unsigned int ev_int_get_mask(unsigned int interrupt,
277 r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK); 226 r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK);
278 r3 = interrupt; 227 r3 = interrupt;
279 228
280 __asm__ __volatile__ ("sc 1" 229 asm volatile("bl epapr_hypercall_start"
281 : "+r" (r11), "+r" (r3), "=r" (r4) 230 : "+r" (r11), "+r" (r3), "=r" (r4)
282 : : EV_HCALL_CLOBBERS2 231 : : EV_HCALL_CLOBBERS2
283 ); 232 );
@@ -305,7 +254,7 @@ static inline unsigned int ev_int_eoi(unsigned int interrupt)
305 r11 = EV_HCALL_TOKEN(EV_INT_EOI); 254 r11 = EV_HCALL_TOKEN(EV_INT_EOI);
306 r3 = interrupt; 255 r3 = interrupt;
307 256
308 __asm__ __volatile__ ("sc 1" 257 asm volatile("bl epapr_hypercall_start"
309 : "+r" (r11), "+r" (r3) 258 : "+r" (r11), "+r" (r3)
310 : : EV_HCALL_CLOBBERS1 259 : : EV_HCALL_CLOBBERS1
311 ); 260 );
@@ -344,7 +293,7 @@ static inline unsigned int ev_byte_channel_send(unsigned int handle,
344 r7 = be32_to_cpu(p[2]); 293 r7 = be32_to_cpu(p[2]);
345 r8 = be32_to_cpu(p[3]); 294 r8 = be32_to_cpu(p[3]);
346 295
347 __asm__ __volatile__ ("sc 1" 296 asm volatile("bl epapr_hypercall_start"
348 : "+r" (r11), "+r" (r3), 297 : "+r" (r11), "+r" (r3),
349 "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8) 298 "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8)
350 : : EV_HCALL_CLOBBERS6 299 : : EV_HCALL_CLOBBERS6
@@ -383,7 +332,7 @@ static inline unsigned int ev_byte_channel_receive(unsigned int handle,
383 r3 = handle; 332 r3 = handle;
384 r4 = *count; 333 r4 = *count;
385 334
386 __asm__ __volatile__ ("sc 1" 335 asm volatile("bl epapr_hypercall_start"
387 : "+r" (r11), "+r" (r3), "+r" (r4), 336 : "+r" (r11), "+r" (r3), "+r" (r4),
388 "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8) 337 "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8)
389 : : EV_HCALL_CLOBBERS6 338 : : EV_HCALL_CLOBBERS6
@@ -421,7 +370,7 @@ static inline unsigned int ev_byte_channel_poll(unsigned int handle,
421 r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL); 370 r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL);
422 r3 = handle; 371 r3 = handle;
423 372
424 __asm__ __volatile__ ("sc 1" 373 asm volatile("bl epapr_hypercall_start"
425 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5) 374 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5)
426 : : EV_HCALL_CLOBBERS3 375 : : EV_HCALL_CLOBBERS3
427 ); 376 );
@@ -454,7 +403,7 @@ static inline unsigned int ev_int_iack(unsigned int handle,
454 r11 = EV_HCALL_TOKEN(EV_INT_IACK); 403 r11 = EV_HCALL_TOKEN(EV_INT_IACK);
455 r3 = handle; 404 r3 = handle;
456 405
457 __asm__ __volatile__ ("sc 1" 406 asm volatile("bl epapr_hypercall_start"
458 : "+r" (r11), "+r" (r3), "=r" (r4) 407 : "+r" (r11), "+r" (r3), "=r" (r4)
459 : : EV_HCALL_CLOBBERS2 408 : : EV_HCALL_CLOBBERS2
460 ); 409 );
@@ -478,7 +427,7 @@ static inline unsigned int ev_doorbell_send(unsigned int handle)
478 r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND); 427 r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND);
479 r3 = handle; 428 r3 = handle;
480 429
481 __asm__ __volatile__ ("sc 1" 430 asm volatile("bl epapr_hypercall_start"
482 : "+r" (r11), "+r" (r3) 431 : "+r" (r11), "+r" (r3)
483 : : EV_HCALL_CLOBBERS1 432 : : EV_HCALL_CLOBBERS1
484 ); 433 );
@@ -498,12 +447,12 @@ static inline unsigned int ev_idle(void)
498 447
499 r11 = EV_HCALL_TOKEN(EV_IDLE); 448 r11 = EV_HCALL_TOKEN(EV_IDLE);
500 449
501 __asm__ __volatile__ ("sc 1" 450 asm volatile("bl epapr_hypercall_start"
502 : "+r" (r11), "=r" (r3) 451 : "+r" (r11), "=r" (r3)
503 : : EV_HCALL_CLOBBERS1 452 : : EV_HCALL_CLOBBERS1
504 ); 453 );
505 454
506 return r3; 455 return r3;
507} 456}
508 457#endif /* !__ASSEMBLY__ */
509#endif 458#endif /* _EPAPR_HCALLS_H */
diff --git a/arch/powerpc/include/asm/fsl_hcalls.h b/arch/powerpc/include/asm/fsl_hcalls.h
index 922d9b5fe3d5..3abb58394da4 100644
--- a/arch/powerpc/include/asm/fsl_hcalls.h
+++ b/arch/powerpc/include/asm/fsl_hcalls.h
@@ -96,7 +96,7 @@ static inline unsigned int fh_send_nmi(unsigned int vcpu_mask)
96 r11 = FH_HCALL_TOKEN(FH_SEND_NMI); 96 r11 = FH_HCALL_TOKEN(FH_SEND_NMI);
97 r3 = vcpu_mask; 97 r3 = vcpu_mask;
98 98
99 __asm__ __volatile__ ("sc 1" 99 asm volatile("bl epapr_hypercall_start"
100 : "+r" (r11), "+r" (r3) 100 : "+r" (r11), "+r" (r3)
101 : : EV_HCALL_CLOBBERS1 101 : : EV_HCALL_CLOBBERS1
102 ); 102 );
@@ -151,7 +151,7 @@ static inline unsigned int fh_partition_get_dtprop(int handle,
151 r9 = (uint32_t)propvalue_addr; 151 r9 = (uint32_t)propvalue_addr;
152 r10 = *propvalue_len; 152 r10 = *propvalue_len;
153 153
154 __asm__ __volatile__ ("sc 1" 154 asm volatile("bl epapr_hypercall_start"
155 : "+r" (r11), 155 : "+r" (r11),
156 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), 156 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7),
157 "+r" (r8), "+r" (r9), "+r" (r10) 157 "+r" (r8), "+r" (r9), "+r" (r10)
@@ -205,7 +205,7 @@ static inline unsigned int fh_partition_set_dtprop(int handle,
205 r9 = (uint32_t)propvalue_addr; 205 r9 = (uint32_t)propvalue_addr;
206 r10 = propvalue_len; 206 r10 = propvalue_len;
207 207
208 __asm__ __volatile__ ("sc 1" 208 asm volatile("bl epapr_hypercall_start"
209 : "+r" (r11), 209 : "+r" (r11),
210 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), 210 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7),
211 "+r" (r8), "+r" (r9), "+r" (r10) 211 "+r" (r8), "+r" (r9), "+r" (r10)
@@ -229,7 +229,7 @@ static inline unsigned int fh_partition_restart(unsigned int partition)
229 r11 = FH_HCALL_TOKEN(FH_PARTITION_RESTART); 229 r11 = FH_HCALL_TOKEN(FH_PARTITION_RESTART);
230 r3 = partition; 230 r3 = partition;
231 231
232 __asm__ __volatile__ ("sc 1" 232 asm volatile("bl epapr_hypercall_start"
233 : "+r" (r11), "+r" (r3) 233 : "+r" (r11), "+r" (r3)
234 : : EV_HCALL_CLOBBERS1 234 : : EV_HCALL_CLOBBERS1
235 ); 235 );
@@ -262,7 +262,7 @@ static inline unsigned int fh_partition_get_status(unsigned int partition,
262 r11 = FH_HCALL_TOKEN(FH_PARTITION_GET_STATUS); 262 r11 = FH_HCALL_TOKEN(FH_PARTITION_GET_STATUS);
263 r3 = partition; 263 r3 = partition;
264 264
265 __asm__ __volatile__ ("sc 1" 265 asm volatile("bl epapr_hypercall_start"
266 : "+r" (r11), "+r" (r3), "=r" (r4) 266 : "+r" (r11), "+r" (r3), "=r" (r4)
267 : : EV_HCALL_CLOBBERS2 267 : : EV_HCALL_CLOBBERS2
268 ); 268 );
@@ -295,7 +295,7 @@ static inline unsigned int fh_partition_start(unsigned int partition,
295 r4 = entry_point; 295 r4 = entry_point;
296 r5 = load; 296 r5 = load;
297 297
298 __asm__ __volatile__ ("sc 1" 298 asm volatile("bl epapr_hypercall_start"
299 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5) 299 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5)
300 : : EV_HCALL_CLOBBERS3 300 : : EV_HCALL_CLOBBERS3
301 ); 301 );
@@ -317,7 +317,7 @@ static inline unsigned int fh_partition_stop(unsigned int partition)
317 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP); 317 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP);
318 r3 = partition; 318 r3 = partition;
319 319
320 __asm__ __volatile__ ("sc 1" 320 asm volatile("bl epapr_hypercall_start"
321 : "+r" (r11), "+r" (r3) 321 : "+r" (r11), "+r" (r3)
322 : : EV_HCALL_CLOBBERS1 322 : : EV_HCALL_CLOBBERS1
323 ); 323 );
@@ -376,7 +376,7 @@ static inline unsigned int fh_partition_memcpy(unsigned int source,
376#endif 376#endif
377 r7 = count; 377 r7 = count;
378 378
379 __asm__ __volatile__ ("sc 1" 379 asm volatile("bl epapr_hypercall_start"
380 : "+r" (r11), 380 : "+r" (r11),
381 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7) 381 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7)
382 : : EV_HCALL_CLOBBERS5 382 : : EV_HCALL_CLOBBERS5
@@ -399,7 +399,7 @@ static inline unsigned int fh_dma_enable(unsigned int liodn)
399 r11 = FH_HCALL_TOKEN(FH_DMA_ENABLE); 399 r11 = FH_HCALL_TOKEN(FH_DMA_ENABLE);
400 r3 = liodn; 400 r3 = liodn;
401 401
402 __asm__ __volatile__ ("sc 1" 402 asm volatile("bl epapr_hypercall_start"
403 : "+r" (r11), "+r" (r3) 403 : "+r" (r11), "+r" (r3)
404 : : EV_HCALL_CLOBBERS1 404 : : EV_HCALL_CLOBBERS1
405 ); 405 );
@@ -421,7 +421,7 @@ static inline unsigned int fh_dma_disable(unsigned int liodn)
421 r11 = FH_HCALL_TOKEN(FH_DMA_DISABLE); 421 r11 = FH_HCALL_TOKEN(FH_DMA_DISABLE);
422 r3 = liodn; 422 r3 = liodn;
423 423
424 __asm__ __volatile__ ("sc 1" 424 asm volatile("bl epapr_hypercall_start"
425 : "+r" (r11), "+r" (r3) 425 : "+r" (r11), "+r" (r3)
426 : : EV_HCALL_CLOBBERS1 426 : : EV_HCALL_CLOBBERS1
427 ); 427 );
@@ -447,7 +447,7 @@ static inline unsigned int fh_vmpic_get_msir(unsigned int interrupt,
447 r11 = FH_HCALL_TOKEN(FH_VMPIC_GET_MSIR); 447 r11 = FH_HCALL_TOKEN(FH_VMPIC_GET_MSIR);
448 r3 = interrupt; 448 r3 = interrupt;
449 449
450 __asm__ __volatile__ ("sc 1" 450 asm volatile("bl epapr_hypercall_start"
451 : "+r" (r11), "+r" (r3), "=r" (r4) 451 : "+r" (r11), "+r" (r3), "=r" (r4)
452 : : EV_HCALL_CLOBBERS2 452 : : EV_HCALL_CLOBBERS2
453 ); 453 );
@@ -469,7 +469,7 @@ static inline unsigned int fh_system_reset(void)
469 469
470 r11 = FH_HCALL_TOKEN(FH_SYSTEM_RESET); 470 r11 = FH_HCALL_TOKEN(FH_SYSTEM_RESET);
471 471
472 __asm__ __volatile__ ("sc 1" 472 asm volatile("bl epapr_hypercall_start"
473 : "+r" (r11), "=r" (r3) 473 : "+r" (r11), "=r" (r3)
474 : : EV_HCALL_CLOBBERS1 474 : : EV_HCALL_CLOBBERS1
475 ); 475 );
@@ -506,7 +506,7 @@ static inline unsigned int fh_err_get_info(int queue, uint32_t *bufsize,
506 r6 = addr_lo; 506 r6 = addr_lo;
507 r7 = peek; 507 r7 = peek;
508 508
509 __asm__ __volatile__ ("sc 1" 509 asm volatile("bl epapr_hypercall_start"
510 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), 510 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6),
511 "+r" (r7) 511 "+r" (r7)
512 : : EV_HCALL_CLOBBERS5 512 : : EV_HCALL_CLOBBERS5
@@ -542,7 +542,7 @@ static inline unsigned int fh_get_core_state(unsigned int handle,
542 r3 = handle; 542 r3 = handle;
543 r4 = vcpu; 543 r4 = vcpu;
544 544
545 __asm__ __volatile__ ("sc 1" 545 asm volatile("bl epapr_hypercall_start"
546 : "+r" (r11), "+r" (r3), "+r" (r4) 546 : "+r" (r11), "+r" (r3), "+r" (r4)
547 : : EV_HCALL_CLOBBERS2 547 : : EV_HCALL_CLOBBERS2
548 ); 548 );
@@ -572,7 +572,7 @@ static inline unsigned int fh_enter_nap(unsigned int handle, unsigned int vcpu)
572 r3 = handle; 572 r3 = handle;
573 r4 = vcpu; 573 r4 = vcpu;
574 574
575 __asm__ __volatile__ ("sc 1" 575 asm volatile("bl epapr_hypercall_start"
576 : "+r" (r11), "+r" (r3), "+r" (r4) 576 : "+r" (r11), "+r" (r3), "+r" (r4)
577 : : EV_HCALL_CLOBBERS2 577 : : EV_HCALL_CLOBBERS2
578 ); 578 );
@@ -597,7 +597,7 @@ static inline unsigned int fh_exit_nap(unsigned int handle, unsigned int vcpu)
597 r3 = handle; 597 r3 = handle;
598 r4 = vcpu; 598 r4 = vcpu;
599 599
600 __asm__ __volatile__ ("sc 1" 600 asm volatile("bl epapr_hypercall_start"
601 : "+r" (r11), "+r" (r3), "+r" (r4) 601 : "+r" (r11), "+r" (r3), "+r" (r4)
602 : : EV_HCALL_CLOBBERS2 602 : : EV_HCALL_CLOBBERS2
603 ); 603 );
@@ -618,7 +618,7 @@ static inline unsigned int fh_claim_device(unsigned int handle)
618 r11 = FH_HCALL_TOKEN(FH_CLAIM_DEVICE); 618 r11 = FH_HCALL_TOKEN(FH_CLAIM_DEVICE);
619 r3 = handle; 619 r3 = handle;
620 620
621 __asm__ __volatile__ ("sc 1" 621 asm volatile("bl epapr_hypercall_start"
622 : "+r" (r11), "+r" (r3) 622 : "+r" (r11), "+r" (r3)
623 : : EV_HCALL_CLOBBERS1 623 : : EV_HCALL_CLOBBERS1
624 ); 624 );
@@ -645,7 +645,7 @@ static inline unsigned int fh_partition_stop_dma(unsigned int handle)
645 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP_DMA); 645 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP_DMA);
646 r3 = handle; 646 r3 = handle;
647 647
648 __asm__ __volatile__ ("sc 1" 648 asm volatile("bl epapr_hypercall_start"
649 : "+r" (r11), "+r" (r3) 649 : "+r" (r11), "+r" (r3)
650 : : EV_HCALL_CLOBBERS1 650 : : EV_HCALL_CLOBBERS1
651 ); 651 );
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 76fdcfef0889..aabcdba8f6b0 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -118,6 +118,7 @@
118 118
119#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ 119#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */
120#define RESUME_FLAG_HOST (1<<1) /* Resume host? */ 120#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
121#define RESUME_FLAG_ARCH1 (1<<2)
121 122
122#define RESUME_GUEST 0 123#define RESUME_GUEST 0
123#define RESUME_GUEST_NV RESUME_FLAG_NV 124#define RESUME_GUEST_NV RESUME_FLAG_NV
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 7aefdb3e1ce4..5a56e1c5f851 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -81,6 +81,8 @@ struct kvmppc_vcpu_book3s {
81 u64 sdr1; 81 u64 sdr1;
82 u64 hior; 82 u64 hior;
83 u64 msr_mask; 83 u64 msr_mask;
84 u64 purr_offset;
85 u64 spurr_offset;
84#ifdef CONFIG_PPC_BOOK3S_32 86#ifdef CONFIG_PPC_BOOK3S_32
85 u32 vsid_pool[VSID_POOL_SIZE]; 87 u32 vsid_pool[VSID_POOL_SIZE];
86 u32 vsid_next; 88 u32 vsid_next;
@@ -157,10 +159,14 @@ extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
157extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); 159extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
158extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 160extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
159 long pte_index, unsigned long pteh, unsigned long ptel); 161 long pte_index, unsigned long pteh, unsigned long ptel);
160extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 162extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
161 long pte_index, unsigned long pteh, unsigned long ptel); 163 long pte_index, unsigned long pteh, unsigned long ptel,
164 pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
165extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
166 unsigned long pte_index, unsigned long avpn,
167 unsigned long *hpret);
162extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, 168extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
163 struct kvm_memory_slot *memslot); 169 struct kvm_memory_slot *memslot, unsigned long *map);
164 170
165extern void kvmppc_entry_trampoline(void); 171extern void kvmppc_entry_trampoline(void);
166extern void kvmppc_hv_entry_trampoline(void); 172extern void kvmppc_hv_entry_trampoline(void);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 0dd1d86d3e31..38bec1dc9928 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -50,6 +50,15 @@ extern int kvm_hpt_order; /* order of preallocated HPTs */
50#define HPTE_V_HVLOCK 0x40UL 50#define HPTE_V_HVLOCK 0x40UL
51#define HPTE_V_ABSENT 0x20UL 51#define HPTE_V_ABSENT 0x20UL
52 52
53/*
54 * We use this bit in the guest_rpte field of the revmap entry
55 * to indicate a modified HPTE.
56 */
57#define HPTE_GR_MODIFIED (1ul << 62)
58
59/* These bits are reserved in the guest view of the HPTE */
60#define HPTE_GR_RESERVED HPTE_GR_MODIFIED
61
53static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) 62static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
54{ 63{
55 unsigned long tmp, old; 64 unsigned long tmp, old;
@@ -60,7 +69,7 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
60 " ori %0,%0,%4\n" 69 " ori %0,%0,%4\n"
61 " stdcx. %0,0,%2\n" 70 " stdcx. %0,0,%2\n"
62 " beq+ 2f\n" 71 " beq+ 2f\n"
63 " li %1,%3\n" 72 " mr %1,%3\n"
64 "2: isync" 73 "2: isync"
65 : "=&r" (tmp), "=&r" (old) 74 : "=&r" (tmp), "=&r" (old)
66 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) 75 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
@@ -237,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
237 return !(memslot->base_gfn & mask) && !(memslot->npages & mask); 246 return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
238} 247}
239 248
249/*
250 * This works for 4k, 64k and 16M pages on POWER7,
251 * and 4k and 16M pages on PPC970.
252 */
253static inline unsigned long slb_pgsize_encoding(unsigned long psize)
254{
255 unsigned long senc = 0;
256
257 if (psize > 0x1000) {
258 senc = SLB_VSID_L;
259 if (psize == 0x10000)
260 senc |= SLB_VSID_LP_01;
261 }
262 return senc;
263}
264
265static inline int is_vrma_hpte(unsigned long hpte_v)
266{
267 return (hpte_v & ~0xffffffUL) ==
268 (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
269}
270
240#endif /* __ASM_KVM_BOOK3S_64_H__ */ 271#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
index 30a600fa1b6a..3a79f5325712 100644
--- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h
+++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
@@ -17,6 +17,7 @@
17 * there are no exceptions for which we fall through directly to 17 * there are no exceptions for which we fall through directly to
18 * the normal host handler. 18 * the normal host handler.
19 * 19 *
20 * 32-bit host
20 * Expected inputs (normal exceptions): 21 * Expected inputs (normal exceptions):
21 * SCRATCH0 = saved r10 22 * SCRATCH0 = saved r10
22 * r10 = thread struct 23 * r10 = thread struct
@@ -33,14 +34,38 @@
33 * *(r8 + GPR9) = saved r9 34 * *(r8 + GPR9) = saved r9
34 * *(r8 + GPR10) = saved r10 (r10 not yet clobbered) 35 * *(r8 + GPR10) = saved r10 (r10 not yet clobbered)
35 * *(r8 + GPR11) = saved r11 36 * *(r8 + GPR11) = saved r11
37 *
38 * 64-bit host
39 * Expected inputs (GEN/GDBELL/DBG/MC exception types):
40 * r10 = saved CR
41 * r13 = PACA_POINTER
42 * *(r13 + PACA_EX##type + EX_R10) = saved r10
43 * *(r13 + PACA_EX##type + EX_R11) = saved r11
44 * SPRN_SPRG_##type##_SCRATCH = saved r13
45 *
46 * Expected inputs (CRIT exception type):
47 * r10 = saved CR
48 * r13 = PACA_POINTER
49 * *(r13 + PACA_EX##type + EX_R10) = saved r10
50 * *(r13 + PACA_EX##type + EX_R11) = saved r11
51 * *(r13 + PACA_EX##type + EX_R13) = saved r13
52 *
53 * Expected inputs (TLB exception type):
54 * r10 = saved CR
55 * r13 = PACA_POINTER
56 * *(r13 + PACA_EX##type + EX_TLB_R10) = saved r10
57 * *(r13 + PACA_EX##type + EX_TLB_R11) = saved r11
58 * SPRN_SPRG_GEN_SCRATCH = saved r13
59 *
60 * Only the bolted version of TLB miss exception handlers is supported now.
36 */ 61 */
37.macro DO_KVM intno srr1 62.macro DO_KVM intno srr1
38#ifdef CONFIG_KVM_BOOKE_HV 63#ifdef CONFIG_KVM_BOOKE_HV
39BEGIN_FTR_SECTION 64BEGIN_FTR_SECTION
40 mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ 65 mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */
41 bf 3, kvmppc_resume_\intno\()_\srr1 66 bf 3, 1975f
42 b kvmppc_handler_\intno\()_\srr1 67 b kvmppc_handler_\intno\()_\srr1
43kvmppc_resume_\intno\()_\srr1: 681975:
44END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) 69END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
45#endif 70#endif
46.endm 71.endm
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 28e8f5e5c63e..ca9bf459db6a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,7 +46,7 @@
46#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 46#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
47#endif 47#endif
48 48
49#ifdef CONFIG_KVM_BOOK3S_64_HV 49#if !defined(CONFIG_KVM_440)
50#include <linux/mmu_notifier.h> 50#include <linux/mmu_notifier.h>
51 51
52#define KVM_ARCH_WANT_MMU_NOTIFIER 52#define KVM_ARCH_WANT_MMU_NOTIFIER
@@ -204,7 +204,7 @@ struct revmap_entry {
204}; 204};
205 205
206/* 206/*
207 * We use the top bit of each memslot->rmap entry as a lock bit, 207 * We use the top bit of each memslot->arch.rmap entry as a lock bit,
208 * and bit 32 as a present flag. The bottom 32 bits are the 208 * and bit 32 as a present flag. The bottom 32 bits are the
209 * index in the guest HPT of a HPTE that points to the page. 209 * index in the guest HPT of a HPTE that points to the page.
210 */ 210 */
@@ -215,14 +215,17 @@ struct revmap_entry {
215#define KVMPPC_RMAP_PRESENT 0x100000000ul 215#define KVMPPC_RMAP_PRESENT 0x100000000ul
216#define KVMPPC_RMAP_INDEX 0xfffffffful 216#define KVMPPC_RMAP_INDEX 0xfffffffful
217 217
218/* Low-order bits in kvm->arch.slot_phys[][] */ 218/* Low-order bits in memslot->arch.slot_phys[] */
219#define KVMPPC_PAGE_ORDER_MASK 0x1f 219#define KVMPPC_PAGE_ORDER_MASK 0x1f
220#define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */ 220#define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */
221#define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ 221#define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */
222#define KVMPPC_GOT_PAGE 0x80 222#define KVMPPC_GOT_PAGE 0x80
223 223
224struct kvm_arch_memory_slot { 224struct kvm_arch_memory_slot {
225#ifdef CONFIG_KVM_BOOK3S_64_HV
225 unsigned long *rmap; 226 unsigned long *rmap;
227 unsigned long *slot_phys;
228#endif /* CONFIG_KVM_BOOK3S_64_HV */
226}; 229};
227 230
228struct kvm_arch { 231struct kvm_arch {
@@ -243,12 +246,12 @@ struct kvm_arch {
243 int using_mmu_notifiers; 246 int using_mmu_notifiers;
244 u32 hpt_order; 247 u32 hpt_order;
245 atomic_t vcpus_running; 248 atomic_t vcpus_running;
249 u32 online_vcores;
246 unsigned long hpt_npte; 250 unsigned long hpt_npte;
247 unsigned long hpt_mask; 251 unsigned long hpt_mask;
252 atomic_t hpte_mod_interest;
248 spinlock_t slot_phys_lock; 253 spinlock_t slot_phys_lock;
249 unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; 254 cpumask_t need_tlb_flush;
250 int slot_npages[KVM_MEM_SLOTS_NUM];
251 unsigned short last_vcpu[NR_CPUS];
252 struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; 255 struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
253 struct kvmppc_linear_info *hpt_li; 256 struct kvmppc_linear_info *hpt_li;
254#endif /* CONFIG_KVM_BOOK3S_64_HV */ 257#endif /* CONFIG_KVM_BOOK3S_64_HV */
@@ -273,6 +276,7 @@ struct kvmppc_vcore {
273 int nap_count; 276 int nap_count;
274 int napping_threads; 277 int napping_threads;
275 u16 pcpu; 278 u16 pcpu;
279 u16 last_cpu;
276 u8 vcore_state; 280 u8 vcore_state;
277 u8 in_guest; 281 u8 in_guest;
278 struct list_head runnable_threads; 282 struct list_head runnable_threads;
@@ -288,9 +292,10 @@ struct kvmppc_vcore {
288 292
289/* Values for vcore_state */ 293/* Values for vcore_state */
290#define VCORE_INACTIVE 0 294#define VCORE_INACTIVE 0
291#define VCORE_RUNNING 1 295#define VCORE_SLEEPING 1
292#define VCORE_EXITING 2 296#define VCORE_STARTING 2
293#define VCORE_SLEEPING 3 297#define VCORE_RUNNING 3
298#define VCORE_EXITING 4
294 299
295/* 300/*
296 * Struct used to manage memory for a virtual processor area 301 * Struct used to manage memory for a virtual processor area
@@ -346,6 +351,27 @@ struct kvmppc_slb {
346 bool class : 1; 351 bool class : 1;
347}; 352};
348 353
354# ifdef CONFIG_PPC_FSL_BOOK3E
355#define KVMPPC_BOOKE_IAC_NUM 2
356#define KVMPPC_BOOKE_DAC_NUM 2
357# else
358#define KVMPPC_BOOKE_IAC_NUM 4
359#define KVMPPC_BOOKE_DAC_NUM 2
360# endif
361#define KVMPPC_BOOKE_MAX_IAC 4
362#define KVMPPC_BOOKE_MAX_DAC 2
363
364struct kvmppc_booke_debug_reg {
365 u32 dbcr0;
366 u32 dbcr1;
367 u32 dbcr2;
368#ifdef CONFIG_KVM_E500MC
369 u32 dbcr4;
370#endif
371 u64 iac[KVMPPC_BOOKE_MAX_IAC];
372 u64 dac[KVMPPC_BOOKE_MAX_DAC];
373};
374
349struct kvm_vcpu_arch { 375struct kvm_vcpu_arch {
350 ulong host_stack; 376 ulong host_stack;
351 u32 host_pid; 377 u32 host_pid;
@@ -380,13 +406,18 @@ struct kvm_vcpu_arch {
380 u32 host_mas4; 406 u32 host_mas4;
381 u32 host_mas6; 407 u32 host_mas6;
382 u32 shadow_epcr; 408 u32 shadow_epcr;
383 u32 epcr;
384 u32 shadow_msrp; 409 u32 shadow_msrp;
385 u32 eplc; 410 u32 eplc;
386 u32 epsc; 411 u32 epsc;
387 u32 oldpir; 412 u32 oldpir;
388#endif 413#endif
389 414
415#if defined(CONFIG_BOOKE)
416#if defined(CONFIG_KVM_BOOKE_HV) || defined(CONFIG_64BIT)
417 u32 epcr;
418#endif
419#endif
420
390#ifdef CONFIG_PPC_BOOK3S 421#ifdef CONFIG_PPC_BOOK3S
391 /* For Gekko paired singles */ 422 /* For Gekko paired singles */
392 u32 qpr[32]; 423 u32 qpr[32];
@@ -440,8 +471,6 @@ struct kvm_vcpu_arch {
440 471
441 u32 ccr0; 472 u32 ccr0;
442 u32 ccr1; 473 u32 ccr1;
443 u32 dbcr0;
444 u32 dbcr1;
445 u32 dbsr; 474 u32 dbsr;
446 475
447 u64 mmcr[3]; 476 u64 mmcr[3];
@@ -471,9 +500,12 @@ struct kvm_vcpu_arch {
471 ulong fault_esr; 500 ulong fault_esr;
472 ulong queued_dear; 501 ulong queued_dear;
473 ulong queued_esr; 502 ulong queued_esr;
503 spinlock_t wdt_lock;
504 struct timer_list wdt_timer;
474 u32 tlbcfg[4]; 505 u32 tlbcfg[4];
475 u32 mmucfg; 506 u32 mmucfg;
476 u32 epr; 507 u32 epr;
508 struct kvmppc_booke_debug_reg dbg_reg;
477#endif 509#endif
478 gpa_t paddr_accessed; 510 gpa_t paddr_accessed;
479 gva_t vaddr_accessed; 511 gva_t vaddr_accessed;
@@ -486,6 +518,7 @@ struct kvm_vcpu_arch {
486 u8 osi_needed; 518 u8 osi_needed;
487 u8 osi_enabled; 519 u8 osi_enabled;
488 u8 papr_enabled; 520 u8 papr_enabled;
521 u8 watchdog_enabled;
489 u8 sane; 522 u8 sane;
490 u8 cpu_type; 523 u8 cpu_type;
491 u8 hcall_needed; 524 u8 hcall_needed;
@@ -497,7 +530,6 @@ struct kvm_vcpu_arch {
497 u64 dec_jiffies; 530 u64 dec_jiffies;
498 u64 dec_expires; 531 u64 dec_expires;
499 unsigned long pending_exceptions; 532 unsigned long pending_exceptions;
500 u16 last_cpu;
501 u8 ceded; 533 u8 ceded;
502 u8 prodded; 534 u8 prodded;
503 u32 last_inst; 535 u32 last_inst;
@@ -534,13 +566,17 @@ struct kvm_vcpu_arch {
534 unsigned long dtl_index; 566 unsigned long dtl_index;
535 u64 stolen_logged; 567 u64 stolen_logged;
536 struct kvmppc_vpa slb_shadow; 568 struct kvmppc_vpa slb_shadow;
569
570 spinlock_t tbacct_lock;
571 u64 busy_stolen;
572 u64 busy_preempt;
537#endif 573#endif
538}; 574};
539 575
540/* Values for vcpu->arch.state */ 576/* Values for vcpu->arch.state */
541#define KVMPPC_VCPU_STOPPED 0 577#define KVMPPC_VCPU_NOTREADY 0
542#define KVMPPC_VCPU_BUSY_IN_HOST 1 578#define KVMPPC_VCPU_RUNNABLE 1
543#define KVMPPC_VCPU_RUNNABLE 2 579#define KVMPPC_VCPU_BUSY_IN_HOST 2
544 580
545/* Values for vcpu->arch.io_gpr */ 581/* Values for vcpu->arch.io_gpr */
546#define KVM_MMIO_REG_MASK 0x001f 582#define KVM_MMIO_REG_MASK 0x001f
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 9365860fb7f6..2b119654b4c1 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -21,7 +21,6 @@
21 21
22#include <uapi/asm/kvm_para.h> 22#include <uapi/asm/kvm_para.h>
23 23
24
25#ifdef CONFIG_KVM_GUEST 24#ifdef CONFIG_KVM_GUEST
26 25
27#include <linux/of.h> 26#include <linux/of.h>
@@ -55,7 +54,7 @@ static unsigned long kvm_hypercall(unsigned long *in,
55 unsigned long *out, 54 unsigned long *out,
56 unsigned long nr) 55 unsigned long nr)
57{ 56{
58 return HC_EV_UNIMPLEMENTED; 57 return EV_UNIMPLEMENTED;
59} 58}
60 59
61#endif 60#endif
@@ -66,7 +65,7 @@ static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
66 unsigned long out[8]; 65 unsigned long out[8];
67 unsigned long r; 66 unsigned long r;
68 67
69 r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 68 r = kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
70 *r2 = out[0]; 69 *r2 = out[0];
71 70
72 return r; 71 return r;
@@ -77,7 +76,7 @@ static inline long kvm_hypercall0(unsigned int nr)
77 unsigned long in[8]; 76 unsigned long in[8];
78 unsigned long out[8]; 77 unsigned long out[8];
79 78
80 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 79 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
81} 80}
82 81
83static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) 82static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
@@ -86,7 +85,7 @@ static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
86 unsigned long out[8]; 85 unsigned long out[8];
87 86
88 in[0] = p1; 87 in[0] = p1;
89 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 88 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
90} 89}
91 90
92static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, 91static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
@@ -97,7 +96,7 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
97 96
98 in[0] = p1; 97 in[0] = p1;
99 in[1] = p2; 98 in[1] = p2;
100 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 99 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
101} 100}
102 101
103static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, 102static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
@@ -109,7 +108,7 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
109 in[0] = p1; 108 in[0] = p1;
110 in[1] = p2; 109 in[1] = p2;
111 in[2] = p3; 110 in[2] = p3;
112 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 111 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
113} 112}
114 113
115static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, 114static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
@@ -123,7 +122,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
123 in[1] = p2; 122 in[1] = p2;
124 in[2] = p3; 123 in[2] = p3;
125 in[3] = p4; 124 in[3] = p4;
126 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 125 return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
127} 126}
128 127
129 128
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e006f0bdea95..572aa7530619 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -28,6 +28,7 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/kvm_types.h> 29#include <linux/kvm_types.h>
30#include <linux/kvm_host.h> 30#include <linux/kvm_host.h>
31#include <linux/bug.h>
31#ifdef CONFIG_PPC_BOOK3S 32#ifdef CONFIG_PPC_BOOK3S
32#include <asm/kvm_book3s.h> 33#include <asm/kvm_book3s.h>
33#else 34#else
@@ -68,6 +69,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
68extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); 69extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
69extern void kvmppc_decrementer_func(unsigned long data); 70extern void kvmppc_decrementer_func(unsigned long data);
70extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); 71extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
72extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
73extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu);
71 74
72/* Core-specific hooks */ 75/* Core-specific hooks */
73 76
@@ -104,6 +107,7 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
104 struct kvm_interrupt *irq); 107 struct kvm_interrupt *irq);
105extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 108extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
106 struct kvm_interrupt *irq); 109 struct kvm_interrupt *irq);
110extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
107 111
108extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 112extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
109 unsigned int op, int *advance); 113 unsigned int op, int *advance);
@@ -111,6 +115,7 @@ extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn,
111 ulong val); 115 ulong val);
112extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, 116extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn,
113 ulong *val); 117 ulong *val);
118extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu);
114 119
115extern int kvmppc_booke_init(void); 120extern int kvmppc_booke_init(void);
116extern void kvmppc_booke_exit(void); 121extern void kvmppc_booke_exit(void);
@@ -139,16 +144,28 @@ extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
139extern void kvm_release_hpt(struct kvmppc_linear_info *li); 144extern void kvm_release_hpt(struct kvmppc_linear_info *li);
140extern int kvmppc_core_init_vm(struct kvm *kvm); 145extern int kvmppc_core_init_vm(struct kvm *kvm);
141extern void kvmppc_core_destroy_vm(struct kvm *kvm); 146extern void kvmppc_core_destroy_vm(struct kvm *kvm);
147extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
148 struct kvm_memory_slot *dont);
149extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
150 unsigned long npages);
142extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, 151extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
152 struct kvm_memory_slot *memslot,
143 struct kvm_userspace_memory_region *mem); 153 struct kvm_userspace_memory_region *mem);
144extern void kvmppc_core_commit_memory_region(struct kvm *kvm, 154extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
145 struct kvm_userspace_memory_region *mem); 155 struct kvm_userspace_memory_region *mem,
156 struct kvm_memory_slot old);
146extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, 157extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
147 struct kvm_ppc_smmu_info *info); 158 struct kvm_ppc_smmu_info *info);
159extern void kvmppc_core_flush_memslot(struct kvm *kvm,
160 struct kvm_memory_slot *memslot);
148 161
149extern int kvmppc_bookehv_init(void); 162extern int kvmppc_bookehv_init(void);
150extern void kvmppc_bookehv_exit(void); 163extern void kvmppc_bookehv_exit(void);
151 164
165extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
166
167extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
168
152/* 169/*
153 * Cuts out inst bits with ordering according to spec. 170 * Cuts out inst bits with ordering according to spec.
154 * That means the leftmost bit is zero. All given bits are included. 171 * That means the leftmost bit is zero. All given bits are included.
@@ -182,6 +199,41 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
182 return r; 199 return r;
183} 200}
184 201
202union kvmppc_one_reg {
203 u32 wval;
204 u64 dval;
205 vector128 vval;
206 u64 vsxval[2];
207 struct {
208 u64 addr;
209 u64 length;
210 } vpaval;
211};
212
213#define one_reg_size(id) \
214 (1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
215
216#define get_reg_val(id, reg) ({ \
217 union kvmppc_one_reg __u; \
218 switch (one_reg_size(id)) { \
219 case 4: __u.wval = (reg); break; \
220 case 8: __u.dval = (reg); break; \
221 default: BUG(); \
222 } \
223 __u; \
224})
225
226
227#define set_reg_val(id, val) ({ \
228 u64 __v; \
229 switch (one_reg_size(id)) { \
230 case 4: __v = (val).wval; break; \
231 case 8: __v = (val).dval; break; \
232 default: BUG(); \
233 } \
234 __v; \
235})
236
185void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 237void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
186int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 238int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
187 239
@@ -190,6 +242,8 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
190 242
191int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); 243int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
192int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); 244int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
245int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
246int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
193 247
194void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); 248void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
195 249
@@ -230,5 +284,36 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
230 } 284 }
231} 285}
232 286
287/* Please call after prepare_to_enter. This function puts the lazy ee state
288 back to normal mode, without actually enabling interrupts. */
289static inline void kvmppc_lazy_ee_enable(void)
290{
291#ifdef CONFIG_PPC64
292 /* Only need to enable IRQs by hard enabling them after this */
293 local_paca->irq_happened = 0;
294 local_paca->soft_enabled = 1;
295#endif
296}
297
298static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
299{
300 ulong ea;
301 ulong msr_64bit = 0;
302
303 ea = kvmppc_get_gpr(vcpu, rb);
304 if (ra)
305 ea += kvmppc_get_gpr(vcpu, ra);
306
307#if defined(CONFIG_PPC_BOOK3E_64)
308 msr_64bit = MSR_CM;
309#elif defined(CONFIG_PPC_BOOK3S_64)
310 msr_64bit = MSR_SF;
311#endif
312
313 if (!(vcpu->arch.shared->msr & msr_64bit))
314 ea = (uint32_t)ea;
315
316 return ea;
317}
233 318
234#endif /* __POWERPC_KVM_PPC_H__ */ 319#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index eeabcdbc30f7..99d43e0c1e4a 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -59,7 +59,7 @@
59#define MAS1_TSIZE_SHIFT 7 59#define MAS1_TSIZE_SHIFT 7
60#define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK) 60#define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK)
61 61
62#define MAS2_EPN 0xFFFFF000 62#define MAS2_EPN (~0xFFFUL)
63#define MAS2_X0 0x00000040 63#define MAS2_X0 0x00000040
64#define MAS2_X1 0x00000020 64#define MAS2_X1 0x00000020
65#define MAS2_W 0x00000010 65#define MAS2_W 0x00000010
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 9673f73eb8db..2fdb47a19efd 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -121,6 +121,16 @@ extern char initial_stab[];
121#define PP_RXRX 3 /* Supervisor read, User read */ 121#define PP_RXRX 3 /* Supervisor read, User read */
122#define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */ 122#define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */
123 123
124/* Fields for tlbiel instruction in architecture 2.06 */
125#define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */
126#define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */
127#define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */
128#define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */
129#define TLBIEL_INVAL_SET_MASK 0xfff000 /* set number to inval. */
130#define TLBIEL_INVAL_SET_SHIFT 12
131
132#define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */
133
124#ifndef __ASSEMBLY__ 134#ifndef __ASSEMBLY__
125 135
126struct hash_pte { 136struct hash_pte {
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d24c14163966..97d37278ea2d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -518,6 +518,7 @@
518#define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */ 518#define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */
519#define SRR1_WS_DEEP 0x00010000 /* All resources maintained */ 519#define SRR1_WS_DEEP 0x00010000 /* All resources maintained */
520#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ 520#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */
521#define SRR1_PROGILL 0x00080000 /* Illegal instruction */
521#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */ 522#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */
522#define SRR1_PROGTRAP 0x00020000 /* Trap */ 523#define SRR1_PROGTRAP 0x00020000 /* Trap */
523#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */ 524#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4982c5..e07e6af5e1ff 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@
539#define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ 539#define TCR_FIE 0x00800000 /* FIT Interrupt Enable */
540#define TCR_ARE 0x00400000 /* Auto Reload Enable */ 540#define TCR_ARE 0x00400000 /* Auto Reload Enable */
541 541
542#ifdef CONFIG_E500
543#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \
544 (((tcr) & 0x1E0000) >> 15))
545#else
546#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30)
547#endif
548
542/* Bit definitions for the TSR. */ 549/* Bit definitions for the TSR. */
543#define TSR_ENW 0x80000000 /* Enable Next Watchdog */ 550#define TSR_ENW 0x80000000 /* Enable Next Watchdog */
544#define TSR_WIS 0x40000000 /* WDT Interrupt Status */ 551#define TSR_WIS 0x40000000 /* WDT Interrupt Status */
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index e807e9d8e3f7..5a4e437c238d 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -67,6 +67,14 @@ void generic_mach_cpu_die(void);
67void generic_set_cpu_dead(unsigned int cpu); 67void generic_set_cpu_dead(unsigned int cpu);
68void generic_set_cpu_up(unsigned int cpu); 68void generic_set_cpu_up(unsigned int cpu);
69int generic_check_cpu_restart(unsigned int cpu); 69int generic_check_cpu_restart(unsigned int cpu);
70
71extern void inhibit_secondary_onlining(void);
72extern void uninhibit_secondary_onlining(void);
73
74#else /* HOTPLUG_CPU */
75static inline void inhibit_secondary_onlining(void) {}
76static inline void uninhibit_secondary_onlining(void) {}
77
70#endif 78#endif
71 79
72#ifdef CONFIG_PPC64 80#ifdef CONFIG_PPC64
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index a33c3c03bb2e..f7bca6370745 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -7,6 +7,7 @@ header-y += bootx.h
7header-y += byteorder.h 7header-y += byteorder.h
8header-y += cputable.h 8header-y += cputable.h
9header-y += elf.h 9header-y += elf.h
10header-y += epapr_hcalls.h
10header-y += errno.h 11header-y += errno.h
11header-y += fcntl.h 12header-y += fcntl.h
12header-y += ioctl.h 13header-y += ioctl.h
diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
new file mode 100644
index 000000000000..7f9c74b46704
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
@@ -0,0 +1,98 @@
1/*
2 * ePAPR hcall interface
3 *
4 * Copyright 2008-2011 Freescale Semiconductor, Inc.
5 *
6 * Author: Timur Tabi <timur@freescale.com>
7 *
8 * This file is provided under a dual BSD/GPL license. When using or
9 * redistributing this file, you may do so under either license.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions are met:
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * * Neither the name of Freescale Semiconductor nor the
19 * names of its contributors may be used to endorse or promote products
20 * derived from this software without specific prior written permission.
21 *
22 *
23 * ALTERNATIVELY, this software may be distributed under the terms of the
24 * GNU General Public License ("GPL") as published by the Free Software
25 * Foundation, either version 2 of that License or (at your option) any
26 * later version.
27 *
28 * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
29 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
30 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
32 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
33 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#ifndef _UAPI_ASM_POWERPC_EPAPR_HCALLS_H
41#define _UAPI_ASM_POWERPC_EPAPR_HCALLS_H
42
43#define EV_BYTE_CHANNEL_SEND 1
44#define EV_BYTE_CHANNEL_RECEIVE 2
45#define EV_BYTE_CHANNEL_POLL 3
46#define EV_INT_SET_CONFIG 4
47#define EV_INT_GET_CONFIG 5
48#define EV_INT_SET_MASK 6
49#define EV_INT_GET_MASK 7
50#define EV_INT_IACK 9
51#define EV_INT_EOI 10
52#define EV_INT_SEND_IPI 11
53#define EV_INT_SET_TASK_PRIORITY 12
54#define EV_INT_GET_TASK_PRIORITY 13
55#define EV_DOORBELL_SEND 14
56#define EV_MSGSND 15
57#define EV_IDLE 16
58
59/* vendor ID: epapr */
60#define EV_LOCAL_VENDOR_ID 0 /* for private use */
61#define EV_EPAPR_VENDOR_ID 1
62#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */
63#define EV_IBM_VENDOR_ID 3 /* IBM */
64#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */
65#define EV_ENEA_VENDOR_ID 5 /* Enea */
66#define EV_WR_VENDOR_ID 6 /* Wind River Systems */
67#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */
68#define EV_KVM_VENDOR_ID 42 /* KVM */
69
70/* The max number of bytes that a byte channel can send or receive per call */
71#define EV_BYTE_CHANNEL_MAX_BYTES 16
72
73
74#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
75#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
76
77/* epapr return codes */
78#define EV_SUCCESS 0
79#define EV_EPERM 1 /* Operation not permitted */
80#define EV_ENOENT 2 /* Entry Not Found */
81#define EV_EIO 3 /* I/O error occured */
82#define EV_EAGAIN 4 /* The operation had insufficient
83 * resources to complete and should be
84 * retried
85 */
86#define EV_ENOMEM 5 /* There was insufficient memory to
87 * complete the operation */
88#define EV_EFAULT 6 /* Bad guest address */
89#define EV_ENODEV 7 /* No such device */
90#define EV_EINVAL 8 /* An argument supplied to the hcall
91 was out of range or invalid */
92#define EV_INTERNAL 9 /* An internal error occurred */
93#define EV_CONFIG 10 /* A configuration error was detected */
94#define EV_INVALID_STATE 11 /* The object is in an invalid state */
95#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
96#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */
97
98#endif /* _UAPI_ASM_POWERPC_EPAPR_HCALLS_H */
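
Not part of the patch: a minimal C sketch to make the token layout defined above concrete. The vendor ID occupies the upper 16 bits and the hcall number the lower 16; the EV_IDLE token computed here is the same value that epapr_ev_idle loads into r11 later in this series. The macro and constant values are copied from the header above; everything else is illustrative.

/* Sketch only: how _EV_HCALL_TOKEN() composes an ePAPR hcall token. */
#include <stdio.h>

#define _EV_HCALL_TOKEN(id, num)	(((id) << 16) | (num))
#define EV_EPAPR_VENDOR_ID		1
#define EV_IDLE				16

int main(void)
{
	/* (1 << 16) | 16 == 0x10010 */
	printf("EV_HCALL_TOKEN(EV_IDLE) = 0x%x\n",
	       (unsigned)_EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, EV_IDLE));
	return 0;
}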
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 1bea4d8ea6f4..2fba8a66fb10 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -221,6 +221,12 @@ struct kvm_sregs {
221 221
222 __u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */ 222 __u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */
223 __u32 dbcr[3]; 223 __u32 dbcr[3];
224 /*
225 * iac/dac registers are 64bit wide, while this API
226 * interface provides only lower 32 bits on 64 bit
227 * processors. ONE_REG interface is added for 64bit
228 * iac/dac registers.
229 */
224 __u32 iac[4]; 230 __u32 iac[4];
225 __u32 dac[2]; 231 __u32 dac[2];
226 __u32 dvc[2]; 232 __u32 dvc[2];
@@ -325,6 +331,86 @@ struct kvm_book3e_206_tlb_params {
325 __u32 reserved[8]; 331 __u32 reserved[8];
326}; 332};
327 333
334/* For KVM_PPC_GET_HTAB_FD */
335struct kvm_get_htab_fd {
336 __u64 flags;
337 __u64 start_index;
338 __u64 reserved[2];
339};
340
341/* Values for kvm_get_htab_fd.flags */
342#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1)
343#define KVM_GET_HTAB_WRITE ((__u64)0x2)
344
345/*
346 * Data read on the file descriptor is formatted as a series of
347 * records, each consisting of a header followed by a series of
348 * `n_valid' HPTEs (16 bytes each), which are all valid. Following
349 * those valid HPTEs there are `n_invalid' invalid HPTEs, which
350 * are not represented explicitly in the stream. The same format
351 * is used for writing.
352 */
353struct kvm_get_htab_header {
354 __u32 index;
355 __u16 n_valid;
356 __u16 n_invalid;
357};
358
328#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) 359#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
360#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
361#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
362#define KVM_REG_PPC_IAC3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x4)
363#define KVM_REG_PPC_IAC4 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5)
364#define KVM_REG_PPC_DAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6)
365#define KVM_REG_PPC_DAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7)
366#define KVM_REG_PPC_DABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8)
367#define KVM_REG_PPC_DSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9)
368#define KVM_REG_PPC_PURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa)
369#define KVM_REG_PPC_SPURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb)
370#define KVM_REG_PPC_DAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc)
371#define KVM_REG_PPC_DSISR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd)
372#define KVM_REG_PPC_AMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe)
373#define KVM_REG_PPC_UAMOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf)
374
375#define KVM_REG_PPC_MMCR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
376#define KVM_REG_PPC_MMCR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
377#define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
378
379#define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18)
380#define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19)
381#define KVM_REG_PPC_PMC3 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a)
382#define KVM_REG_PPC_PMC4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b)
383#define KVM_REG_PPC_PMC5 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c)
384#define KVM_REG_PPC_PMC6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d)
385#define KVM_REG_PPC_PMC7 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e)
386#define KVM_REG_PPC_PMC8 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f)
387
388/* 32 floating-point registers */
389#define KVM_REG_PPC_FPR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x20)
390#define KVM_REG_PPC_FPR(n) (KVM_REG_PPC_FPR0 + (n))
391#define KVM_REG_PPC_FPR31 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3f)
392
393/* 32 VMX/Altivec vector registers */
394#define KVM_REG_PPC_VR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x40)
395#define KVM_REG_PPC_VR(n) (KVM_REG_PPC_VR0 + (n))
396#define KVM_REG_PPC_VR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x5f)
397
398/* 32 double-width FP registers for VSX */
399/* High-order halves overlap with FP regs */
400#define KVM_REG_PPC_VSR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x60)
401#define KVM_REG_PPC_VSR(n) (KVM_REG_PPC_VSR0 + (n))
402#define KVM_REG_PPC_VSR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x7f)
403
404/* FP and vector status/control registers */
405#define KVM_REG_PPC_FPSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80)
406#define KVM_REG_PPC_VSCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81)
407
408/* Virtual processor areas */
409/* For SLB & DTL, address in high (first) half, length in low half */
410#define KVM_REG_PPC_VPA_ADDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x82)
411#define KVM_REG_PPC_VPA_SLB (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x83)
412#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
413
414#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
329 415
330#endif /* __LINUX_KVM_POWERPC_H */ 416#endif /* __LINUX_KVM_POWERPC_H */
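
Not part of the patch: a minimal userspace sketch of how the record stream documented in the comment above could be walked. It assumes only the layout stated there (a kvm_get_htab_header followed by n_valid HPTEs of 16 bytes each; the n_invalid entries are implicit and not present in the stream). The struct is redeclared locally for self-containment, and the function and variable names are made up for illustration.

/* Sketch: walk one buffer returned by read() on a KVM_PPC_GET_HTAB_FD fd. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct kvm_get_htab_header {
	uint32_t index;
	uint16_t n_valid;
	uint16_t n_invalid;
};

static void walk_htab_buf(const uint8_t *buf, size_t len)
{
	size_t off = 0;

	while (off + sizeof(struct kvm_get_htab_header) <= len) {
		struct kvm_get_htab_header hdr;

		memcpy(&hdr, buf + off, sizeof(hdr));
		off += sizeof(hdr);
		if (off + (size_t)hdr.n_valid * 16 > len)
			break;				/* truncated record: stop */
		printf("index %u: %u valid, %u invalid\n",
		       hdr.index, hdr.n_valid, hdr.n_invalid);
		off += (size_t)hdr.n_valid * 16;	/* skip the 16-byte HPTEs */
	}
}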
diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h
index 5e04383a1db5..ed0e0254b47f 100644
--- a/arch/powerpc/include/uapi/asm/kvm_para.h
+++ b/arch/powerpc/include/uapi/asm/kvm_para.h
@@ -75,9 +75,10 @@ struct kvm_vcpu_arch_shared {
75}; 75};
76 76
77#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ 77#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */
78#define HC_VENDOR_KVM (42 << 16) 78
79#define HC_EV_SUCCESS 0 79#define KVM_HCALL_TOKEN(num) _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num)
80#define HC_EV_UNIMPLEMENTED 12 80
81#include <uapi/asm/epapr_hcalls.h>
81 82
82#define KVM_FEATURE_MAGIC_PAGE 1 83#define KVM_FEATURE_MAGIC_PAGE 1
83 84
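
Not part of the patch: the replacement macros are bit-for-bit compatible with the ad-hoc constants they remove (the KVM vendor ID is 42, so KVM_HCALL_TOKEN(n) equals the old HC_VENDOR_KVM | n, and HC_EV_SUCCESS/HC_EV_UNIMPLEMENTED map to EV_SUCCESS/EV_UNIMPLEMENTED). That is why the call-site change in arch/powerpc/kernel/kvm.c below is purely mechanical. A compile-time check using only values visible in this diff:

/* Sketch: new token encoding vs. the old vendor prefix. */
#define _EV_HCALL_TOKEN(id, num)	(((id) << 16) | (num))
#define EV_KVM_VENDOR_ID		42
#define KVM_HCALL_TOKEN(num)		_EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num)
#define HC_VENDOR_KVM			(42 << 16)

_Static_assert(KVM_HCALL_TOKEN(7) == (HC_VENDOR_KVM | 7),
	       "new token encoding matches the old vendor prefix");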
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7523539cfe9f..4e23ba2f3ca7 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -441,8 +441,7 @@ int main(void)
441 DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); 441 DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
442 DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); 442 DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
443 DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); 443 DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
444 DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter)); 444 DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
445 DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
446 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); 445 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
447 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); 446 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
448 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); 447 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
@@ -470,7 +469,6 @@ int main(void)
470 DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); 469 DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
471 DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); 470 DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
472 DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); 471 DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
473 DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
474 DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); 472 DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
475 DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); 473 DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
476 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); 474 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
index 697b390ebfd8..62c0dc237826 100644
--- a/arch/powerpc/kernel/epapr_hcalls.S
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -8,13 +8,41 @@
8 */ 8 */
9 9
10#include <linux/threads.h> 10#include <linux/threads.h>
11#include <asm/epapr_hcalls.h>
11#include <asm/reg.h> 12#include <asm/reg.h>
12#include <asm/page.h> 13#include <asm/page.h>
13#include <asm/cputable.h> 14#include <asm/cputable.h>
14#include <asm/thread_info.h> 15#include <asm/thread_info.h>
15#include <asm/ppc_asm.h> 16#include <asm/ppc_asm.h>
17#include <asm/asm-compat.h>
16#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
17 19
20/* epapr_ev_idle() was derived from e500_idle() */
21_GLOBAL(epapr_ev_idle)
22 CURRENT_THREAD_INFO(r3, r1)
23 PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */
24 ori r4, r4,_TLF_NAPPING /* so when we take an exception */
25 PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */
26
27 wrteei 1
28
29idle_loop:
30 LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
31
32.global epapr_ev_idle_start
33epapr_ev_idle_start:
34 li r3, -1
35 nop
36 nop
37 nop
38
39 /*
40 * Guard against spurious wakeups from a hypervisor --
41 * only an interrupt will cause us to return to the LR due to
42 * _TLF_NAPPING.
43 */
44 b idle_loop
45
18/* Hypercall entry point. Will be patched with device tree instructions. */ 46/* Hypercall entry point. Will be patched with device tree instructions. */
19.global epapr_hypercall_start 47.global epapr_hypercall_start
20epapr_hypercall_start: 48epapr_hypercall_start:
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
index 028aeae370b6..f3eab8594d9f 100644
--- a/arch/powerpc/kernel/epapr_paravirt.c
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -21,6 +21,10 @@
21#include <asm/epapr_hcalls.h> 21#include <asm/epapr_hcalls.h>
22#include <asm/cacheflush.h> 22#include <asm/cacheflush.h>
23#include <asm/code-patching.h> 23#include <asm/code-patching.h>
24#include <asm/machdep.h>
25
26extern void epapr_ev_idle(void);
27extern u32 epapr_ev_idle_start[];
24 28
25bool epapr_paravirt_enabled; 29bool epapr_paravirt_enabled;
26 30
@@ -41,8 +45,13 @@ static int __init epapr_paravirt_init(void)
41 if (len % 4 || len > (4 * 4)) 45 if (len % 4 || len > (4 * 4))
42 return -ENODEV; 46 return -ENODEV;
43 47
44 for (i = 0; i < (len / 4); i++) 48 for (i = 0; i < (len / 4); i++) {
45 patch_instruction(epapr_hypercall_start + i, insts[i]); 49 patch_instruction(epapr_hypercall_start + i, insts[i]);
50 patch_instruction(epapr_ev_idle_start + i, insts[i]);
51 }
52
53 if (of_get_property(hyper_node, "has-idle", NULL))
54 ppc_md.power_save = epapr_ev_idle;
46 55
47 epapr_paravirt_enabled = true; 56 epapr_paravirt_enabled = true;
48 57
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 867db1de8949..a61b133c4f99 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -419,7 +419,7 @@ static void kvm_map_magic_page(void *data)
419 in[0] = KVM_MAGIC_PAGE; 419 in[0] = KVM_MAGIC_PAGE;
420 in[1] = KVM_MAGIC_PAGE; 420 in[1] = KVM_MAGIC_PAGE;
421 421
422 kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE); 422 kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
423 423
424 *features = out[0]; 424 *features = out[0];
425} 425}
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 19e4288d8486..78b8766fd79e 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -43,6 +43,7 @@
43#include <asm/dcr.h> 43#include <asm/dcr.h>
44#include <asm/ftrace.h> 44#include <asm/ftrace.h>
45#include <asm/switch_to.h> 45#include <asm/switch_to.h>
46#include <asm/epapr_hcalls.h>
46 47
47#ifdef CONFIG_PPC32 48#ifdef CONFIG_PPC32
48extern void transfer_to_handler(void); 49extern void transfer_to_handler(void);
@@ -191,3 +192,7 @@ EXPORT_SYMBOL(__arch_hweight64);
191#ifdef CONFIG_PPC_BOOK3S_64 192#ifdef CONFIG_PPC_BOOK3S_64
192EXPORT_SYMBOL_GPL(mmu_psize_defs); 193EXPORT_SYMBOL_GPL(mmu_psize_defs);
193#endif 194#endif
195
196#ifdef CONFIG_EPAPR_PARAVIRT
197EXPORT_SYMBOL(epapr_hypercall_start);
198#endif
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2b952b5386fd..e5b133ebd8a5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -427,6 +427,45 @@ int generic_check_cpu_restart(unsigned int cpu)
427{ 427{
428 return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; 428 return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
429} 429}
430
431static atomic_t secondary_inhibit_count;
432
433/*
434 * Don't allow secondary CPU threads to come online
435 */
436void inhibit_secondary_onlining(void)
437{
438 /*
439 * This makes secondary_inhibit_count stable during cpu
440 * online/offline operations.
441 */
442 get_online_cpus();
443
444 atomic_inc(&secondary_inhibit_count);
445 put_online_cpus();
446}
447EXPORT_SYMBOL_GPL(inhibit_secondary_onlining);
448
449/*
450 * Allow secondary CPU threads to come online again
451 */
452void uninhibit_secondary_onlining(void)
453{
454 get_online_cpus();
455 atomic_dec(&secondary_inhibit_count);
456 put_online_cpus();
457}
458EXPORT_SYMBOL_GPL(uninhibit_secondary_onlining);
459
460static int secondaries_inhibited(void)
461{
462 return atomic_read(&secondary_inhibit_count);
463}
464
465#else /* HOTPLUG_CPU */
466
467#define secondaries_inhibited() 0
468
430#endif 469#endif
431 470
432static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) 471static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
@@ -445,6 +484,13 @@ int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
445{ 484{
446 int rc, c; 485 int rc, c;
447 486
487 /*
488 * Don't allow secondary threads to come online if inhibited
489 */
490 if (threads_per_core > 1 && secondaries_inhibited() &&
491 cpu % threads_per_core != 0)
492 return -EBUSY;
493
448 if (smp_ops == NULL || 494 if (smp_ops == NULL ||
449 (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu))) 495 (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu)))
450 return -EINVAL; 496 return -EINVAL;
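
Not part of the patch: a minimal sketch of the usage pattern the new helpers expect (in this series, KVM on POWER7 is the intended user, keeping secondary SMT threads offline while it has control of them). While inhibited, __cpu_up() above refuses to online any CPU that is not the first thread of its core and returns -EBUSY. guest_work() is a hypothetical placeholder, not a real kernel function.

/* Sketch only: bracket work that needs secondary threads kept offline. */
void inhibit_secondary_onlining(void);
void uninhibit_secondary_onlining(void);

static int guest_work(void) { return 0; }	/* hypothetical placeholder */

static int run_with_secondaries_held(void)
{
	int err;

	inhibit_secondary_onlining();	/* non-primary threads now get -EBUSY */
	err = guest_work();
	uninhibit_secondary_onlining();	/* allow onlining again */
	return err;
}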
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 50e7dbc7356c..3d7fd21c65f9 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -83,6 +83,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
83 vcpu_44x->shadow_refs[i].gtlb_index = -1; 83 vcpu_44x->shadow_refs[i].gtlb_index = -1;
84 84
85 vcpu->arch.cpu_type = KVM_CPU_440; 85 vcpu->arch.cpu_type = KVM_CPU_440;
86 vcpu->arch.pvr = mfspr(SPRN_PVR);
86 87
87 return 0; 88 return 0;
88} 89}
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index c8c61578fdfc..35ec0a8547da 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -27,12 +27,70 @@
27#include "booke.h" 27#include "booke.h"
28#include "44x_tlb.h" 28#include "44x_tlb.h"
29 29
30#define XOP_MFDCRX 259
30#define XOP_MFDCR 323 31#define XOP_MFDCR 323
32#define XOP_MTDCRX 387
31#define XOP_MTDCR 451 33#define XOP_MTDCR 451
32#define XOP_TLBSX 914 34#define XOP_TLBSX 914
33#define XOP_ICCCI 966 35#define XOP_ICCCI 966
34#define XOP_TLBWE 978 36#define XOP_TLBWE 978
35 37
38static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn)
39{
40 /* emulate some DCR accesses in the kernel */
41 switch (dcrn) {
42 case DCRN_CPR0_CONFIG_ADDR:
43 vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
44 return EMULATE_DONE;
45 default:
46 vcpu->run->dcr.dcrn = dcrn;
47 vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs);
48 vcpu->run->dcr.is_write = 1;
49 vcpu->arch.dcr_is_write = 1;
50 vcpu->arch.dcr_needed = 1;
51 kvmppc_account_exit(vcpu, DCR_EXITS);
52 return EMULATE_DO_DCR;
53 }
54}
55
56static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn)
57{
58 /* The guest may access CPR0 registers to determine the timebase
59 * frequency, and it must know the real host frequency because it
60 * can directly access the timebase registers.
61 *
62 * It would be possible to emulate those accesses in userspace,
63 * but userspace can really only figure out the end frequency.
64 * We could decompose that into the factors that compute it, but
65 * that's tricky math, and it's easier to just report the real
66 * CPR0 values.
67 */
68 switch (dcrn) {
69 case DCRN_CPR0_CONFIG_ADDR:
70 kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
71 break;
72 case DCRN_CPR0_CONFIG_DATA:
73 local_irq_disable();
74 mtdcr(DCRN_CPR0_CONFIG_ADDR,
75 vcpu->arch.cpr0_cfgaddr);
76 kvmppc_set_gpr(vcpu, rt,
77 mfdcr(DCRN_CPR0_CONFIG_DATA));
78 local_irq_enable();
79 break;
80 default:
81 vcpu->run->dcr.dcrn = dcrn;
82 vcpu->run->dcr.data = 0;
83 vcpu->run->dcr.is_write = 0;
84 vcpu->arch.dcr_is_write = 0;
85 vcpu->arch.io_gpr = rt;
86 vcpu->arch.dcr_needed = 1;
87 kvmppc_account_exit(vcpu, DCR_EXITS);
88 return EMULATE_DO_DCR;
89 }
90
91 return EMULATE_DONE;
92}
93
36int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 94int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
37 unsigned int inst, int *advance) 95 unsigned int inst, int *advance)
38{ 96{
@@ -50,55 +108,21 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
50 switch (get_xop(inst)) { 108 switch (get_xop(inst)) {
51 109
52 case XOP_MFDCR: 110 case XOP_MFDCR:
53 /* The guest may access CPR0 registers to determine the timebase 111 emulated = emulate_mfdcr(vcpu, rt, dcrn);
54 * frequency, and it must know the real host frequency because it 112 break;
55 * can directly access the timebase registers.
56 *
57 * It would be possible to emulate those accesses in userspace,
58 * but userspace can really only figure out the end frequency.
59 * We could decompose that into the factors that compute it, but
60 * that's tricky math, and it's easier to just report the real
61 * CPR0 values.
62 */
63 switch (dcrn) {
64 case DCRN_CPR0_CONFIG_ADDR:
65 kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
66 break;
67 case DCRN_CPR0_CONFIG_DATA:
68 local_irq_disable();
69 mtdcr(DCRN_CPR0_CONFIG_ADDR,
70 vcpu->arch.cpr0_cfgaddr);
71 kvmppc_set_gpr(vcpu, rt,
72 mfdcr(DCRN_CPR0_CONFIG_DATA));
73 local_irq_enable();
74 break;
75 default:
76 run->dcr.dcrn = dcrn;
77 run->dcr.data = 0;
78 run->dcr.is_write = 0;
79 vcpu->arch.io_gpr = rt;
80 vcpu->arch.dcr_needed = 1;
81 kvmppc_account_exit(vcpu, DCR_EXITS);
82 emulated = EMULATE_DO_DCR;
83 }
84 113
114 case XOP_MFDCRX:
115 emulated = emulate_mfdcr(vcpu, rt,
116 kvmppc_get_gpr(vcpu, ra));
85 break; 117 break;
86 118
87 case XOP_MTDCR: 119 case XOP_MTDCR:
88 /* emulate some access in kernel */ 120 emulated = emulate_mtdcr(vcpu, rs, dcrn);
89 switch (dcrn) { 121 break;
90 case DCRN_CPR0_CONFIG_ADDR:
91 vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
92 break;
93 default:
94 run->dcr.dcrn = dcrn;
95 run->dcr.data = kvmppc_get_gpr(vcpu, rs);
96 run->dcr.is_write = 1;
97 vcpu->arch.dcr_needed = 1;
98 kvmppc_account_exit(vcpu, DCR_EXITS);
99 emulated = EMULATE_DO_DCR;
100 }
101 122
123 case XOP_MTDCRX:
124 emulated = emulate_mtdcr(vcpu, rs,
125 kvmppc_get_gpr(vcpu, ra));
102 break; 126 break;
103 127
104 case XOP_TLBWE: 128 case XOP_TLBWE:
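
Not part of the patch: when emulate_mfdcr()/emulate_mtdcr() above return EMULATE_DO_DCR, the exit reaches userspace as KVM_EXIT_DCR with the run->dcr fields filled in as shown. A hedged sketch of the userspace side; for reads, userspace is expected to fill in run->dcr.data so the kernel can complete the load into the guest GPR it recorded in io_gpr on the next KVM_RUN. The device_dcr_* helpers are made-up stand-ins for the machine model.

/* Sketch: service a DCR exit; 'run' is the mmap'ed struct kvm_run. */
#include <stdint.h>
#include <linux/kvm.h>

static uint32_t device_dcr_read(uint32_t dcrn) { (void)dcrn; return 0; }	/* placeholder */
static void device_dcr_write(uint32_t dcrn, uint32_t val) { (void)dcrn; (void)val; }

static void handle_dcr_exit(struct kvm_run *run)
{
	if (run->dcr.is_write)
		device_dcr_write(run->dcr.dcrn, run->dcr.data);
	else
		run->dcr.data = device_dcr_read(run->dcr.dcrn);
}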
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index f4dacb9c57fa..4730c953f435 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
20 bool 20 bool
21 select PREEMPT_NOTIFIERS 21 select PREEMPT_NOTIFIERS
22 select ANON_INODES 22 select ANON_INODES
23 select HAVE_KVM_EVENTFD
23 24
24config KVM_BOOK3S_HANDLER 25config KVM_BOOK3S_HANDLER
25 bool 26 bool
@@ -36,6 +37,7 @@ config KVM_BOOK3S_64_HANDLER
36config KVM_BOOK3S_PR 37config KVM_BOOK3S_PR
37 bool 38 bool
38 select KVM_MMIO 39 select KVM_MMIO
40 select MMU_NOTIFIER
39 41
40config KVM_BOOK3S_32 42config KVM_BOOK3S_32
41 tristate "KVM support for PowerPC book3s_32 processors" 43 tristate "KVM support for PowerPC book3s_32 processors"
@@ -123,6 +125,7 @@ config KVM_E500V2
123 depends on EXPERIMENTAL && E500 && !PPC_E500MC 125 depends on EXPERIMENTAL && E500 && !PPC_E500MC
124 select KVM 126 select KVM
125 select KVM_MMIO 127 select KVM_MMIO
128 select MMU_NOTIFIER
126 ---help--- 129 ---help---
127 Support running unmodified E500 guest kernels in virtual machines on 130 Support running unmodified E500 guest kernels in virtual machines on
128 E500v2 host processors. 131 E500v2 host processors.
@@ -138,6 +141,7 @@ config KVM_E500MC
138 select KVM 141 select KVM
139 select KVM_MMIO 142 select KVM_MMIO
140 select KVM_BOOKE_HV 143 select KVM_BOOKE_HV
144 select MMU_NOTIFIER
141 ---help--- 145 ---help---
142 Support running unmodified E500MC/E5500 (32-bit) guest kernels in 146 Support running unmodified E500MC/E5500 (32-bit) guest kernels in
143 virtual machines on E500MC/E5500 host processors. 147 virtual machines on E500MC/E5500 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index c2a08636e6d4..1e473d46322c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -6,7 +6,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
6 6
7ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm 7ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
8 8
9common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) 9common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
10 eventfd.o)
10 11
11CFLAGS_44x_tlb.o := -I. 12CFLAGS_44x_tlb.o := -I.
12CFLAGS_e500_tlb.o := -I. 13CFLAGS_e500_tlb.o := -I.
@@ -72,10 +73,12 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
72 book3s_hv_rmhandlers.o \ 73 book3s_hv_rmhandlers.o \
73 book3s_hv_rm_mmu.o \ 74 book3s_hv_rm_mmu.o \
74 book3s_64_vio_hv.o \ 75 book3s_64_vio_hv.o \
76 book3s_hv_ras.o \
75 book3s_hv_builtin.o 77 book3s_hv_builtin.o
76 78
77kvm-book3s_64-module-objs := \ 79kvm-book3s_64-module-objs := \
78 ../../../virt/kvm/kvm_main.o \ 80 ../../../virt/kvm/kvm_main.o \
81 ../../../virt/kvm/eventfd.o \
79 powerpc.o \ 82 powerpc.o \
80 emulate.o \ 83 emulate.o \
81 book3s.o \ 84 book3s.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 3f2a8360c857..a4b645285240 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
411 return 0; 411 return 0;
412} 412}
413 413
414int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
415{
416 return 0;
417}
418
419void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
420{
421}
422
414int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 423int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
415{ 424{
416 int i; 425 int i;
@@ -476,6 +485,122 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
476 return -ENOTSUPP; 485 return -ENOTSUPP;
477} 486}
478 487
488int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
489{
490 int r;
491 union kvmppc_one_reg val;
492 int size;
493 long int i;
494
495 size = one_reg_size(reg->id);
496 if (size > sizeof(val))
497 return -EINVAL;
498
499 r = kvmppc_get_one_reg(vcpu, reg->id, &val);
500
501 if (r == -EINVAL) {
502 r = 0;
503 switch (reg->id) {
504 case KVM_REG_PPC_DAR:
505 val = get_reg_val(reg->id, vcpu->arch.shared->dar);
506 break;
507 case KVM_REG_PPC_DSISR:
508 val = get_reg_val(reg->id, vcpu->arch.shared->dsisr);
509 break;
510 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
511 i = reg->id - KVM_REG_PPC_FPR0;
512 val = get_reg_val(reg->id, vcpu->arch.fpr[i]);
513 break;
514 case KVM_REG_PPC_FPSCR:
515 val = get_reg_val(reg->id, vcpu->arch.fpscr);
516 break;
517#ifdef CONFIG_ALTIVEC
518 case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
519 if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
520 r = -ENXIO;
521 break;
522 }
523 val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0];
524 break;
525 case KVM_REG_PPC_VSCR:
526 if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
527 r = -ENXIO;
528 break;
529 }
530 val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
531 break;
532#endif /* CONFIG_ALTIVEC */
533 default:
534 r = -EINVAL;
535 break;
536 }
537 }
538 if (r)
539 return r;
540
541 if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
542 r = -EFAULT;
543
544 return r;
545}
546
547int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
548{
549 int r;
550 union kvmppc_one_reg val;
551 int size;
552 long int i;
553
554 size = one_reg_size(reg->id);
555 if (size > sizeof(val))
556 return -EINVAL;
557
558 if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
559 return -EFAULT;
560
561 r = kvmppc_set_one_reg(vcpu, reg->id, &val);
562
563 if (r == -EINVAL) {
564 r = 0;
565 switch (reg->id) {
566 case KVM_REG_PPC_DAR:
567 vcpu->arch.shared->dar = set_reg_val(reg->id, val);
568 break;
569 case KVM_REG_PPC_DSISR:
570 vcpu->arch.shared->dsisr = set_reg_val(reg->id, val);
571 break;
572 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
573 i = reg->id - KVM_REG_PPC_FPR0;
574 vcpu->arch.fpr[i] = set_reg_val(reg->id, val);
575 break;
576 case KVM_REG_PPC_FPSCR:
577 vcpu->arch.fpscr = set_reg_val(reg->id, val);
578 break;
579#ifdef CONFIG_ALTIVEC
580 case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
581 if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
582 r = -ENXIO;
583 break;
584 }
585 vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
586 break;
587 case KVM_REG_PPC_VSCR:
588 if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
589 r = -ENXIO;
590 break;
591 }
592 vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
593 break;
594#endif /* CONFIG_ALTIVEC */
595 default:
596 r = -EINVAL;
597 break;
598 }
599 }
600
601 return r;
602}
603
479int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 604int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
480 struct kvm_translation *tr) 605 struct kvm_translation *tr)
481{ 606{
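
Not part of the patch: the get/set handlers added above are reached through the generic KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. A minimal userspace sketch, assuming a linux/kvm.h that already carries the KVM_REG_PPC_FPR() IDs added earlier in this series; vcpu_fd is an open KVM vcpu file descriptor (setup not shown).

/* Sketch: write guest FPR3 via the ONE_REG interface serviced above. */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_fpr3(int vcpu_fd, uint64_t value)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_FPR(3),	/* 64-bit FP register 3 */
		.addr = (uintptr_t)&value,	/* kernel copies the value from here */
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}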
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index b0f625a33345..00e619bf608e 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -155,7 +155,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
155 155
156 /* Get host physical address for gpa */ 156 /* Get host physical address for gpa */
157 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); 157 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
158 if (is_error_pfn(hpaddr)) { 158 if (is_error_noslot_pfn(hpaddr)) {
159 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", 159 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
160 orig_pte->eaddr); 160 orig_pte->eaddr);
161 r = -EINVAL; 161 r = -EINVAL;
@@ -254,6 +254,7 @@ next_pteg:
254 254
255 kvmppc_mmu_hpte_cache_map(vcpu, pte); 255 kvmppc_mmu_hpte_cache_map(vcpu, pte);
256 256
257 kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
257out: 258out:
258 return r; 259 return r;
259} 260}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 4d72f9ebc554..ead58e317294 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -93,7 +93,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
93 93
94 /* Get host physical address for gpa */ 94 /* Get host physical address for gpa */
95 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); 95 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
96 if (is_error_pfn(hpaddr)) { 96 if (is_error_noslot_pfn(hpaddr)) {
97 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); 97 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
98 r = -EINVAL; 98 r = -EINVAL;
99 goto out; 99 goto out;
@@ -171,6 +171,7 @@ map_again:
171 171
172 kvmppc_mmu_hpte_cache_map(vcpu, pte); 172 kvmppc_mmu_hpte_cache_map(vcpu, pte);
173 } 173 }
174 kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
174 175
175out: 176out:
176 return r; 177 return r;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d95d11322a15..8cc18abd6dde 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -24,6 +24,9 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
26#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
27#include <linux/srcu.h>
28#include <linux/anon_inodes.h>
29#include <linux/file.h>
27 30
28#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
29#include <asm/kvm_ppc.h> 32#include <asm/kvm_ppc.h>
@@ -40,6 +43,11 @@
40/* Power architecture requires HPT is at least 256kB */ 43/* Power architecture requires HPT is at least 256kB */
41#define PPC_MIN_HPT_ORDER 18 44#define PPC_MIN_HPT_ORDER 18
42 45
46static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
47 long pte_index, unsigned long pteh,
48 unsigned long ptel, unsigned long *pte_idx_ret);
49static void kvmppc_rmap_reset(struct kvm *kvm);
50
43long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) 51long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
44{ 52{
45 unsigned long hpt; 53 unsigned long hpt;
@@ -137,10 +145,11 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
137 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 145 /* Set the entire HPT to 0, i.e. invalid HPTEs */
138 memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); 146 memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
139 /* 147 /*
140 * Set the whole last_vcpu array to an invalid vcpu number. 148 * Reset all the reverse-mapping chains for all memslots
141 * This ensures that each vcpu will flush its TLB on next entry.
142 */ 149 */
143 memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); 150 kvmppc_rmap_reset(kvm);
151 /* Ensure that each vcpu will flush its TLB on next entry. */
152 cpumask_setall(&kvm->arch.need_tlb_flush);
144 *htab_orderp = order; 153 *htab_orderp = order;
145 err = 0; 154 err = 0;
146 } else { 155 } else {
@@ -184,6 +193,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
184 unsigned long addr, hash; 193 unsigned long addr, hash;
185 unsigned long psize; 194 unsigned long psize;
186 unsigned long hp0, hp1; 195 unsigned long hp0, hp1;
196 unsigned long idx_ret;
187 long ret; 197 long ret;
188 struct kvm *kvm = vcpu->kvm; 198 struct kvm *kvm = vcpu->kvm;
189 199
@@ -215,7 +225,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
215 hash = (hash << 3) + 7; 225 hash = (hash << 3) + 7;
216 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 226 hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
217 hp_r = hp1 | addr; 227 hp_r = hp1 | addr;
218 ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); 228 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
229 &idx_ret);
219 if (ret != H_SUCCESS) { 230 if (ret != H_SUCCESS) {
220 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 231 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
221 addr, ret); 232 addr, ret);
@@ -260,7 +271,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
260 271
261/* 272/*
262 * This is called to get a reference to a guest page if there isn't 273 * This is called to get a reference to a guest page if there isn't
263 * one already in the kvm->arch.slot_phys[][] arrays. 274 * one already in the memslot->arch.slot_phys[] array.
264 */ 275 */
265static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, 276static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
266 struct kvm_memory_slot *memslot, 277 struct kvm_memory_slot *memslot,
@@ -275,7 +286,7 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
275 struct vm_area_struct *vma; 286 struct vm_area_struct *vma;
276 unsigned long pfn, i, npages; 287 unsigned long pfn, i, npages;
277 288
278 physp = kvm->arch.slot_phys[memslot->id]; 289 physp = memslot->arch.slot_phys;
279 if (!physp) 290 if (!physp)
280 return -EINVAL; 291 return -EINVAL;
281 if (physp[gfn - memslot->base_gfn]) 292 if (physp[gfn - memslot->base_gfn])
@@ -353,15 +364,10 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
353 return err; 364 return err;
354} 365}
355 366
356/* 367long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
357 * We come here on a H_ENTER call from the guest when we are not 368 long pte_index, unsigned long pteh,
358 * using mmu notifiers and we don't have the requested page pinned 369 unsigned long ptel, unsigned long *pte_idx_ret)
359 * already.
360 */
361long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
362 long pte_index, unsigned long pteh, unsigned long ptel)
363{ 370{
364 struct kvm *kvm = vcpu->kvm;
365 unsigned long psize, gpa, gfn; 371 unsigned long psize, gpa, gfn;
366 struct kvm_memory_slot *memslot; 372 struct kvm_memory_slot *memslot;
367 long ret; 373 long ret;
@@ -389,8 +395,8 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
389 do_insert: 395 do_insert:
390 /* Protect linux PTE lookup from page table destruction */ 396 /* Protect linux PTE lookup from page table destruction */
391 rcu_read_lock_sched(); /* this disables preemption too */ 397 rcu_read_lock_sched(); /* this disables preemption too */
392 vcpu->arch.pgdir = current->mm->pgd; 398 ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
393 ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); 399 current->mm->pgd, false, pte_idx_ret);
394 rcu_read_unlock_sched(); 400 rcu_read_unlock_sched();
395 if (ret == H_TOO_HARD) { 401 if (ret == H_TOO_HARD) {
396 /* this can't happen */ 402 /* this can't happen */
@@ -401,6 +407,19 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
401 407
402} 408}
403 409
410/*
411 * We come here on a H_ENTER call from the guest when we are not
412 * using mmu notifiers and we don't have the requested page pinned
413 * already.
414 */
415long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
416 long pte_index, unsigned long pteh,
417 unsigned long ptel)
418{
419 return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index,
420 pteh, ptel, &vcpu->arch.gpr[4]);
421}
422
404static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, 423static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
405 gva_t eaddr) 424 gva_t eaddr)
406{ 425{
@@ -570,7 +589,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
570 struct kvm *kvm = vcpu->kvm; 589 struct kvm *kvm = vcpu->kvm;
571 unsigned long *hptep, hpte[3], r; 590 unsigned long *hptep, hpte[3], r;
572 unsigned long mmu_seq, psize, pte_size; 591 unsigned long mmu_seq, psize, pte_size;
573 unsigned long gfn, hva, pfn; 592 unsigned long gpa, gfn, hva, pfn;
574 struct kvm_memory_slot *memslot; 593 struct kvm_memory_slot *memslot;
575 unsigned long *rmap; 594 unsigned long *rmap;
576 struct revmap_entry *rev; 595 struct revmap_entry *rev;
@@ -608,15 +627,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
608 627
609 /* Translate the logical address and get the page */ 628 /* Translate the logical address and get the page */
610 psize = hpte_page_size(hpte[0], r); 629 psize = hpte_page_size(hpte[0], r);
611 gfn = hpte_rpn(r, psize); 630 gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
631 gfn = gpa >> PAGE_SHIFT;
612 memslot = gfn_to_memslot(kvm, gfn); 632 memslot = gfn_to_memslot(kvm, gfn);
613 633
614 /* No memslot means it's an emulated MMIO region */ 634 /* No memslot means it's an emulated MMIO region */
615 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { 635 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
616 unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
617 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, 636 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
618 dsisr & DSISR_ISSTORE); 637 dsisr & DSISR_ISSTORE);
619 }
620 638
621 if (!kvm->arch.using_mmu_notifiers) 639 if (!kvm->arch.using_mmu_notifiers)
622 return -EFAULT; /* should never get here */ 640 return -EFAULT; /* should never get here */
@@ -710,7 +728,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
710 728
711 /* Check if we might have been invalidated; let the guest retry if so */ 729 /* Check if we might have been invalidated; let the guest retry if so */
712 ret = RESUME_GUEST; 730 ret = RESUME_GUEST;
713 if (mmu_notifier_retry(vcpu, mmu_seq)) { 731 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
714 unlock_rmap(rmap); 732 unlock_rmap(rmap);
715 goto out_unlock; 733 goto out_unlock;
716 } 734 }
@@ -756,6 +774,25 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
756 goto out_put; 774 goto out_put;
757} 775}
758 776
777static void kvmppc_rmap_reset(struct kvm *kvm)
778{
779 struct kvm_memslots *slots;
780 struct kvm_memory_slot *memslot;
781 int srcu_idx;
782
783 srcu_idx = srcu_read_lock(&kvm->srcu);
784 slots = kvm->memslots;
785 kvm_for_each_memslot(memslot, slots) {
786 /*
787 * This assumes it is acceptable to lose reference and
788 * change bits across a reset.
789 */
790 memset(memslot->arch.rmap, 0,
791 memslot->npages * sizeof(*memslot->arch.rmap));
792 }
793 srcu_read_unlock(&kvm->srcu, srcu_idx);
794}
795
759static int kvm_handle_hva_range(struct kvm *kvm, 796static int kvm_handle_hva_range(struct kvm *kvm,
760 unsigned long start, 797 unsigned long start,
761 unsigned long end, 798 unsigned long end,
@@ -850,7 +887,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
850 psize = hpte_page_size(hptep[0], ptel); 887 psize = hpte_page_size(hptep[0], ptel);
851 if ((hptep[0] & HPTE_V_VALID) && 888 if ((hptep[0] & HPTE_V_VALID) &&
852 hpte_rpn(ptel, psize) == gfn) { 889 hpte_rpn(ptel, psize) == gfn) {
853 hptep[0] |= HPTE_V_ABSENT; 890 if (kvm->arch.using_mmu_notifiers)
891 hptep[0] |= HPTE_V_ABSENT;
854 kvmppc_invalidate_hpte(kvm, hptep, i); 892 kvmppc_invalidate_hpte(kvm, hptep, i);
855 /* Harvest R and C */ 893 /* Harvest R and C */
856 rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); 894 rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
@@ -877,6 +915,28 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
877 return 0; 915 return 0;
878} 916}
879 917
918void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
919{
920 unsigned long *rmapp;
921 unsigned long gfn;
922 unsigned long n;
923
924 rmapp = memslot->arch.rmap;
925 gfn = memslot->base_gfn;
926 for (n = memslot->npages; n; --n) {
927 /*
928 * Testing the present bit without locking is OK because
929 * the memslot has been marked invalid already, and hence
930 * no new HPTEs referencing this page can be created,
931 * thus the present bit can't go from 0 to 1.
932 */
933 if (*rmapp & KVMPPC_RMAP_PRESENT)
934 kvm_unmap_rmapp(kvm, rmapp, gfn);
935 ++rmapp;
936 ++gfn;
937 }
938}
939
880static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 940static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
881 unsigned long gfn) 941 unsigned long gfn)
882{ 942{
@@ -1030,16 +1090,16 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
1030 return ret; 1090 return ret;
1031} 1091}
1032 1092
1033long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 1093long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
1094 unsigned long *map)
1034{ 1095{
1035 unsigned long i; 1096 unsigned long i;
1036 unsigned long *rmapp, *map; 1097 unsigned long *rmapp;
1037 1098
1038 preempt_disable(); 1099 preempt_disable();
1039 rmapp = memslot->arch.rmap; 1100 rmapp = memslot->arch.rmap;
1040 map = memslot->dirty_bitmap;
1041 for (i = 0; i < memslot->npages; ++i) { 1101 for (i = 0; i < memslot->npages; ++i) {
1042 if (kvm_test_clear_dirty(kvm, rmapp)) 1102 if (kvm_test_clear_dirty(kvm, rmapp) && map)
1043 __set_bit_le(i, map); 1103 __set_bit_le(i, map);
1044 ++rmapp; 1104 ++rmapp;
1045 } 1105 }
@@ -1057,20 +1117,22 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1057 unsigned long hva, psize, offset; 1117 unsigned long hva, psize, offset;
1058 unsigned long pa; 1118 unsigned long pa;
1059 unsigned long *physp; 1119 unsigned long *physp;
1120 int srcu_idx;
1060 1121
1122 srcu_idx = srcu_read_lock(&kvm->srcu);
1061 memslot = gfn_to_memslot(kvm, gfn); 1123 memslot = gfn_to_memslot(kvm, gfn);
1062 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1124 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1063 return NULL; 1125 goto err;
1064 if (!kvm->arch.using_mmu_notifiers) { 1126 if (!kvm->arch.using_mmu_notifiers) {
1065 physp = kvm->arch.slot_phys[memslot->id]; 1127 physp = memslot->arch.slot_phys;
1066 if (!physp) 1128 if (!physp)
1067 return NULL; 1129 goto err;
1068 physp += gfn - memslot->base_gfn; 1130 physp += gfn - memslot->base_gfn;
1069 pa = *physp; 1131 pa = *physp;
1070 if (!pa) { 1132 if (!pa) {
1071 if (kvmppc_get_guest_page(kvm, gfn, memslot, 1133 if (kvmppc_get_guest_page(kvm, gfn, memslot,
1072 PAGE_SIZE) < 0) 1134 PAGE_SIZE) < 0)
1073 return NULL; 1135 goto err;
1074 pa = *physp; 1136 pa = *physp;
1075 } 1137 }
1076 page = pfn_to_page(pa >> PAGE_SHIFT); 1138 page = pfn_to_page(pa >> PAGE_SHIFT);
@@ -1079,9 +1141,11 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1079 hva = gfn_to_hva_memslot(memslot, gfn); 1141 hva = gfn_to_hva_memslot(memslot, gfn);
1080 npages = get_user_pages_fast(hva, 1, 1, pages); 1142 npages = get_user_pages_fast(hva, 1, 1, pages);
1081 if (npages < 1) 1143 if (npages < 1)
1082 return NULL; 1144 goto err;
1083 page = pages[0]; 1145 page = pages[0];
1084 } 1146 }
1147 srcu_read_unlock(&kvm->srcu, srcu_idx);
1148
1085 psize = PAGE_SIZE; 1149 psize = PAGE_SIZE;
1086 if (PageHuge(page)) { 1150 if (PageHuge(page)) {
1087 page = compound_head(page); 1151 page = compound_head(page);
@@ -1091,6 +1155,10 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1091 if (nb_ret) 1155 if (nb_ret)
1092 *nb_ret = psize - offset; 1156 *nb_ret = psize - offset;
1093 return page_address(page) + offset; 1157 return page_address(page) + offset;
1158
1159 err:
1160 srcu_read_unlock(&kvm->srcu, srcu_idx);
1161 return NULL;
1094} 1162}
1095 1163
1096void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) 1164void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
@@ -1100,6 +1168,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
1100 put_page(page); 1168 put_page(page);
1101} 1169}
1102 1170
1171/*
1172 * Functions for reading and writing the hash table via reads and
1173 * writes on a file descriptor.
1174 *
1175 * Reads return the guest view of the hash table, which has to be
1176 * pieced together from the real hash table and the guest_rpte
1177 * values in the revmap array.
1178 *
1179 * On writes, each HPTE written is considered in turn, and if it
1180 * is valid, it is written to the HPT as if an H_ENTER with the
1181 * exact flag set was done. When the invalid count is non-zero
1182 * in the header written to the stream, the kernel will make
1183 * sure that that many HPTEs are invalid, and invalidate them
1184 * if not.
1185 */
1186
1187struct kvm_htab_ctx {
1188 unsigned long index;
1189 unsigned long flags;
1190 struct kvm *kvm;
1191 int first_pass;
1192};
1193
1194#define HPTE_SIZE (2 * sizeof(unsigned long))
1195
1196static long record_hpte(unsigned long flags, unsigned long *hptp,
1197 unsigned long *hpte, struct revmap_entry *revp,
1198 int want_valid, int first_pass)
1199{
1200 unsigned long v, r;
1201 int ok = 1;
1202 int valid, dirty;
1203
1204 /* Unmodified entries are uninteresting except on the first pass */
1205 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1206 if (!first_pass && !dirty)
1207 return 0;
1208
1209 valid = 0;
1210 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1211 valid = 1;
1212 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
1213 !(hptp[0] & HPTE_V_BOLTED))
1214 valid = 0;
1215 }
1216 if (valid != want_valid)
1217 return 0;
1218
1219 v = r = 0;
1220 if (valid || dirty) {
1221 /* lock the HPTE so it's stable and read it */
1222 preempt_disable();
1223 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1224 cpu_relax();
1225 v = hptp[0];
1226 if (v & HPTE_V_ABSENT) {
1227 v &= ~HPTE_V_ABSENT;
1228 v |= HPTE_V_VALID;
1229 }
1230 /* re-evaluate valid and dirty from synchronized HPTE value */
1231 valid = !!(v & HPTE_V_VALID);
1232 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
1233 valid = 0;
1234 r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
1235 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1236 /* only clear modified if this is the right sort of entry */
1237 if (valid == want_valid && dirty) {
1238 r &= ~HPTE_GR_MODIFIED;
1239 revp->guest_rpte = r;
1240 }
1241 asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
1242 hptp[0] &= ~HPTE_V_HVLOCK;
1243 preempt_enable();
1244 if (!(valid == want_valid && (first_pass || dirty)))
1245 ok = 0;
1246 }
1247 hpte[0] = v;
1248 hpte[1] = r;
1249 return ok;
1250}
1251
1252static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1253 size_t count, loff_t *ppos)
1254{
1255 struct kvm_htab_ctx *ctx = file->private_data;
1256 struct kvm *kvm = ctx->kvm;
1257 struct kvm_get_htab_header hdr;
1258 unsigned long *hptp;
1259 struct revmap_entry *revp;
1260 unsigned long i, nb, nw;
1261 unsigned long __user *lbuf;
1262 struct kvm_get_htab_header __user *hptr;
1263 unsigned long flags;
1264 int first_pass;
1265 unsigned long hpte[2];
1266
1267 if (!access_ok(VERIFY_WRITE, buf, count))
1268 return -EFAULT;
1269
1270 first_pass = ctx->first_pass;
1271 flags = ctx->flags;
1272
1273 i = ctx->index;
1274 hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1275 revp = kvm->arch.revmap + i;
1276 lbuf = (unsigned long __user *)buf;
1277
1278 nb = 0;
1279 while (nb + sizeof(hdr) + HPTE_SIZE < count) {
1280 /* Initialize header */
1281 hptr = (struct kvm_get_htab_header __user *)buf;
1282 hdr.n_valid = 0;
1283 hdr.n_invalid = 0;
1284 nw = nb;
1285 nb += sizeof(hdr);
1286 lbuf = (unsigned long __user *)(buf + sizeof(hdr));
1287
1288 /* Skip uninteresting entries, i.e. clean on not-first pass */
1289 if (!first_pass) {
1290 while (i < kvm->arch.hpt_npte &&
1291 !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
1292 ++i;
1293 hptp += 2;
1294 ++revp;
1295 }
1296 }
1297 hdr.index = i;
1298
1299 /* Grab a series of valid entries */
1300 while (i < kvm->arch.hpt_npte &&
1301 hdr.n_valid < 0xffff &&
1302 nb + HPTE_SIZE < count &&
1303 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
1304 /* valid entry, write it out */
1305 ++hdr.n_valid;
1306 if (__put_user(hpte[0], lbuf) ||
1307 __put_user(hpte[1], lbuf + 1))
1308 return -EFAULT;
1309 nb += HPTE_SIZE;
1310 lbuf += 2;
1311 ++i;
1312 hptp += 2;
1313 ++revp;
1314 }
1315 /* Now skip invalid entries while we can */
1316 while (i < kvm->arch.hpt_npte &&
1317 hdr.n_invalid < 0xffff &&
1318 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
1319 /* found an invalid entry */
1320 ++hdr.n_invalid;
1321 ++i;
1322 hptp += 2;
1323 ++revp;
1324 }
1325
1326 if (hdr.n_valid || hdr.n_invalid) {
1327 /* write back the header */
1328 if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
1329 return -EFAULT;
1330 nw = nb;
1331 buf = (char __user *)lbuf;
1332 } else {
1333 nb = nw;
1334 }
1335
1336 /* Check if we've wrapped around the hash table */
1337 if (i >= kvm->arch.hpt_npte) {
1338 i = 0;
1339 ctx->first_pass = 0;
1340 break;
1341 }
1342 }
1343
1344 ctx->index = i;
1345
1346 return nb;
1347}
1348
1349static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1350 size_t count, loff_t *ppos)
1351{
1352 struct kvm_htab_ctx *ctx = file->private_data;
1353 struct kvm *kvm = ctx->kvm;
1354 struct kvm_get_htab_header hdr;
1355 unsigned long i, j;
1356 unsigned long v, r;
1357 unsigned long __user *lbuf;
1358 unsigned long *hptp;
1359 unsigned long tmp[2];
1360 ssize_t nb;
1361 long int err, ret;
1362 int rma_setup;
1363
1364 if (!access_ok(VERIFY_READ, buf, count))
1365 return -EFAULT;
1366
1367 /* lock out vcpus from running while we're doing this */
1368 mutex_lock(&kvm->lock);
1369 rma_setup = kvm->arch.rma_setup_done;
1370 if (rma_setup) {
1371 kvm->arch.rma_setup_done = 0; /* temporarily */
1372 /* order rma_setup_done vs. vcpus_running */
1373 smp_mb();
1374 if (atomic_read(&kvm->arch.vcpus_running)) {
1375 kvm->arch.rma_setup_done = 1;
1376 mutex_unlock(&kvm->lock);
1377 return -EBUSY;
1378 }
1379 }
1380
1381 err = 0;
1382 for (nb = 0; nb + sizeof(hdr) <= count; ) {
1383 err = -EFAULT;
1384 if (__copy_from_user(&hdr, buf, sizeof(hdr)))
1385 break;
1386
1387 err = 0;
1388 if (nb + hdr.n_valid * HPTE_SIZE > count)
1389 break;
1390
1391 nb += sizeof(hdr);
1392 buf += sizeof(hdr);
1393
1394 err = -EINVAL;
1395 i = hdr.index;
1396 if (i >= kvm->arch.hpt_npte ||
1397 i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
1398 break;
1399
1400 hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1401 lbuf = (unsigned long __user *)buf;
1402 for (j = 0; j < hdr.n_valid; ++j) {
1403 err = -EFAULT;
1404 if (__get_user(v, lbuf) || __get_user(r, lbuf + 1))
1405 goto out;
1406 err = -EINVAL;
1407 if (!(v & HPTE_V_VALID))
1408 goto out;
1409 lbuf += 2;
1410 nb += HPTE_SIZE;
1411
1412 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
1413 kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1414 err = -EIO;
1415 ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
1416 tmp);
1417 if (ret != H_SUCCESS) {
1418 pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
1419 "r=%lx\n", ret, i, v, r);
1420 goto out;
1421 }
1422 if (!rma_setup && is_vrma_hpte(v)) {
1423 unsigned long psize = hpte_page_size(v, r);
1424 unsigned long senc = slb_pgsize_encoding(psize);
1425 unsigned long lpcr;
1426
1427 kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
1428 (VRMA_VSID << SLB_VSID_SHIFT_1T);
1429 lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
1430 lpcr |= senc << (LPCR_VRMASD_SH - 4);
1431 kvm->arch.lpcr = lpcr;
1432 rma_setup = 1;
1433 }
1434 ++i;
1435 hptp += 2;
1436 }
1437
1438 for (j = 0; j < hdr.n_invalid; ++j) {
1439 if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
1440 kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1441 ++i;
1442 hptp += 2;
1443 }
1444 err = 0;
1445 }
1446
1447 out:
1448 /* Order HPTE updates vs. rma_setup_done */
1449 smp_wmb();
1450 kvm->arch.rma_setup_done = rma_setup;
1451 mutex_unlock(&kvm->lock);
1452
1453 if (err)
1454 return err;
1455 return nb;
1456}
1457
1458static int kvm_htab_release(struct inode *inode, struct file *filp)
1459{
1460 struct kvm_htab_ctx *ctx = filp->private_data;
1461
1462 filp->private_data = NULL;
1463 if (!(ctx->flags & KVM_GET_HTAB_WRITE))
1464 atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
1465 kvm_put_kvm(ctx->kvm);
1466 kfree(ctx);
1467 return 0;
1468}
1469
1470static struct file_operations kvm_htab_fops = {
1471 .read = kvm_htab_read,
1472 .write = kvm_htab_write,
1473 .llseek = default_llseek,
1474 .release = kvm_htab_release,
1475};
1476
1477int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1478{
1479 int ret;
1480 struct kvm_htab_ctx *ctx;
1481 int rwflag;
1482
1483 /* reject flags we don't recognize */
1484 if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
1485 return -EINVAL;
1486 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1487 if (!ctx)
1488 return -ENOMEM;
1489 kvm_get_kvm(kvm);
1490 ctx->kvm = kvm;
1491 ctx->index = ghf->start_index;
1492 ctx->flags = ghf->flags;
1493 ctx->first_pass = 1;
1494
1495 rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1496 ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
1497 if (ret < 0) {
1498 kvm_put_kvm(kvm);
1499 return ret;
1500 }
1501
1502 if (rwflag == O_RDONLY) {
1503 mutex_lock(&kvm->slots_lock);
1504 atomic_inc(&kvm->arch.hpte_mod_interest);
1505 /* make sure kvmppc_do_h_enter etc. see the increment */
1506 synchronize_srcu_expedited(&kvm->srcu);
1507 mutex_unlock(&kvm->slots_lock);
1508 }
1509
1510 return ret;
1511}
1512
1103void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) 1513void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
1104{ 1514{
1105 struct kvmppc_mmu *mmu = &vcpu->arch.mmu; 1515 struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
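
Not part of the patch: a hedged userspace sketch of driving kvm_vm_ioctl_get_htab_fd() above — open a read-only HPT stream with the KVM_PPC_GET_HTAB_FD vm ioctl and pull one buffer. Record parsing is sketched earlier, after the uapi kvm.h hunk. It assumes a linux/kvm.h that defines the new ioctl and struct kvm_get_htab_fd; error handling is minimal and the buffer size is arbitrary.

/* Sketch: fetch the first chunk of HPT data for a VM. */
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int dump_htab_start(int vm_fd)
{
	struct kvm_get_htab_fd ghf = {
		.flags       = 0,	/* all entries; not KVM_GET_HTAB_WRITE */
		.start_index = 0,	/* begin at HPTE 0 */
	};
	char buf[65536];
	ssize_t n;
	int fd;

	fd = ioctl(vm_fd, KVM_PPC_GET_HTAB_FD, &ghf);
	if (fd < 0)
		return -1;

	n = read(fd, buf, sizeof(buf));	/* headers + HPTE records */
	printf("read %zd bytes of HPT data\n", n);
	close(fd);
	return 0;
}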
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index b9a989dc76cc..d31a716f7f2b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -22,6 +22,7 @@
22#include <asm/kvm_book3s.h> 22#include <asm/kvm_book3s.h>
23#include <asm/reg.h> 23#include <asm/reg.h>
24#include <asm/switch_to.h> 24#include <asm/switch_to.h>
25#include <asm/time.h>
25 26
26#define OP_19_XOP_RFID 18 27#define OP_19_XOP_RFID 18
27#define OP_19_XOP_RFI 50 28#define OP_19_XOP_RFI 50
@@ -395,6 +396,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
395 (mfmsr() & MSR_HV)) 396 (mfmsr() & MSR_HV))
396 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; 397 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
397 break; 398 break;
399 case SPRN_PURR:
400 to_book3s(vcpu)->purr_offset = spr_val - get_tb();
401 break;
402 case SPRN_SPURR:
403 to_book3s(vcpu)->spurr_offset = spr_val - get_tb();
404 break;
398 case SPRN_GQR0: 405 case SPRN_GQR0:
399 case SPRN_GQR1: 406 case SPRN_GQR1:
400 case SPRN_GQR2: 407 case SPRN_GQR2:
@@ -412,6 +419,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
412 case SPRN_CTRLF: 419 case SPRN_CTRLF:
413 case SPRN_CTRLT: 420 case SPRN_CTRLT:
414 case SPRN_L2CR: 421 case SPRN_L2CR:
422 case SPRN_DSCR:
415 case SPRN_MMCR0_GEKKO: 423 case SPRN_MMCR0_GEKKO:
416 case SPRN_MMCR1_GEKKO: 424 case SPRN_MMCR1_GEKKO:
417 case SPRN_PMC1_GEKKO: 425 case SPRN_PMC1_GEKKO:
@@ -483,9 +491,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
483 *spr_val = to_book3s(vcpu)->hid[5]; 491 *spr_val = to_book3s(vcpu)->hid[5];
484 break; 492 break;
485 case SPRN_CFAR: 493 case SPRN_CFAR:
486 case SPRN_PURR: 494 case SPRN_DSCR:
487 *spr_val = 0; 495 *spr_val = 0;
488 break; 496 break;
497 case SPRN_PURR:
498 *spr_val = get_tb() + to_book3s(vcpu)->purr_offset;
499 break;
500 case SPRN_SPURR:
501 *spr_val = get_tb() + to_book3s(vcpu)->spurr_offset;
502 break;
489 case SPRN_GQR0: 503 case SPRN_GQR0:
490 case SPRN_GQR1: 504 case SPRN_GQR1:
491 case SPRN_GQR2: 505 case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index a150817d6d4c..7057a02f0906 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -28,8 +28,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
28#ifdef CONFIG_ALTIVEC 28#ifdef CONFIG_ALTIVEC
29EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); 29EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
30#endif 30#endif
31#ifdef CONFIG_VSX
32EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
33#endif
34#endif 31#endif
35 32
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 721d4603a235..71d0c90b62bf 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -30,6 +30,7 @@
30#include <linux/cpumask.h> 30#include <linux/cpumask.h>
31#include <linux/spinlock.h> 31#include <linux/spinlock.h>
32#include <linux/page-flags.h> 32#include <linux/page-flags.h>
33#include <linux/srcu.h>
33 34
34#include <asm/reg.h> 35#include <asm/reg.h>
35#include <asm/cputable.h> 36#include <asm/cputable.h>
@@ -46,6 +47,7 @@
46#include <asm/page.h> 47#include <asm/page.h>
47#include <asm/hvcall.h> 48#include <asm/hvcall.h>
48#include <asm/switch_to.h> 49#include <asm/switch_to.h>
50#include <asm/smp.h>
49#include <linux/gfp.h> 51#include <linux/gfp.h>
50#include <linux/vmalloc.h> 52#include <linux/vmalloc.h>
51#include <linux/highmem.h> 53#include <linux/highmem.h>
@@ -55,25 +57,77 @@
55/* #define EXIT_DEBUG_SIMPLE */ 57/* #define EXIT_DEBUG_SIMPLE */
56/* #define EXIT_DEBUG_INT */ 58/* #define EXIT_DEBUG_INT */
57 59
60/* Used to indicate that a guest page fault needs to be handled */
61#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1)
62
63/* Used as a "null" value for timebase values */
64#define TB_NIL (~(u64)0)
65
58static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 66static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
59static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 67static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
60 68
69/*
70 * We use the vcpu_load/put functions to measure stolen time.
71 * Stolen time is counted as time when either the vcpu is able to
72 * run as part of a virtual core, but the task running the vcore
73 * is preempted or sleeping, or when the vcpu needs something done
74 * in the kernel by the task running the vcpu, but that task is
75 * preempted or sleeping. Those two things have to be counted
76 * separately, since one of the vcpu tasks will take on the job
77 * of running the core, and the other vcpu tasks in the vcore will
78 * sleep waiting for it to do that, but that sleep shouldn't count
79 * as stolen time.
80 *
81 * Hence we accumulate stolen time when the vcpu can run as part of
82 * a vcore using vc->stolen_tb, and the stolen time when the vcpu
83 * needs its task to do other things in the kernel (for example,
84 * service a page fault) in busy_stolen. We don't accumulate
85 * stolen time for a vcore when it is inactive, or for a vcpu
86 * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of
87 * a misnomer; it means that the vcpu task is not executing in
88 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
89 * the kernel. We don't have any way of dividing up that time
90 * between time that the vcpu is genuinely stopped, time that
91 * the task is actively working on behalf of the vcpu, and time
92 * that the task is preempted, so we don't count any of it as
93 * stolen.
94 *
95 * Updates to busy_stolen are protected by arch.tbacct_lock;
96 * updates to vc->stolen_tb are protected by the arch.tbacct_lock
97 * of the vcpu that has taken responsibility for running the vcore
98 * (i.e. vc->runner). The stolen times are measured in units of
99 * timebase ticks. (Note that the != TB_NIL checks below are
100 * purely defensive; they should never fail.)
101 */
102
61void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 103void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
62{ 104{
63 struct kvmppc_vcore *vc = vcpu->arch.vcore; 105 struct kvmppc_vcore *vc = vcpu->arch.vcore;
64 106
65 local_paca->kvm_hstate.kvm_vcpu = vcpu; 107 spin_lock(&vcpu->arch.tbacct_lock);
66 local_paca->kvm_hstate.kvm_vcore = vc; 108 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE &&
67 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) 109 vc->preempt_tb != TB_NIL) {
68 vc->stolen_tb += mftb() - vc->preempt_tb; 110 vc->stolen_tb += mftb() - vc->preempt_tb;
111 vc->preempt_tb = TB_NIL;
112 }
113 if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
114 vcpu->arch.busy_preempt != TB_NIL) {
115 vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
116 vcpu->arch.busy_preempt = TB_NIL;
117 }
118 spin_unlock(&vcpu->arch.tbacct_lock);
69} 119}
70 120
71void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) 121void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
72{ 122{
73 struct kvmppc_vcore *vc = vcpu->arch.vcore; 123 struct kvmppc_vcore *vc = vcpu->arch.vcore;
74 124
125 spin_lock(&vcpu->arch.tbacct_lock);
75 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) 126 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
76 vc->preempt_tb = mftb(); 127 vc->preempt_tb = mftb();
128 if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
129 vcpu->arch.busy_preempt = mftb();
130 spin_unlock(&vcpu->arch.tbacct_lock);
77} 131}
78 132
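For readers tracing the new accounting, here is a self-contained sketch of the pattern the load/put hooks above implement: stamp the timebase when the task is scheduled out, and fold the elapsed ticks into a stolen counter when it is scheduled back in. All names (acct, acct_load, acct_put, mftb_stub) are invented for illustration; this is not kernel code.

#include <stdint.h>
#include <stdio.h>

#define TB_NIL (~(uint64_t)0)

struct acct {
	uint64_t stolen;      /* accumulated stolen ticks */
	uint64_t preempt_tb;  /* timestamp at preemption, or TB_NIL */
};

static uint64_t fake_tb;                            /* stands in for the timebase */
static uint64_t mftb_stub(void) { return fake_tb; }

/* Scheduled back in (cf. kvmppc_core_vcpu_load): bank the interval. */
static void acct_load(struct acct *a)
{
	if (a->preempt_tb != TB_NIL) {
		a->stolen += mftb_stub() - a->preempt_tb;
		a->preempt_tb = TB_NIL;
	}
}

/* Scheduled out (cf. kvmppc_core_vcpu_put): remember when it happened. */
static void acct_put(struct acct *a)
{
	a->preempt_tb = mftb_stub();
}

int main(void)
{
	struct acct a = { 0, TB_NIL };

	fake_tb = 100; acct_put(&a);   /* preempted at tick 100 */
	fake_tb = 160; acct_load(&a);  /* resumed at tick 160 */
	printf("stolen = %llu ticks\n", (unsigned long long)a.stolen);  /* 60 */
	return 0;
}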
79void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) 133void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
@@ -142,6 +196,22 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
142 vpa->yield_count = 1; 196 vpa->yield_count = 1;
143} 197}
144 198
199static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
200 unsigned long addr, unsigned long len)
201{
202 /* check address is cacheline aligned */
203 if (addr & (L1_CACHE_BYTES - 1))
204 return -EINVAL;
205 spin_lock(&vcpu->arch.vpa_update_lock);
206 if (v->next_gpa != addr || v->len != len) {
207 v->next_gpa = addr;
208 v->len = addr ? len : 0;
209 v->update_pending = 1;
210 }
211 spin_unlock(&vcpu->arch.vpa_update_lock);
212 return 0;
213}
214
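The cacheline check in set_vpa() uses the usual mask test for power-of-two alignment; a minimal stand-alone illustration, where ALIGN_BYTES is an invented stand-in for L1_CACHE_BYTES:

#include <assert.h>

#define ALIGN_BYTES 128u   /* must be a power of two, like L1_CACHE_BYTES */

/* Non-zero low bits mean the address is not a multiple of ALIGN_BYTES. */
static int is_misaligned(unsigned long addr)
{
	return (addr & (ALIGN_BYTES - 1)) != 0;
}

int main(void)
{
	assert(!is_misaligned(0x1000));  /* 4096 is a multiple of 128 */
	assert(is_misaligned(0x1004));   /* low bits 0x004 are set */
	return 0;
}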
145/* Length for a per-processor buffer is passed in at offset 4 in the buffer */ 215/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
146struct reg_vpa { 216struct reg_vpa {
147 u32 dummy; 217 u32 dummy;
@@ -317,10 +387,16 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
317 387
318static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) 388static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
319{ 389{
390 if (!(vcpu->arch.vpa.update_pending ||
391 vcpu->arch.slb_shadow.update_pending ||
392 vcpu->arch.dtl.update_pending))
393 return;
394
320 spin_lock(&vcpu->arch.vpa_update_lock); 395 spin_lock(&vcpu->arch.vpa_update_lock);
321 if (vcpu->arch.vpa.update_pending) { 396 if (vcpu->arch.vpa.update_pending) {
322 kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); 397 kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
323 init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); 398 if (vcpu->arch.vpa.pinned_addr)
399 init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
324 } 400 }
325 if (vcpu->arch.dtl.update_pending) { 401 if (vcpu->arch.dtl.update_pending) {
326 kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); 402 kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
@@ -332,24 +408,61 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
332 spin_unlock(&vcpu->arch.vpa_update_lock); 408 spin_unlock(&vcpu->arch.vpa_update_lock);
333} 409}
334 410
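The update_pending dance used by set_vpa() and kvmppc_update_vpas() is a deferred-update handshake: the requester records the new value and sets a flag under the lock, and the vcpu task applies it later at a safe point, now skipping the lock entirely when nothing is pending. A reduced userspace sketch of that shape, with invented names and a pthread mutex standing in for vpa_update_lock:

#include <pthread.h>
#include <stdbool.h>

struct deferred_val {
	pthread_mutex_t lock;
	unsigned long next;       /* value requested by the setter */
	unsigned long current;    /* value the consumer is using */
	bool update_pending;
};

/* Requester (e.g. an ioctl path): record the request, apply nothing yet. */
static void request_update(struct deferred_val *v, unsigned long val)
{
	pthread_mutex_lock(&v->lock);
	if (v->next != val) {
		v->next = val;
		v->update_pending = true;
	}
	pthread_mutex_unlock(&v->lock);
}

/* Consumer: apply at a safe point; cheap unlocked check first, as in the patch. */
static void apply_update(struct deferred_val *v)
{
	if (!v->update_pending)
		return;
	pthread_mutex_lock(&v->lock);
	if (v->update_pending) {
		v->current = v->next;
		v->update_pending = false;
	}
	pthread_mutex_unlock(&v->lock);
}

int main(void)
{
	struct deferred_val v = { .lock = PTHREAD_MUTEX_INITIALIZER };

	request_update(&v, 42);
	apply_update(&v);                 /* v.current is now 42 */
	return v.current == 42 ? 0 : 1;
}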
411/*
412 * Return the accumulated stolen time for the vcore up until `now'.
413 * The caller should hold the vcore lock.
414 */
415static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
416{
417 u64 p;
418
419 /*
420 * If we are the task running the vcore, then since we hold
421 * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb
422 * can't be updated, so we don't need the tbacct_lock.
423 * If the vcore is inactive, it can't become active (since we
424 * hold the vcore lock), so the vcpu load/put functions won't
425 * update stolen_tb/preempt_tb, and we don't need tbacct_lock.
426 */
427 if (vc->vcore_state != VCORE_INACTIVE &&
428 vc->runner->arch.run_task != current) {
429 spin_lock(&vc->runner->arch.tbacct_lock);
430 p = vc->stolen_tb;
431 if (vc->preempt_tb != TB_NIL)
432 p += now - vc->preempt_tb;
433 spin_unlock(&vc->runner->arch.tbacct_lock);
434 } else {
435 p = vc->stolen_tb;
436 }
437 return p;
438}
439
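As a concrete reading of vcore_stolen_time(): with 500 ticks already banked in vc->stolen_tb and the vcore preempted 80 ticks before `now' while some other task is the runner, the function returns 580; if the caller is itself the runner (and so cannot be preempted while holding the vcore lock), or the vcore is inactive, it returns just the banked 500.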
335static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, 440static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
336 struct kvmppc_vcore *vc) 441 struct kvmppc_vcore *vc)
337{ 442{
338 struct dtl_entry *dt; 443 struct dtl_entry *dt;
339 struct lppaca *vpa; 444 struct lppaca *vpa;
340 unsigned long old_stolen; 445 unsigned long stolen;
446 unsigned long core_stolen;
447 u64 now;
341 448
342 dt = vcpu->arch.dtl_ptr; 449 dt = vcpu->arch.dtl_ptr;
343 vpa = vcpu->arch.vpa.pinned_addr; 450 vpa = vcpu->arch.vpa.pinned_addr;
344 old_stolen = vcpu->arch.stolen_logged; 451 now = mftb();
345 vcpu->arch.stolen_logged = vc->stolen_tb; 452 core_stolen = vcore_stolen_time(vc, now);
453 stolen = core_stolen - vcpu->arch.stolen_logged;
454 vcpu->arch.stolen_logged = core_stolen;
455 spin_lock(&vcpu->arch.tbacct_lock);
456 stolen += vcpu->arch.busy_stolen;
457 vcpu->arch.busy_stolen = 0;
458 spin_unlock(&vcpu->arch.tbacct_lock);
346 if (!dt || !vpa) 459 if (!dt || !vpa)
347 return; 460 return;
348 memset(dt, 0, sizeof(struct dtl_entry)); 461 memset(dt, 0, sizeof(struct dtl_entry));
349 dt->dispatch_reason = 7; 462 dt->dispatch_reason = 7;
350 dt->processor_id = vc->pcpu + vcpu->arch.ptid; 463 dt->processor_id = vc->pcpu + vcpu->arch.ptid;
351 dt->timebase = mftb(); 464 dt->timebase = now;
352 dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen; 465 dt->enqueue_to_dispatch_time = stolen;
353 dt->srr0 = kvmppc_get_pc(vcpu); 466 dt->srr0 = kvmppc_get_pc(vcpu);
354 dt->srr1 = vcpu->arch.shregs.msr; 467 dt->srr1 = vcpu->arch.shregs.msr;
355 ++dt; 468 ++dt;
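Worked through with round numbers: if vcore_stolen_time() reports 1,000 ticks, stolen_logged was 700 and busy_stolen held another 50, then the new entry records an enqueue_to_dispatch_time of 350 ticks, stolen_logged is advanced to 1,000 and busy_stolen is cleared.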
@@ -366,13 +479,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
366 unsigned long req = kvmppc_get_gpr(vcpu, 3); 479 unsigned long req = kvmppc_get_gpr(vcpu, 3);
367 unsigned long target, ret = H_SUCCESS; 480 unsigned long target, ret = H_SUCCESS;
368 struct kvm_vcpu *tvcpu; 481 struct kvm_vcpu *tvcpu;
482 int idx;
369 483
370 switch (req) { 484 switch (req) {
371 case H_ENTER: 485 case H_ENTER:
486 idx = srcu_read_lock(&vcpu->kvm->srcu);
372 ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), 487 ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
373 kvmppc_get_gpr(vcpu, 5), 488 kvmppc_get_gpr(vcpu, 5),
374 kvmppc_get_gpr(vcpu, 6), 489 kvmppc_get_gpr(vcpu, 6),
375 kvmppc_get_gpr(vcpu, 7)); 490 kvmppc_get_gpr(vcpu, 7));
491 srcu_read_unlock(&vcpu->kvm->srcu, idx);
376 break; 492 break;
377 case H_CEDE: 493 case H_CEDE:
378 break; 494 break;
@@ -429,6 +545,17 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
429 case BOOK3S_INTERRUPT_PERFMON: 545 case BOOK3S_INTERRUPT_PERFMON:
430 r = RESUME_GUEST; 546 r = RESUME_GUEST;
431 break; 547 break;
548 case BOOK3S_INTERRUPT_MACHINE_CHECK:
549 /*
550 * Deliver a machine check interrupt to the guest.
551 * We have to do this, even if the host has handled the
552 * machine check, because machine checks use SRR0/1 and
553 * the interrupt might have trashed guest state in them.
554 */
555 kvmppc_book3s_queue_irqprio(vcpu,
556 BOOK3S_INTERRUPT_MACHINE_CHECK);
557 r = RESUME_GUEST;
558 break;
432 case BOOK3S_INTERRUPT_PROGRAM: 559 case BOOK3S_INTERRUPT_PROGRAM:
433 { 560 {
434 ulong flags; 561 ulong flags;
@@ -470,12 +597,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
470 * have been handled already. 597 * have been handled already.
471 */ 598 */
472 case BOOK3S_INTERRUPT_H_DATA_STORAGE: 599 case BOOK3S_INTERRUPT_H_DATA_STORAGE:
473 r = kvmppc_book3s_hv_page_fault(run, vcpu, 600 r = RESUME_PAGE_FAULT;
474 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
475 break; 601 break;
476 case BOOK3S_INTERRUPT_H_INST_STORAGE: 602 case BOOK3S_INTERRUPT_H_INST_STORAGE:
477 r = kvmppc_book3s_hv_page_fault(run, vcpu, 603 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
478 kvmppc_get_pc(vcpu), 0); 604 vcpu->arch.fault_dsisr = 0;
605 r = RESUME_PAGE_FAULT;
479 break; 606 break;
480 /* 607 /*
481 * This occurs if the guest executes an illegal instruction. 608 * This occurs if the guest executes an illegal instruction.
@@ -535,36 +662,174 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
535 return 0; 662 return 0;
536} 663}
537 664
538int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 665int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
539{ 666{
540 int r = -EINVAL; 667 int r = 0;
668 long int i;
541 669
542 switch (reg->id) { 670 switch (id) {
543 case KVM_REG_PPC_HIOR: 671 case KVM_REG_PPC_HIOR:
544 r = put_user(0, (u64 __user *)reg->addr); 672 *val = get_reg_val(id, 0);
673 break;
674 case KVM_REG_PPC_DABR:
675 *val = get_reg_val(id, vcpu->arch.dabr);
676 break;
677 case KVM_REG_PPC_DSCR:
678 *val = get_reg_val(id, vcpu->arch.dscr);
679 break;
680 case KVM_REG_PPC_PURR:
681 *val = get_reg_val(id, vcpu->arch.purr);
682 break;
683 case KVM_REG_PPC_SPURR:
684 *val = get_reg_val(id, vcpu->arch.spurr);
685 break;
686 case KVM_REG_PPC_AMR:
687 *val = get_reg_val(id, vcpu->arch.amr);
688 break;
689 case KVM_REG_PPC_UAMOR:
690 *val = get_reg_val(id, vcpu->arch.uamor);
691 break;
692 case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
693 i = id - KVM_REG_PPC_MMCR0;
694 *val = get_reg_val(id, vcpu->arch.mmcr[i]);
695 break;
696 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
697 i = id - KVM_REG_PPC_PMC1;
698 *val = get_reg_val(id, vcpu->arch.pmc[i]);
699 break;
700#ifdef CONFIG_VSX
701 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
702 if (cpu_has_feature(CPU_FTR_VSX)) {
703 /* VSX => FP reg i is stored in arch.vsr[2*i] */
704 long int i = id - KVM_REG_PPC_FPR0;
705 *val = get_reg_val(id, vcpu->arch.vsr[2 * i]);
706 } else {
707 /* let generic code handle it */
708 r = -EINVAL;
709 }
710 break;
711 case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
712 if (cpu_has_feature(CPU_FTR_VSX)) {
713 long int i = id - KVM_REG_PPC_VSR0;
714 val->vsxval[0] = vcpu->arch.vsr[2 * i];
715 val->vsxval[1] = vcpu->arch.vsr[2 * i + 1];
716 } else {
717 r = -ENXIO;
718 }
719 break;
720#endif /* CONFIG_VSX */
721 case KVM_REG_PPC_VPA_ADDR:
722 spin_lock(&vcpu->arch.vpa_update_lock);
723 *val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
724 spin_unlock(&vcpu->arch.vpa_update_lock);
725 break;
726 case KVM_REG_PPC_VPA_SLB:
727 spin_lock(&vcpu->arch.vpa_update_lock);
728 val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
729 val->vpaval.length = vcpu->arch.slb_shadow.len;
730 spin_unlock(&vcpu->arch.vpa_update_lock);
731 break;
732 case KVM_REG_PPC_VPA_DTL:
733 spin_lock(&vcpu->arch.vpa_update_lock);
734 val->vpaval.addr = vcpu->arch.dtl.next_gpa;
735 val->vpaval.length = vcpu->arch.dtl.len;
736 spin_unlock(&vcpu->arch.vpa_update_lock);
545 break; 737 break;
546 default: 738 default:
739 r = -EINVAL;
547 break; 740 break;
548 } 741 }
549 742
550 return r; 743 return r;
551} 744}
552 745
553int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 746int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
554{ 747{
555 int r = -EINVAL; 748 int r = 0;
749 long int i;
750 unsigned long addr, len;
556 751
557 switch (reg->id) { 752 switch (id) {
558 case KVM_REG_PPC_HIOR: 753 case KVM_REG_PPC_HIOR:
559 {
560 u64 hior;
561 /* Only allow this to be set to zero */ 754 /* Only allow this to be set to zero */
562 r = get_user(hior, (u64 __user *)reg->addr); 755 if (set_reg_val(id, *val))
563 if (!r && (hior != 0))
564 r = -EINVAL; 756 r = -EINVAL;
565 break; 757 break;
566 } 758 case KVM_REG_PPC_DABR:
759 vcpu->arch.dabr = set_reg_val(id, *val);
760 break;
761 case KVM_REG_PPC_DSCR:
762 vcpu->arch.dscr = set_reg_val(id, *val);
763 break;
764 case KVM_REG_PPC_PURR:
765 vcpu->arch.purr = set_reg_val(id, *val);
766 break;
767 case KVM_REG_PPC_SPURR:
768 vcpu->arch.spurr = set_reg_val(id, *val);
769 break;
770 case KVM_REG_PPC_AMR:
771 vcpu->arch.amr = set_reg_val(id, *val);
772 break;
773 case KVM_REG_PPC_UAMOR:
774 vcpu->arch.uamor = set_reg_val(id, *val);
775 break;
776 case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
777 i = id - KVM_REG_PPC_MMCR0;
778 vcpu->arch.mmcr[i] = set_reg_val(id, *val);
779 break;
780 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
781 i = id - KVM_REG_PPC_PMC1;
782 vcpu->arch.pmc[i] = set_reg_val(id, *val);
783 break;
784#ifdef CONFIG_VSX
785 case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
786 if (cpu_has_feature(CPU_FTR_VSX)) {
787 /* VSX => FP reg i is stored in arch.vsr[2*i] */
788 long int i = id - KVM_REG_PPC_FPR0;
789 vcpu->arch.vsr[2 * i] = set_reg_val(id, *val);
790 } else {
791 /* let generic code handle it */
792 r = -EINVAL;
793 }
794 break;
795 case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
796 if (cpu_has_feature(CPU_FTR_VSX)) {
797 long int i = id - KVM_REG_PPC_VSR0;
798 vcpu->arch.vsr[2 * i] = val->vsxval[0];
799 vcpu->arch.vsr[2 * i + 1] = val->vsxval[1];
800 } else {
801 r = -ENXIO;
802 }
803 break;
804#endif /* CONFIG_VSX */
805 case KVM_REG_PPC_VPA_ADDR:
806 addr = set_reg_val(id, *val);
807 r = -EINVAL;
808 if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
809 vcpu->arch.dtl.next_gpa))
810 break;
811 r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
812 break;
813 case KVM_REG_PPC_VPA_SLB:
814 addr = val->vpaval.addr;
815 len = val->vpaval.length;
816 r = -EINVAL;
817 if (addr && !vcpu->arch.vpa.next_gpa)
818 break;
819 r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
820 break;
821 case KVM_REG_PPC_VPA_DTL:
822 addr = val->vpaval.addr;
823 len = val->vpaval.length;
824 r = -EINVAL;
825 if (addr && (len < sizeof(struct dtl_entry) ||
826 !vcpu->arch.vpa.next_gpa))
827 break;
828 len -= len % sizeof(struct dtl_entry);
829 r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
830 break;
567 default: 831 default:
832 r = -EINVAL;
568 break; 833 break;
569 } 834 }
570 835
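The MMCR0...MMCRA and PMC1...PMC8 cases above rely on GCC's case-range extension to map a contiguous block of ONE_REG IDs onto an array index. A stand-alone sketch of that dispatch shape; the IDs, struct and error code are invented, and get_reg_val in the kernel additionally packs the value into a union sized by the ID:

#include <stdio.h>

/* Invented contiguous ID block, mirroring KVM_REG_PPC_PMC1..PMC8 */
enum { REG_PMC1 = 0x100, REG_PMC8 = REG_PMC1 + 7 };

struct vcpu_sketch { unsigned long pmc[8]; };

static int get_one_reg(struct vcpu_sketch *v, unsigned int id, unsigned long *val)
{
	int r = 0;

	switch (id) {
	case REG_PMC1 ... REG_PMC8:            /* GCC/Clang case-range extension */
		*val = v->pmc[id - REG_PMC1];  /* offset within the block picks the slot */
		break;
	default:
		r = -1;                        /* unknown register */
		break;
	}
	return r;
}

int main(void)
{
	struct vcpu_sketch v = { .pmc = { 1, 2, 3, 4, 5, 6, 7, 8 } };
	unsigned long val;

	if (!get_one_reg(&v, REG_PMC1 + 3, &val))   /* PMC4 */
		printf("PMC4 = %lu\n", val);        /* prints 4 */
	return 0;
}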
@@ -599,20 +864,18 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
599 goto free_vcpu; 864 goto free_vcpu;
600 865
601 vcpu->arch.shared = &vcpu->arch.shregs; 866 vcpu->arch.shared = &vcpu->arch.shregs;
602 vcpu->arch.last_cpu = -1;
603 vcpu->arch.mmcr[0] = MMCR0_FC; 867 vcpu->arch.mmcr[0] = MMCR0_FC;
604 vcpu->arch.ctrl = CTRL_RUNLATCH; 868 vcpu->arch.ctrl = CTRL_RUNLATCH;
605 /* default to host PVR, since we can't spoof it */ 869 /* default to host PVR, since we can't spoof it */
606 vcpu->arch.pvr = mfspr(SPRN_PVR); 870 vcpu->arch.pvr = mfspr(SPRN_PVR);
607 kvmppc_set_pvr(vcpu, vcpu->arch.pvr); 871 kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
608 spin_lock_init(&vcpu->arch.vpa_update_lock); 872 spin_lock_init(&vcpu->arch.vpa_update_lock);
873 spin_lock_init(&vcpu->arch.tbacct_lock);
874 vcpu->arch.busy_preempt = TB_NIL;
609 875
610 kvmppc_mmu_book3s_hv_init(vcpu); 876 kvmppc_mmu_book3s_hv_init(vcpu);
611 877
612 /* 878 vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
613 * We consider the vcpu stopped until we see the first run ioctl for it.
614 */
615 vcpu->arch.state = KVMPPC_VCPU_STOPPED;
616 879
617 init_waitqueue_head(&vcpu->arch.cpu_run); 880 init_waitqueue_head(&vcpu->arch.cpu_run);
618 881
@@ -624,9 +887,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
624 INIT_LIST_HEAD(&vcore->runnable_threads); 887 INIT_LIST_HEAD(&vcore->runnable_threads);
625 spin_lock_init(&vcore->lock); 888 spin_lock_init(&vcore->lock);
626 init_waitqueue_head(&vcore->wq); 889 init_waitqueue_head(&vcore->wq);
627 vcore->preempt_tb = mftb(); 890 vcore->preempt_tb = TB_NIL;
628 } 891 }
629 kvm->arch.vcores[core] = vcore; 892 kvm->arch.vcores[core] = vcore;
893 kvm->arch.online_vcores++;
630 } 894 }
631 mutex_unlock(&kvm->lock); 895 mutex_unlock(&kvm->lock);
632 896
@@ -637,7 +901,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
637 ++vcore->num_threads; 901 ++vcore->num_threads;
638 spin_unlock(&vcore->lock); 902 spin_unlock(&vcore->lock);
639 vcpu->arch.vcore = vcore; 903 vcpu->arch.vcore = vcore;
640 vcpu->arch.stolen_logged = vcore->stolen_tb;
641 904
642 vcpu->arch.cpu_type = KVM_CPU_3S_64; 905 vcpu->arch.cpu_type = KVM_CPU_3S_64;
643 kvmppc_sanity_check(vcpu); 906 kvmppc_sanity_check(vcpu);
@@ -697,17 +960,18 @@ extern void xics_wake_cpu(int cpu);
697static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, 960static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
698 struct kvm_vcpu *vcpu) 961 struct kvm_vcpu *vcpu)
699{ 962{
700 struct kvm_vcpu *v; 963 u64 now;
701 964
702 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 965 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
703 return; 966 return;
967 spin_lock(&vcpu->arch.tbacct_lock);
968 now = mftb();
969 vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
970 vcpu->arch.stolen_logged;
971 vcpu->arch.busy_preempt = now;
704 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 972 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
973 spin_unlock(&vcpu->arch.tbacct_lock);
705 --vc->n_runnable; 974 --vc->n_runnable;
706 ++vc->n_busy;
707 /* decrement the physical thread id of each following vcpu */
708 v = vcpu;
709 list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
710 --v->arch.ptid;
711 list_del(&vcpu->arch.run_list); 975 list_del(&vcpu->arch.run_list);
712} 976}
713 977
@@ -720,6 +984,7 @@ static int kvmppc_grab_hwthread(int cpu)
720 984
721 /* Ensure the thread won't go into the kernel if it wakes */ 985 /* Ensure the thread won't go into the kernel if it wakes */
722 tpaca->kvm_hstate.hwthread_req = 1; 986 tpaca->kvm_hstate.hwthread_req = 1;
987 tpaca->kvm_hstate.kvm_vcpu = NULL;
723 988
724 /* 989 /*
725 * If the thread is already executing in the kernel (e.g. handling 990 * If the thread is already executing in the kernel (e.g. handling
@@ -769,7 +1034,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
769 smp_wmb(); 1034 smp_wmb();
770#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 1035#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
771 if (vcpu->arch.ptid) { 1036 if (vcpu->arch.ptid) {
772 kvmppc_grab_hwthread(cpu);
773 xics_wake_cpu(cpu); 1037 xics_wake_cpu(cpu);
774 ++vc->n_woken; 1038 ++vc->n_woken;
775 } 1039 }
@@ -795,7 +1059,8 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
795 1059
796/* 1060/*
797 * Check that we are on thread 0 and that any other threads in 1061 * Check that we are on thread 0 and that any other threads in
798 * this core are off-line. 1062 * this core are off-line. Then grab the threads so they can't
1063 * enter the kernel.
799 */ 1064 */
800static int on_primary_thread(void) 1065static int on_primary_thread(void)
801{ 1066{
@@ -807,6 +1072,17 @@ static int on_primary_thread(void)
807 while (++thr < threads_per_core) 1072 while (++thr < threads_per_core)
808 if (cpu_online(cpu + thr)) 1073 if (cpu_online(cpu + thr))
809 return 0; 1074 return 0;
1075
1076 /* Grab all hw threads so they can't go into the kernel */
1077 for (thr = 1; thr < threads_per_core; ++thr) {
1078 if (kvmppc_grab_hwthread(cpu + thr)) {
1079 /* Couldn't grab one; let the others go */
1080 do {
1081 kvmppc_release_hwthread(cpu + thr);
1082 } while (--thr > 0);
1083 return 0;
1084 }
1085 }
810 return 1; 1086 return 1;
811} 1087}
812 1088
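on_primary_thread() now has to acquire every sibling thread or none; the rollback loop releases the ones already taken if a later grab fails. A generic sketch of that all-or-nothing acquire pattern, where try_grab and release are invented stand-ins for kvmppc_grab_hwthread and kvmppc_release_hwthread:

#include <stdbool.h>
#include <stdio.h>

#define NTHREADS 4

static bool busy[NTHREADS];                 /* pretend one thread is unavailable */

static int try_grab(int t)  { return busy[t] ? -1 : 0; }
static void release(int t)  { (void)t; }    /* nothing to undo in this sketch */

/* Returns 1 if threads 1..NTHREADS-1 were all grabbed, 0 otherwise. */
static int grab_all_secondaries(void)
{
	int thr;

	for (thr = 1; thr < NTHREADS; ++thr) {
		if (try_grab(thr)) {
			/* Couldn't grab this one; give back the earlier ones
			 * (releasing the failed slot too is harmless here, as
			 * in the kernel code). */
			do {
				release(thr);
			} while (--thr > 0);
			return 0;
		}
	}
	return 1;
}

int main(void)
{
	busy[2] = true;
	printf("%s\n", grab_all_secondaries() ? "got all threads" : "backed out");
	return 0;
}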
@@ -814,21 +1090,24 @@ static int on_primary_thread(void)
814 * Run a set of guest threads on a physical core. 1090 * Run a set of guest threads on a physical core.
815 * Called with vc->lock held. 1091 * Called with vc->lock held.
816 */ 1092 */
817static int kvmppc_run_core(struct kvmppc_vcore *vc) 1093static void kvmppc_run_core(struct kvmppc_vcore *vc)
818{ 1094{
819 struct kvm_vcpu *vcpu, *vcpu0, *vnext; 1095 struct kvm_vcpu *vcpu, *vcpu0, *vnext;
820 long ret; 1096 long ret;
821 u64 now; 1097 u64 now;
822 int ptid, i, need_vpa_update; 1098 int ptid, i, need_vpa_update;
1099 int srcu_idx;
1100 struct kvm_vcpu *vcpus_to_update[threads_per_core];
823 1101
824 /* don't start if any threads have a signal pending */ 1102 /* don't start if any threads have a signal pending */
825 need_vpa_update = 0; 1103 need_vpa_update = 0;
826 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1104 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
827 if (signal_pending(vcpu->arch.run_task)) 1105 if (signal_pending(vcpu->arch.run_task))
828 return 0; 1106 return;
829 need_vpa_update |= vcpu->arch.vpa.update_pending | 1107 if (vcpu->arch.vpa.update_pending ||
830 vcpu->arch.slb_shadow.update_pending | 1108 vcpu->arch.slb_shadow.update_pending ||
831 vcpu->arch.dtl.update_pending; 1109 vcpu->arch.dtl.update_pending)
1110 vcpus_to_update[need_vpa_update++] = vcpu;
832 } 1111 }
833 1112
834 /* 1113 /*
@@ -838,7 +1117,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
838 vc->n_woken = 0; 1117 vc->n_woken = 0;
839 vc->nap_count = 0; 1118 vc->nap_count = 0;
840 vc->entry_exit_count = 0; 1119 vc->entry_exit_count = 0;
841 vc->vcore_state = VCORE_RUNNING; 1120 vc->vcore_state = VCORE_STARTING;
842 vc->in_guest = 0; 1121 vc->in_guest = 0;
843 vc->napping_threads = 0; 1122 vc->napping_threads = 0;
844 1123
@@ -848,24 +1127,12 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
848 */ 1127 */
849 if (need_vpa_update) { 1128 if (need_vpa_update) {
850 spin_unlock(&vc->lock); 1129 spin_unlock(&vc->lock);
851 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1130 for (i = 0; i < need_vpa_update; ++i)
852 kvmppc_update_vpas(vcpu); 1131 kvmppc_update_vpas(vcpus_to_update[i]);
853 spin_lock(&vc->lock); 1132 spin_lock(&vc->lock);
854 } 1133 }
855 1134
856 /* 1135 /*
857 * Make sure we are running on thread 0, and that
858 * secondary threads are offline.
859 * XXX we should also block attempts to bring any
860 * secondary threads online.
861 */
862 if (threads_per_core > 1 && !on_primary_thread()) {
863 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
864 vcpu->arch.ret = -EBUSY;
865 goto out;
866 }
867
868 /*
869 * Assign physical thread IDs, first to non-ceded vcpus 1136 * Assign physical thread IDs, first to non-ceded vcpus
870 * and then to ceded ones. 1137 * and then to ceded ones.
871 */ 1138 */
@@ -879,28 +1146,36 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
879 } 1146 }
880 } 1147 }
881 if (!vcpu0) 1148 if (!vcpu0)
882 return 0; /* nothing to run */ 1149 goto out; /* nothing to run; should never happen */
883 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1150 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
884 if (vcpu->arch.ceded) 1151 if (vcpu->arch.ceded)
885 vcpu->arch.ptid = ptid++; 1152 vcpu->arch.ptid = ptid++;
886 1153
887 vc->stolen_tb += mftb() - vc->preempt_tb; 1154 /*
1155 * Make sure we are running on thread 0, and that
1156 * secondary threads are offline.
1157 */
1158 if (threads_per_core > 1 && !on_primary_thread()) {
1159 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
1160 vcpu->arch.ret = -EBUSY;
1161 goto out;
1162 }
1163
888 vc->pcpu = smp_processor_id(); 1164 vc->pcpu = smp_processor_id();
889 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1165 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
890 kvmppc_start_thread(vcpu); 1166 kvmppc_start_thread(vcpu);
891 kvmppc_create_dtl_entry(vcpu, vc); 1167 kvmppc_create_dtl_entry(vcpu, vc);
892 } 1168 }
893 /* Grab any remaining hw threads so they can't go into the kernel */
894 for (i = ptid; i < threads_per_core; ++i)
895 kvmppc_grab_hwthread(vc->pcpu + i);
896 1169
1170 vc->vcore_state = VCORE_RUNNING;
897 preempt_disable(); 1171 preempt_disable();
898 spin_unlock(&vc->lock); 1172 spin_unlock(&vc->lock);
899 1173
900 kvm_guest_enter(); 1174 kvm_guest_enter();
1175
1176 srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu);
1177
901 __kvmppc_vcore_entry(NULL, vcpu0); 1178 __kvmppc_vcore_entry(NULL, vcpu0);
902 for (i = 0; i < threads_per_core; ++i)
903 kvmppc_release_hwthread(vc->pcpu + i);
904 1179
905 spin_lock(&vc->lock); 1180 spin_lock(&vc->lock);
906 /* disable sending of IPIs on virtual external irqs */ 1181 /* disable sending of IPIs on virtual external irqs */
@@ -909,10 +1184,14 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
909 /* wait for secondary threads to finish writing their state to memory */ 1184 /* wait for secondary threads to finish writing their state to memory */
910 if (vc->nap_count < vc->n_woken) 1185 if (vc->nap_count < vc->n_woken)
911 kvmppc_wait_for_nap(vc); 1186 kvmppc_wait_for_nap(vc);
1187 for (i = 0; i < threads_per_core; ++i)
1188 kvmppc_release_hwthread(vc->pcpu + i);
912 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ 1189 /* prevent other vcpu threads from doing kvmppc_start_thread() now */
913 vc->vcore_state = VCORE_EXITING; 1190 vc->vcore_state = VCORE_EXITING;
914 spin_unlock(&vc->lock); 1191 spin_unlock(&vc->lock);
915 1192
1193 srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx);
1194
916 /* make sure updates to secondary vcpu structs are visible now */ 1195 /* make sure updates to secondary vcpu structs are visible now */
917 smp_mb(); 1196 smp_mb();
918 kvm_guest_exit(); 1197 kvm_guest_exit();
@@ -920,6 +1199,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
920 preempt_enable(); 1199 preempt_enable();
921 kvm_resched(vcpu); 1200 kvm_resched(vcpu);
922 1201
1202 spin_lock(&vc->lock);
923 now = get_tb(); 1203 now = get_tb();
924 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1204 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
925 /* cancel pending dec exception if dec is positive */ 1205 /* cancel pending dec exception if dec is positive */
@@ -943,10 +1223,8 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
943 } 1223 }
944 } 1224 }
945 1225
946 spin_lock(&vc->lock);
947 out: 1226 out:
948 vc->vcore_state = VCORE_INACTIVE; 1227 vc->vcore_state = VCORE_INACTIVE;
949 vc->preempt_tb = mftb();
950 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 1228 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
951 arch.run_list) { 1229 arch.run_list) {
952 if (vcpu->arch.ret != RESUME_GUEST) { 1230 if (vcpu->arch.ret != RESUME_GUEST) {
@@ -954,8 +1232,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
954 wake_up(&vcpu->arch.cpu_run); 1232 wake_up(&vcpu->arch.cpu_run);
955 } 1233 }
956 } 1234 }
957
958 return 1;
959} 1235}
960 1236
961/* 1237/*
@@ -979,20 +1255,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
979static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) 1255static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
980{ 1256{
981 DEFINE_WAIT(wait); 1257 DEFINE_WAIT(wait);
982 struct kvm_vcpu *v;
983 int all_idle = 1;
984 1258
985 prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 1259 prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
986 vc->vcore_state = VCORE_SLEEPING; 1260 vc->vcore_state = VCORE_SLEEPING;
987 spin_unlock(&vc->lock); 1261 spin_unlock(&vc->lock);
988 list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { 1262 schedule();
989 if (!v->arch.ceded || v->arch.pending_exceptions) {
990 all_idle = 0;
991 break;
992 }
993 }
994 if (all_idle)
995 schedule();
996 finish_wait(&vc->wq, &wait); 1263 finish_wait(&vc->wq, &wait);
997 spin_lock(&vc->lock); 1264 spin_lock(&vc->lock);
998 vc->vcore_state = VCORE_INACTIVE; 1265 vc->vcore_state = VCORE_INACTIVE;
@@ -1001,13 +1268,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
1001static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 1268static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1002{ 1269{
1003 int n_ceded; 1270 int n_ceded;
1004 int prev_state;
1005 struct kvmppc_vcore *vc; 1271 struct kvmppc_vcore *vc;
1006 struct kvm_vcpu *v, *vn; 1272 struct kvm_vcpu *v, *vn;
1007 1273
1008 kvm_run->exit_reason = 0; 1274 kvm_run->exit_reason = 0;
1009 vcpu->arch.ret = RESUME_GUEST; 1275 vcpu->arch.ret = RESUME_GUEST;
1010 vcpu->arch.trap = 0; 1276 vcpu->arch.trap = 0;
1277 kvmppc_update_vpas(vcpu);
1011 1278
1012 /* 1279 /*
1013 * Synchronize with other threads in this virtual core 1280 * Synchronize with other threads in this virtual core
@@ -1017,8 +1284,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1017 vcpu->arch.ceded = 0; 1284 vcpu->arch.ceded = 0;
1018 vcpu->arch.run_task = current; 1285 vcpu->arch.run_task = current;
1019 vcpu->arch.kvm_run = kvm_run; 1286 vcpu->arch.kvm_run = kvm_run;
1020 prev_state = vcpu->arch.state; 1287 vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
1021 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; 1288 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
1289 vcpu->arch.busy_preempt = TB_NIL;
1022 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); 1290 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
1023 ++vc->n_runnable; 1291 ++vc->n_runnable;
1024 1292
@@ -1027,33 +1295,26 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1027 * If the vcore is already running, we may be able to start 1295 * If the vcore is already running, we may be able to start
1028 * this thread straight away and have it join in. 1296 * this thread straight away and have it join in.
1029 */ 1297 */
1030 if (prev_state == KVMPPC_VCPU_STOPPED) { 1298 if (!signal_pending(current)) {
1031 if (vc->vcore_state == VCORE_RUNNING && 1299 if (vc->vcore_state == VCORE_RUNNING &&
1032 VCORE_EXIT_COUNT(vc) == 0) { 1300 VCORE_EXIT_COUNT(vc) == 0) {
1033 vcpu->arch.ptid = vc->n_runnable - 1; 1301 vcpu->arch.ptid = vc->n_runnable - 1;
1302 kvmppc_create_dtl_entry(vcpu, vc);
1034 kvmppc_start_thread(vcpu); 1303 kvmppc_start_thread(vcpu);
1304 } else if (vc->vcore_state == VCORE_SLEEPING) {
1305 wake_up(&vc->wq);
1035 } 1306 }
1036 1307
1037 } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) 1308 }
1038 --vc->n_busy;
1039 1309
1040 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 1310 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
1041 !signal_pending(current)) { 1311 !signal_pending(current)) {
1042 if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { 1312 if (vc->vcore_state != VCORE_INACTIVE) {
1043 spin_unlock(&vc->lock); 1313 spin_unlock(&vc->lock);
1044 kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); 1314 kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
1045 spin_lock(&vc->lock); 1315 spin_lock(&vc->lock);
1046 continue; 1316 continue;
1047 } 1317 }
1048 vc->runner = vcpu;
1049 n_ceded = 0;
1050 list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
1051 n_ceded += v->arch.ceded;
1052 if (n_ceded == vc->n_runnable)
1053 kvmppc_vcore_blocked(vc);
1054 else
1055 kvmppc_run_core(vc);
1056
1057 list_for_each_entry_safe(v, vn, &vc->runnable_threads, 1318 list_for_each_entry_safe(v, vn, &vc->runnable_threads,
1058 arch.run_list) { 1319 arch.run_list) {
1059 kvmppc_core_prepare_to_enter(v); 1320 kvmppc_core_prepare_to_enter(v);
@@ -1065,22 +1326,40 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1065 wake_up(&v->arch.cpu_run); 1326 wake_up(&v->arch.cpu_run);
1066 } 1327 }
1067 } 1328 }
1329 if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
1330 break;
1331 vc->runner = vcpu;
1332 n_ceded = 0;
1333 list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
1334 if (!v->arch.pending_exceptions)
1335 n_ceded += v->arch.ceded;
1336 if (n_ceded == vc->n_runnable)
1337 kvmppc_vcore_blocked(vc);
1338 else
1339 kvmppc_run_core(vc);
1068 vc->runner = NULL; 1340 vc->runner = NULL;
1069 } 1341 }
1070 1342
1071 if (signal_pending(current)) { 1343 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
1072 if (vc->vcore_state == VCORE_RUNNING || 1344 (vc->vcore_state == VCORE_RUNNING ||
1073 vc->vcore_state == VCORE_EXITING) { 1345 vc->vcore_state == VCORE_EXITING)) {
1074 spin_unlock(&vc->lock); 1346 spin_unlock(&vc->lock);
1075 kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); 1347 kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
1076 spin_lock(&vc->lock); 1348 spin_lock(&vc->lock);
1077 } 1349 }
1078 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 1350
1079 kvmppc_remove_runnable(vc, vcpu); 1351 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
1080 vcpu->stat.signal_exits++; 1352 kvmppc_remove_runnable(vc, vcpu);
1081 kvm_run->exit_reason = KVM_EXIT_INTR; 1353 vcpu->stat.signal_exits++;
1082 vcpu->arch.ret = -EINTR; 1354 kvm_run->exit_reason = KVM_EXIT_INTR;
1083 } 1355 vcpu->arch.ret = -EINTR;
1356 }
1357
1358 if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
1359 /* Wake up some vcpu to run the core */
1360 v = list_first_entry(&vc->runnable_threads,
1361 struct kvm_vcpu, arch.run_list);
1362 wake_up(&v->arch.cpu_run);
1084 } 1363 }
1085 1364
1086 spin_unlock(&vc->lock); 1365 spin_unlock(&vc->lock);
@@ -1090,6 +1369,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1090int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) 1369int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1091{ 1370{
1092 int r; 1371 int r;
1372 int srcu_idx;
1093 1373
1094 if (!vcpu->arch.sane) { 1374 if (!vcpu->arch.sane) {
1095 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 1375 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1120,6 +1400,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1120 flush_vsx_to_thread(current); 1400 flush_vsx_to_thread(current);
1121 vcpu->arch.wqp = &vcpu->arch.vcore->wq; 1401 vcpu->arch.wqp = &vcpu->arch.vcore->wq;
1122 vcpu->arch.pgdir = current->mm->pgd; 1402 vcpu->arch.pgdir = current->mm->pgd;
1403 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
1123 1404
1124 do { 1405 do {
1125 r = kvmppc_run_vcpu(run, vcpu); 1406 r = kvmppc_run_vcpu(run, vcpu);
@@ -1128,10 +1409,16 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1128 !(vcpu->arch.shregs.msr & MSR_PR)) { 1409 !(vcpu->arch.shregs.msr & MSR_PR)) {
1129 r = kvmppc_pseries_do_hcall(vcpu); 1410 r = kvmppc_pseries_do_hcall(vcpu);
1130 kvmppc_core_prepare_to_enter(vcpu); 1411 kvmppc_core_prepare_to_enter(vcpu);
1412 } else if (r == RESUME_PAGE_FAULT) {
1413 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1414 r = kvmppc_book3s_hv_page_fault(run, vcpu,
1415 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
1416 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1131 } 1417 }
1132 } while (r == RESUME_GUEST); 1418 } while (r == RESUME_GUEST);
1133 1419
1134 out: 1420 out:
1421 vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
1135 atomic_dec(&vcpu->kvm->arch.vcpus_running); 1422 atomic_dec(&vcpu->kvm->arch.vcpus_running);
1136 return r; 1423 return r;
1137} 1424}
@@ -1273,7 +1560,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1273 n = kvm_dirty_bitmap_bytes(memslot); 1560 n = kvm_dirty_bitmap_bytes(memslot);
1274 memset(memslot->dirty_bitmap, 0, n); 1561 memset(memslot->dirty_bitmap, 0, n);
1275 1562
1276 r = kvmppc_hv_get_dirty_log(kvm, memslot); 1563 r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
1277 if (r) 1564 if (r)
1278 goto out; 1565 goto out;
1279 1566
@@ -1287,67 +1574,88 @@ out:
1287 return r; 1574 return r;
1288} 1575}
1289 1576
1290static unsigned long slb_pgsize_encoding(unsigned long psize) 1577static void unpin_slot(struct kvm_memory_slot *memslot)
1291{ 1578{
1292 unsigned long senc = 0; 1579 unsigned long *physp;
1580 unsigned long j, npages, pfn;
1581 struct page *page;
1293 1582
1294 if (psize > 0x1000) { 1583 physp = memslot->arch.slot_phys;
1295 senc = SLB_VSID_L; 1584 npages = memslot->npages;
1296 if (psize == 0x10000) 1585 if (!physp)
1297 senc |= SLB_VSID_LP_01; 1586 return;
1587 for (j = 0; j < npages; j++) {
1588 if (!(physp[j] & KVMPPC_GOT_PAGE))
1589 continue;
1590 pfn = physp[j] >> PAGE_SHIFT;
1591 page = pfn_to_page(pfn);
1592 SetPageDirty(page);
1593 put_page(page);
1594 }
1595}
1596
1597void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
1598 struct kvm_memory_slot *dont)
1599{
1600 if (!dont || free->arch.rmap != dont->arch.rmap) {
1601 vfree(free->arch.rmap);
1602 free->arch.rmap = NULL;
1603 }
1604 if (!dont || free->arch.slot_phys != dont->arch.slot_phys) {
1605 unpin_slot(free);
1606 vfree(free->arch.slot_phys);
1607 free->arch.slot_phys = NULL;
1298 } 1608 }
1299 return senc; 1609}
1610
1611int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
1612 unsigned long npages)
1613{
1614 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
1615 if (!slot->arch.rmap)
1616 return -ENOMEM;
1617 slot->arch.slot_phys = NULL;
1618
1619 return 0;
1300} 1620}
1301 1621
1302int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1622int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1303 struct kvm_userspace_memory_region *mem) 1623 struct kvm_memory_slot *memslot,
1624 struct kvm_userspace_memory_region *mem)
1304{ 1625{
1305 unsigned long npages;
1306 unsigned long *phys; 1626 unsigned long *phys;
1307 1627
1308 /* Allocate a slot_phys array */ 1628 /* Allocate a slot_phys array if needed */
1309 phys = kvm->arch.slot_phys[mem->slot]; 1629 phys = memslot->arch.slot_phys;
1310 if (!kvm->arch.using_mmu_notifiers && !phys) { 1630 if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) {
1311 npages = mem->memory_size >> PAGE_SHIFT; 1631 phys = vzalloc(memslot->npages * sizeof(unsigned long));
1312 phys = vzalloc(npages * sizeof(unsigned long));
1313 if (!phys) 1632 if (!phys)
1314 return -ENOMEM; 1633 return -ENOMEM;
1315 kvm->arch.slot_phys[mem->slot] = phys; 1634 memslot->arch.slot_phys = phys;
1316 kvm->arch.slot_npages[mem->slot] = npages;
1317 } 1635 }
1318 1636
1319 return 0; 1637 return 0;
1320} 1638}
1321 1639
1322static void unpin_slot(struct kvm *kvm, int slot_id) 1640void kvmppc_core_commit_memory_region(struct kvm *kvm,
1641 struct kvm_userspace_memory_region *mem,
1642 struct kvm_memory_slot old)
1323{ 1643{
1324 unsigned long *physp; 1644 unsigned long npages = mem->memory_size >> PAGE_SHIFT;
1325 unsigned long j, npages, pfn; 1645 struct kvm_memory_slot *memslot;
1326 struct page *page;
1327 1646
1328 physp = kvm->arch.slot_phys[slot_id]; 1647 if (npages && old.npages) {
1329 npages = kvm->arch.slot_npages[slot_id]; 1648 /*
1330 if (physp) { 1649 * If modifying a memslot, reset all the rmap dirty bits.
1331 spin_lock(&kvm->arch.slot_phys_lock); 1650 * If this is a new memslot, we don't need to do anything
1332 for (j = 0; j < npages; j++) { 1651 * since the rmap array starts out as all zeroes,
1333 if (!(physp[j] & KVMPPC_GOT_PAGE)) 1652 * i.e. no pages are dirty.
1334 continue; 1653 */
1335 pfn = physp[j] >> PAGE_SHIFT; 1654 memslot = id_to_memslot(kvm->memslots, mem->slot);
1336 page = pfn_to_page(pfn); 1655 kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
1337 SetPageDirty(page);
1338 put_page(page);
1339 }
1340 kvm->arch.slot_phys[slot_id] = NULL;
1341 spin_unlock(&kvm->arch.slot_phys_lock);
1342 vfree(physp);
1343 } 1656 }
1344} 1657}
1345 1658
1346void kvmppc_core_commit_memory_region(struct kvm *kvm,
1347 struct kvm_userspace_memory_region *mem)
1348{
1349}
1350
1351static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) 1659static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1352{ 1660{
1353 int err = 0; 1661 int err = 0;
@@ -1362,6 +1670,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1362 unsigned long rmls; 1670 unsigned long rmls;
1363 unsigned long *physp; 1671 unsigned long *physp;
1364 unsigned long i, npages; 1672 unsigned long i, npages;
1673 int srcu_idx;
1365 1674
1366 mutex_lock(&kvm->lock); 1675 mutex_lock(&kvm->lock);
1367 if (kvm->arch.rma_setup_done) 1676 if (kvm->arch.rma_setup_done)
@@ -1377,12 +1686,13 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1377 } 1686 }
1378 1687
1379 /* Look up the memslot for guest physical address 0 */ 1688 /* Look up the memslot for guest physical address 0 */
1689 srcu_idx = srcu_read_lock(&kvm->srcu);
1380 memslot = gfn_to_memslot(kvm, 0); 1690 memslot = gfn_to_memslot(kvm, 0);
1381 1691
1382 /* We must have some memory at 0 by now */ 1692 /* We must have some memory at 0 by now */
1383 err = -EINVAL; 1693 err = -EINVAL;
1384 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1694 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1385 goto out; 1695 goto out_srcu;
1386 1696
1387 /* Look up the VMA for the start of this memory slot */ 1697 /* Look up the VMA for the start of this memory slot */
1388 hva = memslot->userspace_addr; 1698 hva = memslot->userspace_addr;
@@ -1406,14 +1716,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1406 err = -EPERM; 1716 err = -EPERM;
1407 if (cpu_has_feature(CPU_FTR_ARCH_201)) { 1717 if (cpu_has_feature(CPU_FTR_ARCH_201)) {
1408 pr_err("KVM: CPU requires an RMO\n"); 1718 pr_err("KVM: CPU requires an RMO\n");
1409 goto out; 1719 goto out_srcu;
1410 } 1720 }
1411 1721
1412 /* We can handle 4k, 64k or 16M pages in the VRMA */ 1722 /* We can handle 4k, 64k or 16M pages in the VRMA */
1413 err = -EINVAL; 1723 err = -EINVAL;
1414 if (!(psize == 0x1000 || psize == 0x10000 || 1724 if (!(psize == 0x1000 || psize == 0x10000 ||
1415 psize == 0x1000000)) 1725 psize == 0x1000000))
1416 goto out; 1726 goto out_srcu;
1417 1727
1418 /* Update VRMASD field in the LPCR */ 1728 /* Update VRMASD field in the LPCR */
1419 senc = slb_pgsize_encoding(psize); 1729 senc = slb_pgsize_encoding(psize);
@@ -1436,7 +1746,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1436 err = -EINVAL; 1746 err = -EINVAL;
1437 if (rmls < 0) { 1747 if (rmls < 0) {
1438 pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); 1748 pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
1439 goto out; 1749 goto out_srcu;
1440 } 1750 }
1441 atomic_inc(&ri->use_count); 1751 atomic_inc(&ri->use_count);
1442 kvm->arch.rma = ri; 1752 kvm->arch.rma = ri;
@@ -1465,17 +1775,24 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
1465 /* Initialize phys addrs of pages in RMO */ 1775 /* Initialize phys addrs of pages in RMO */
1466 npages = ri->npages; 1776 npages = ri->npages;
1467 porder = __ilog2(npages); 1777 porder = __ilog2(npages);
1468 physp = kvm->arch.slot_phys[memslot->id]; 1778 physp = memslot->arch.slot_phys;
1469 spin_lock(&kvm->arch.slot_phys_lock); 1779 if (physp) {
1470 for (i = 0; i < npages; ++i) 1780 if (npages > memslot->npages)
1471 physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder; 1781 npages = memslot->npages;
1472 spin_unlock(&kvm->arch.slot_phys_lock); 1782 spin_lock(&kvm->arch.slot_phys_lock);
1783 for (i = 0; i < npages; ++i)
1784 physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) +
1785 porder;
1786 spin_unlock(&kvm->arch.slot_phys_lock);
1787 }
1473 } 1788 }
1474 1789
1475 /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ 1790 /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
1476 smp_wmb(); 1791 smp_wmb();
1477 kvm->arch.rma_setup_done = 1; 1792 kvm->arch.rma_setup_done = 1;
1478 err = 0; 1793 err = 0;
1794 out_srcu:
1795 srcu_read_unlock(&kvm->srcu, srcu_idx);
1479 out: 1796 out:
1480 mutex_unlock(&kvm->lock); 1797 mutex_unlock(&kvm->lock);
1481 return err; 1798 return err;
@@ -1496,6 +1813,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1496 return -ENOMEM; 1813 return -ENOMEM;
1497 kvm->arch.lpid = lpid; 1814 kvm->arch.lpid = lpid;
1498 1815
1816 /*
1817 * Since we don't flush the TLB when tearing down a VM,
1818 * and this lpid might have previously been used,
1819 * make sure we flush on each core before running the new VM.
1820 */
1821 cpumask_setall(&kvm->arch.need_tlb_flush);
1822
1499 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1823 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
1500 1824
1501 kvm->arch.rma = NULL; 1825 kvm->arch.rma = NULL;
@@ -1523,16 +1847,19 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1523 1847
1524 kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); 1848 kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
1525 spin_lock_init(&kvm->arch.slot_phys_lock); 1849 spin_lock_init(&kvm->arch.slot_phys_lock);
1850
1851 /*
1852 * Don't allow secondary CPU threads to come online
1853 * while any KVM VMs exist.
1854 */
1855 inhibit_secondary_onlining();
1856
1526 return 0; 1857 return 0;
1527} 1858}
1528 1859
1529void kvmppc_core_destroy_vm(struct kvm *kvm) 1860void kvmppc_core_destroy_vm(struct kvm *kvm)
1530{ 1861{
1531 unsigned long i; 1862 uninhibit_secondary_onlining();
1532
1533 if (!kvm->arch.using_mmu_notifiers)
1534 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
1535 unpin_slot(kvm, i);
1536 1863
1537 if (kvm->arch.rma) { 1864 if (kvm->arch.rma) {
1538 kvm_release_rma(kvm->arch.rma); 1865 kvm_release_rma(kvm->arch.rma);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fb4eac290fef..ec0a9e5de100 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -157,8 +157,8 @@ static void __init kvm_linear_init_one(ulong size, int count, int type)
157 linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); 157 linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
158 for (i = 0; i < count; ++i) { 158 for (i = 0; i < count; ++i) {
159 linear = alloc_bootmem_align(size, size); 159 linear = alloc_bootmem_align(size, size);
160 pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, 160 pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
161 size >> 20); 161 size >> 20);
162 linear_info[i].base_virt = linear; 162 linear_info[i].base_virt = linear;
163 linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; 163 linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
164 linear_info[i].npages = npages; 164 linear_info[i].npages = npages;
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
new file mode 100644
index 000000000000..35f3cf0269b3
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -0,0 +1,144 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
7 */
8
9#include <linux/types.h>
10#include <linux/string.h>
11#include <linux/kvm.h>
12#include <linux/kvm_host.h>
13#include <linux/kernel.h>
14#include <asm/opal.h>
15
16/* SRR1 bits for machine check on POWER7 */
17#define SRR1_MC_LDSTERR (1ul << (63-42))
18#define SRR1_MC_IFETCH_SH (63-45)
19#define SRR1_MC_IFETCH_MASK 0x7
20#define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */
21#define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */
22#define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */
23#define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */
24
25/* DSISR bits for machine check on POWER7 */
26#define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */
27#define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */
28#define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */
29#define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */
30#define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */
31
32/* POWER7 SLB flush and reload */
33static void reload_slb(struct kvm_vcpu *vcpu)
34{
35 struct slb_shadow *slb;
36 unsigned long i, n;
37
38 /* First clear out SLB */
39 asm volatile("slbmte %0,%0; slbia" : : "r" (0));
40
41 /* Do they have an SLB shadow buffer registered? */
42 slb = vcpu->arch.slb_shadow.pinned_addr;
43 if (!slb)
44 return;
45
46 /* Sanity check */
47 n = min_t(u32, slb->persistent, SLB_MIN_SIZE);
48 if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end)
49 return;
50
51 /* Load up the SLB from that */
52 for (i = 0; i < n; ++i) {
53 unsigned long rb = slb->save_area[i].esid;
54 unsigned long rs = slb->save_area[i].vsid;
55
56 rb = (rb & ~0xFFFul) | i; /* insert entry number */
57 asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
58 }
59}
60
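reload_slb() re-creates each entry by or-ing the SLB slot number into the low bits of the saved ESID word before issuing slbmte. The bit manipulation on its own, as a tiny worked example with a fabricated input value and no kernel dependencies:

#include <assert.h>
#include <stdint.h>

/* Replace the low 12 bits of the saved ESID word with the SLB slot number. */
static uint64_t slb_rb(uint64_t saved_esid, uint64_t slot)
{
	return (saved_esid & ~(uint64_t)0xFFF) | slot;
}

int main(void)
{
	/* A saved word with stray low bits, placed in SLB slot 5 */
	assert(slb_rb(0xC000000000000800ULL, 5) == 0xC000000000000005ULL);
	return 0;
}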
61/* POWER7 TLB flush */
62static void flush_tlb_power7(struct kvm_vcpu *vcpu)
63{
64 unsigned long i, rb;
65
66 rb = TLBIEL_INVAL_SET_LPID;
67 for (i = 0; i < POWER7_TLB_SETS; ++i) {
68 asm volatile("tlbiel %0" : : "r" (rb));
69 rb += 1 << TLBIEL_INVAL_SET_SHIFT;
70 }
71}
72
73/*
74 * On POWER7, see if we can handle a machine check that occurred inside
75 * the guest in real mode, without switching to the host partition.
76 *
77 * Returns: 0 => exit guest, 1 => deliver machine check to guest
78 */
79static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
80{
81 unsigned long srr1 = vcpu->arch.shregs.msr;
82 struct opal_machine_check_event *opal_evt;
83 long handled = 1;
84
85 if (srr1 & SRR1_MC_LDSTERR) {
86 /* error on load/store */
87 unsigned long dsisr = vcpu->arch.shregs.dsisr;
88
89 if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
90 DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) {
91 /* flush and reload SLB; flushes D-ERAT too */
92 reload_slb(vcpu);
93 dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
94 DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI);
95 }
96 if (dsisr & DSISR_MC_TLB_MULTI) {
97 flush_tlb_power7(vcpu);
98 dsisr &= ~DSISR_MC_TLB_MULTI;
99 }
100 /* Any other errors we don't understand? */
101 if (dsisr & 0xffffffffUL)
102 handled = 0;
103 }
104
105 switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) {
106 case 0:
107 break;
108 case SRR1_MC_IFETCH_SLBPAR:
109 case SRR1_MC_IFETCH_SLBMULTI:
110 case SRR1_MC_IFETCH_SLBPARMULTI:
111 reload_slb(vcpu);
112 break;
113 case SRR1_MC_IFETCH_TLBMULTI:
114 flush_tlb_power7(vcpu);
115 break;
116 default:
117 handled = 0;
118 }
119
120 /*
121 * See if OPAL has already handled the condition.
122 * We assume that if the condition is recovered then OPAL
123 * will have generated an error log event that we will pick
124 * up and log later.
125 */
126 opal_evt = local_paca->opal_mc_evt;
127 if (opal_evt->version == OpalMCE_V1 &&
128 (opal_evt->severity == OpalMCE_SEV_NO_ERROR ||
129 opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED))
130 handled = 1;
131
132 if (handled)
133 opal_evt->in_use = 0;
134
135 return handled;
136}
137
138long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
139{
140 if (cpu_has_feature(CPU_FTR_ARCH_206))
141 return kvmppc_realmode_mc_power7(vcpu);
142
143 return 0;
144}
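The instruction-fetch cause handled in kvmppc_realmode_mc_power7() is a three-bit field pulled out of SRR1 with a shift and mask; a compact demonstration of that decode using the same constants (the SRR1 value itself is fabricated for the example):

#include <assert.h>

#define SRR1_MC_IFETCH_SH	(63 - 45)
#define SRR1_MC_IFETCH_MASK	0x7
#define SRR1_MC_IFETCH_SLBMULTI	3	/* SLB multi-hit */

static unsigned long mc_ifetch_cause(unsigned long srr1)
{
	return (srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK;
}

int main(void)
{
	/* An SRR1 with 3 in bits 43:45 (IBM numbering) decodes to SLBMULTI. */
	unsigned long srr1 = (unsigned long)SRR1_MC_IFETCH_SLBMULTI << SRR1_MC_IFETCH_SH;

	assert(mc_ifetch_cause(srr1) == SRR1_MC_IFETCH_SLBMULTI);
	return 0;
}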
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fb0e821622d4..19c93bae1aea 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
35 return __va(addr); 35 return __va(addr);
36} 36}
37 37
38/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
39static int global_invalidates(struct kvm *kvm, unsigned long flags)
40{
41 int global;
42
43 /*
44 * If there is only one vcore, and it's currently running,
45 * we can use tlbiel as long as we mark all other physical
46 * cores as potentially having stale TLB entries for this lpid.
47 * If we're not using MMU notifiers, we never take pages away
48 * from the guest, so we can use tlbiel if requested.
49 * Otherwise, don't use tlbiel.
50 */
51 if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
52 global = 0;
53 else if (kvm->arch.using_mmu_notifiers)
54 global = 1;
55 else
56 global = !(flags & H_LOCAL);
57
58 if (!global) {
59 /* any other core might now have stale TLB entries... */
60 smp_wmb();
61 cpumask_setall(&kvm->arch.need_tlb_flush);
62 cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
63 &kvm->arch.need_tlb_flush);
64 }
65
66 return global;
67}
68
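need_tlb_flush implements a lazy-invalidation handshake: when a local tlbiel is used, every other physical core is marked as possibly holding stale translations, and each core is then expected to clear its own bit and flush before it next runs this guest (the consuming side lives in the guest-entry path, not in this hunk). A reduced sketch of that handshake with a plain atomic bitmask standing in for the cpumask; all names are invented and flush_local_tlb() is a stub:

#include <stdatomic.h>
#include <stdio.h>

#define NR_CORES 8

static atomic_uint need_flush = 0;   /* bit n set => core n may have stale entries */

/* Writer side: after a local-only invalidate, mark all other cores stale. */
static void note_local_invalidate(unsigned int my_core)
{
	atomic_fetch_or(&need_flush, ((1u << NR_CORES) - 1) & ~(1u << my_core));
}

static void flush_local_tlb(void) { puts("flushing"); }   /* stub */

/* Reader side: before entering the guest, flush if our bit is set. */
static void maybe_flush(unsigned int my_core)
{
	if (atomic_fetch_and(&need_flush, ~(1u << my_core)) & (1u << my_core))
		flush_local_tlb();
}

int main(void)
{
	note_local_invalidate(0);   /* core 0 used tlbiel */
	maybe_flush(3);             /* core 3 flushes before running the guest */
	maybe_flush(3);             /* bit already clear: no flush */
	return 0;
}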
38/* 69/*
39 * Add this HPTE into the chain for the real page. 70 * Add this HPTE into the chain for the real page.
40 * Must be called with the chain locked; it unlocks the chain. 71 * Must be called with the chain locked; it unlocks the chain.
@@ -59,13 +90,24 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
59 head->back = pte_index; 90 head->back = pte_index;
60 } else { 91 } else {
61 rev->forw = rev->back = pte_index; 92 rev->forw = rev->back = pte_index;
62 i = pte_index; 93 *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
94 pte_index | KVMPPC_RMAP_PRESENT;
63 } 95 }
64 smp_wmb(); 96 unlock_rmap(rmap);
65 *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
66} 97}
67EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); 98EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
68 99
100/*
101 * Note modification of an HPTE; set the HPTE modified bit
102 * if anyone is interested.
103 */
104static inline void note_hpte_modification(struct kvm *kvm,
105 struct revmap_entry *rev)
106{
107 if (atomic_read(&kvm->arch.hpte_mod_interest))
108 rev->guest_rpte |= HPTE_GR_MODIFIED;
109}
110
69/* Remove this HPTE from the chain for a real page */ 111/* Remove this HPTE from the chain for a real page */
70static void remove_revmap_chain(struct kvm *kvm, long pte_index, 112static void remove_revmap_chain(struct kvm *kvm, long pte_index,
71 struct revmap_entry *rev, 113 struct revmap_entry *rev,
@@ -81,7 +123,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
81 ptel = rev->guest_rpte |= rcbits; 123 ptel = rev->guest_rpte |= rcbits;
82 gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); 124 gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
83 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); 125 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
84 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 126 if (!memslot)
85 return; 127 return;
86 128
87 rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); 129 rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
@@ -103,14 +145,14 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
103 unlock_rmap(rmap); 145 unlock_rmap(rmap);
104} 146}
105 147
106static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, 148static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
107 int writing, unsigned long *pte_sizep) 149 int writing, unsigned long *pte_sizep)
108{ 150{
109 pte_t *ptep; 151 pte_t *ptep;
110 unsigned long ps = *pte_sizep; 152 unsigned long ps = *pte_sizep;
111 unsigned int shift; 153 unsigned int shift;
112 154
113 ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); 155 ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
114 if (!ptep) 156 if (!ptep)
115 return __pte(0); 157 return __pte(0);
116 if (shift) 158 if (shift)
@@ -130,15 +172,15 @@ static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
130 hpte[0] = hpte_v; 172 hpte[0] = hpte_v;
131} 173}
132 174
133long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 175long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
134 long pte_index, unsigned long pteh, unsigned long ptel) 176 long pte_index, unsigned long pteh, unsigned long ptel,
177 pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
135{ 178{
136 struct kvm *kvm = vcpu->kvm;
137 unsigned long i, pa, gpa, gfn, psize; 179 unsigned long i, pa, gpa, gfn, psize;
138 unsigned long slot_fn, hva; 180 unsigned long slot_fn, hva;
139 unsigned long *hpte; 181 unsigned long *hpte;
140 struct revmap_entry *rev; 182 struct revmap_entry *rev;
141 unsigned long g_ptel = ptel; 183 unsigned long g_ptel;
142 struct kvm_memory_slot *memslot; 184 struct kvm_memory_slot *memslot;
143 unsigned long *physp, pte_size; 185 unsigned long *physp, pte_size;
144 unsigned long is_io; 186 unsigned long is_io;
@@ -147,13 +189,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
147 unsigned int writing; 189 unsigned int writing;
148 unsigned long mmu_seq; 190 unsigned long mmu_seq;
149 unsigned long rcbits; 191 unsigned long rcbits;
150 bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING;
151 192
152 psize = hpte_page_size(pteh, ptel); 193 psize = hpte_page_size(pteh, ptel);
153 if (!psize) 194 if (!psize)
154 return H_PARAMETER; 195 return H_PARAMETER;
155 writing = hpte_is_writable(ptel); 196 writing = hpte_is_writable(ptel);
156 pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); 197 pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
198 ptel &= ~HPTE_GR_RESERVED;
199 g_ptel = ptel;
157 200
158 /* used later to detect if we might have been invalidated */ 201 /* used later to detect if we might have been invalidated */
159 mmu_seq = kvm->mmu_notifier_seq; 202 mmu_seq = kvm->mmu_notifier_seq;
@@ -183,7 +226,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
183 rmap = &memslot->arch.rmap[slot_fn]; 226 rmap = &memslot->arch.rmap[slot_fn];
184 227
185 if (!kvm->arch.using_mmu_notifiers) { 228 if (!kvm->arch.using_mmu_notifiers) {
186 physp = kvm->arch.slot_phys[memslot->id]; 229 physp = memslot->arch.slot_phys;
187 if (!physp) 230 if (!physp)
188 return H_PARAMETER; 231 return H_PARAMETER;
189 physp += slot_fn; 232 physp += slot_fn;
@@ -201,7 +244,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
201 244
202 /* Look up the Linux PTE for the backing page */ 245 /* Look up the Linux PTE for the backing page */
203 pte_size = psize; 246 pte_size = psize;
204 pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); 247 pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
205 if (pte_present(pte)) { 248 if (pte_present(pte)) {
206 if (writing && !pte_write(pte)) 249 if (writing && !pte_write(pte))
207 /* make the actual HPTE be read-only */ 250 /* make the actual HPTE be read-only */
@@ -210,6 +253,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
210 pa = pte_pfn(pte) << PAGE_SHIFT; 253 pa = pte_pfn(pte) << PAGE_SHIFT;
211 } 254 }
212 } 255 }
256
213 if (pte_size < psize) 257 if (pte_size < psize)
214 return H_PARAMETER; 258 return H_PARAMETER;
215 if (pa && pte_size > psize) 259 if (pa && pte_size > psize)
@@ -287,8 +331,10 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
287 rev = &kvm->arch.revmap[pte_index]; 331 rev = &kvm->arch.revmap[pte_index];
288 if (realmode) 332 if (realmode)
289 rev = real_vmalloc_addr(rev); 333 rev = real_vmalloc_addr(rev);
290 if (rev) 334 if (rev) {
291 rev->guest_rpte = g_ptel; 335 rev->guest_rpte = g_ptel;
336 note_hpte_modification(kvm, rev);
337 }
292 338
293 /* Link HPTE into reverse-map chain */ 339 /* Link HPTE into reverse-map chain */
294 if (pteh & HPTE_V_VALID) { 340 if (pteh & HPTE_V_VALID) {
@@ -297,7 +343,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
297 lock_rmap(rmap); 343 lock_rmap(rmap);
298 /* Check for pending invalidations under the rmap chain lock */ 344 /* Check for pending invalidations under the rmap chain lock */
299 if (kvm->arch.using_mmu_notifiers && 345 if (kvm->arch.using_mmu_notifiers &&
300 mmu_notifier_retry(vcpu, mmu_seq)) { 346 mmu_notifier_retry(kvm, mmu_seq)) {
301 /* inval in progress, write a non-present HPTE */ 347 /* inval in progress, write a non-present HPTE */
302 pteh |= HPTE_V_ABSENT; 348 pteh |= HPTE_V_ABSENT;
303 pteh &= ~HPTE_V_VALID; 349 pteh &= ~HPTE_V_VALID;
@@ -318,10 +364,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
318 hpte[0] = pteh; 364 hpte[0] = pteh;
319 asm volatile("ptesync" : : : "memory"); 365 asm volatile("ptesync" : : : "memory");
320 366
321 vcpu->arch.gpr[4] = pte_index; 367 *pte_idx_ret = pte_index;
322 return H_SUCCESS; 368 return H_SUCCESS;
323} 369}
324EXPORT_SYMBOL_GPL(kvmppc_h_enter); 370EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);
371
372long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
373 long pte_index, unsigned long pteh, unsigned long ptel)
374{
375 return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
376 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
377}
325 378
326#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) 379#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
327 380
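The hunk above shows the pattern used throughout this file: the real-mode hcall handler is split into a core routine that takes the struct kvm, the guest page-table pointer, a realmode flag and a result slot explicitly, plus a thin vcpu-facing wrapper that routes the result into GPR4. A minimal sketch of why that shape is useful, assuming (this is not shown in the diff) that the point of the split is to let virtual-mode callers such as HPT save/restore code reuse the core routine; "load_saved_hpte" is a hypothetical caller:

/*
 * Illustrative sketch only -- the real prototypes are the ones in the hunk
 * above; load_saved_hpte() is a made-up virtual-mode caller.
 */
long load_saved_hpte(struct kvm *kvm, long pte_index,
		     unsigned long pteh, unsigned long ptel)
{
	unsigned long idx_ret;

	/* no vcpu here: pass the host mm's pgd and realmode = false */
	return kvmppc_do_h_enter(kvm, H_EXACT, pte_index, pteh, ptel,
				 kvm->mm->pgd, false, &idx_ret);
}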
@@ -343,11 +396,10 @@ static inline int try_lock_tlbie(unsigned int *lock)
343 return old == 0; 396 return old == 0;
344} 397}
345 398
346long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, 399long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
347 unsigned long pte_index, unsigned long avpn, 400 unsigned long pte_index, unsigned long avpn,
348 unsigned long va) 401 unsigned long *hpret)
349{ 402{
350 struct kvm *kvm = vcpu->kvm;
351 unsigned long *hpte; 403 unsigned long *hpte;
352 unsigned long v, r, rb; 404 unsigned long v, r, rb;
353 struct revmap_entry *rev; 405 struct revmap_entry *rev;
@@ -369,7 +421,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
369 if (v & HPTE_V_VALID) { 421 if (v & HPTE_V_VALID) {
370 hpte[0] &= ~HPTE_V_VALID; 422 hpte[0] &= ~HPTE_V_VALID;
371 rb = compute_tlbie_rb(v, hpte[1], pte_index); 423 rb = compute_tlbie_rb(v, hpte[1], pte_index);
372 if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { 424 if (global_invalidates(kvm, flags)) {
373 while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) 425 while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
374 cpu_relax(); 426 cpu_relax();
375 asm volatile("ptesync" : : : "memory"); 427 asm volatile("ptesync" : : : "memory");
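global_invalidates() itself is not part of this hunk; going by the inline test it replaces, the decision it encapsulates is roughly the one below: whether a broadcast tlbie is needed or a local tlbiel suffices. This is a sketch of that decision only, not the helper's actual body, which may consult additional state such as a per-core flush mask:

/* Sketch of the decision being factored out (illustrative, not the kernel's
 * implementation).  H_LOCAL means the guest promises the translation was
 * only used on the calling CPU. */
static inline int global_invalidates_sketch(struct kvm *kvm, unsigned long flags)
{
	if (flags & H_LOCAL)
		return 0;
	/* conservative: broadcast whenever more than one vcpu exists */
	return atomic_read(&kvm->online_vcpus) > 1;
}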
@@ -385,13 +437,22 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
385 /* Read PTE low word after tlbie to get final R/C values */ 437 /* Read PTE low word after tlbie to get final R/C values */
386 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); 438 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
387 } 439 }
388 r = rev->guest_rpte; 440 r = rev->guest_rpte & ~HPTE_GR_RESERVED;
441 note_hpte_modification(kvm, rev);
389 unlock_hpte(hpte, 0); 442 unlock_hpte(hpte, 0);
390 443
391 vcpu->arch.gpr[4] = v; 444 hpret[0] = v;
392 vcpu->arch.gpr[5] = r; 445 hpret[1] = r;
393 return H_SUCCESS; 446 return H_SUCCESS;
394} 447}
448EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
449
450long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
451 unsigned long pte_index, unsigned long avpn)
452{
453 return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
454 &vcpu->arch.gpr[4]);
455}
395 456
396long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) 457long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
397{ 458{
@@ -459,6 +520,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
459 520
460 args[j] = ((0x80 | flags) << 56) + pte_index; 521 args[j] = ((0x80 | flags) << 56) + pte_index;
461 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 522 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
523 note_hpte_modification(kvm, rev);
462 524
463 if (!(hp[0] & HPTE_V_VALID)) { 525 if (!(hp[0] & HPTE_V_VALID)) {
464 /* insert R and C bits from PTE */ 526 /* insert R and C bits from PTE */
@@ -534,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
534 return H_NOT_FOUND; 596 return H_NOT_FOUND;
535 } 597 }
536 598
537 if (atomic_read(&kvm->online_vcpus) == 1)
538 flags |= H_LOCAL;
539 v = hpte[0]; 599 v = hpte[0];
540 bits = (flags << 55) & HPTE_R_PP0; 600 bits = (flags << 55) & HPTE_R_PP0;
541 bits |= (flags << 48) & HPTE_R_KEY_HI; 601 bits |= (flags << 48) & HPTE_R_KEY_HI;
@@ -548,6 +608,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
548 if (rev) { 608 if (rev) {
549 r = (rev->guest_rpte & ~mask) | bits; 609 r = (rev->guest_rpte & ~mask) | bits;
550 rev->guest_rpte = r; 610 rev->guest_rpte = r;
611 note_hpte_modification(kvm, rev);
551 } 612 }
552 r = (hpte[1] & ~mask) | bits; 613 r = (hpte[1] & ~mask) | bits;
553 614
@@ -555,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
555 if (v & HPTE_V_VALID) { 616 if (v & HPTE_V_VALID) {
556 rb = compute_tlbie_rb(v, r, pte_index); 617 rb = compute_tlbie_rb(v, r, pte_index);
557 hpte[0] = v & ~HPTE_V_VALID; 618 hpte[0] = v & ~HPTE_V_VALID;
558 if (!(flags & H_LOCAL)) { 619 if (global_invalidates(kvm, flags)) {
559 while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) 620 while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
560 cpu_relax(); 621 cpu_relax();
561 asm volatile("ptesync" : : : "memory"); 622 asm volatile("ptesync" : : : "memory");
@@ -568,6 +629,28 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
568 asm volatile("tlbiel %0" : : "r" (rb)); 629 asm volatile("tlbiel %0" : : "r" (rb));
569 asm volatile("ptesync" : : : "memory"); 630 asm volatile("ptesync" : : : "memory");
570 } 631 }
632 /*
633 * If the host has this page as readonly but the guest
634 * wants to make it read/write, reduce the permissions.
635 * Checking the host permissions involves finding the
636 * memslot and then the Linux PTE for the page.
637 */
638 if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
639 unsigned long psize, gfn, hva;
640 struct kvm_memory_slot *memslot;
641 pgd_t *pgdir = vcpu->arch.pgdir;
642 pte_t pte;
643
644 psize = hpte_page_size(v, r);
645 gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
646 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
647 if (memslot) {
648 hva = __gfn_to_hva_memslot(memslot, gfn);
649 pte = lookup_linux_pte(pgdir, hva, 1, &psize);
650 if (pte_present(pte) && !pte_write(pte))
651 r = hpte_make_readonly(r);
652 }
653 }
571 } 654 }
572 hpte[1] = r; 655 hpte[1] = r;
573 eieio(); 656 eieio();
@@ -599,8 +682,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
599 v &= ~HPTE_V_ABSENT; 682 v &= ~HPTE_V_ABSENT;
600 v |= HPTE_V_VALID; 683 v |= HPTE_V_VALID;
601 } 684 }
602 if (v & HPTE_V_VALID) 685 if (v & HPTE_V_VALID) {
603 r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); 686 r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
687 r &= ~HPTE_GR_RESERVED;
688 }
604 vcpu->arch.gpr[4 + i * 2] = v; 689 vcpu->arch.gpr[4 + i * 2] = v;
605 vcpu->arch.gpr[5 + i * 2] = r; 690 vcpu->arch.gpr[5 + i * 2] = r;
606 } 691 }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 74a24bbb9637..10b6c358dd77 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -27,6 +27,7 @@
27#include <asm/asm-offsets.h> 27#include <asm/asm-offsets.h>
28#include <asm/exception-64s.h> 28#include <asm/exception-64s.h>
29#include <asm/kvm_book3s_asm.h> 29#include <asm/kvm_book3s_asm.h>
30#include <asm/mmu-hash64.h>
30 31
31/***************************************************************************** 32/*****************************************************************************
32 * * 33 * *
@@ -134,8 +135,11 @@ kvm_start_guest:
134 135
135 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ 136
136 137
138 /* reload vcpu pointer after clearing the IPI */
139 ld r4,HSTATE_KVM_VCPU(r13)
140 cmpdi r4,0
137 /* if we have no vcpu to run, go back to sleep */ 141 /* if we have no vcpu to run, go back to sleep */
138 beq cr1,kvm_no_guest 142 beq kvm_no_guest
139 143
140 /* were we napping due to cede? */ 144 /* were we napping due to cede? */
141 lbz r0,HSTATE_NAPPING(r13) 145 lbz r0,HSTATE_NAPPING(r13)
@@ -310,7 +314,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
310 mtspr SPRN_SDR1,r6 /* switch to partition page table */ 314 mtspr SPRN_SDR1,r6 /* switch to partition page table */
311 mtspr SPRN_LPID,r7 315 mtspr SPRN_LPID,r7
312 isync 316 isync
317
318 /* See if we need to flush the TLB */
319 lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */
320 clrldi r7,r6,64-6 /* extract bit number (6 bits) */
321 srdi r6,r6,6 /* doubleword number */
322 sldi r6,r6,3 /* address offset */
323 add r6,r6,r9
324 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */
313 li r0,1 325 li r0,1
326 sld r0,r0,r7
327 ld r7,0(r6)
328 and. r7,r7,r0
329 beq 22f
330 23: ldarx r7,0,r6 /* if set, clear the bit */
331 andc r7,r7,r0
332 stdcx. r7,0,r6
333 bne 23b
334 li r6,128 /* and flush the TLB */
335 mtctr r6
336 li r7,0x800 /* IS field = 0b10 */
337 ptesync
338 28: tlbiel r7
339 addi r7,r7,0x1000
340 bdnz 28b
341 ptesync
342
343 22: li r0,1
314 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ 344 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */
315 b 10f 345 b 10f
316 346
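The block of assembly added above is easier to follow as C: it tests and clears this CPU's bit in a bitmap at kvm->arch + KVM_NEED_FLUSH (presumably a cpumask named need_tlb_flush) and, if the bit was set, flushes the whole partition-scoped TLB. A rough restatement, for illustration only:

/* Rough C equivalent of the real-mode sequence above (illustrative only;
 * pcpu is this hardware thread's PACA index). */
static void flush_tlb_if_needed(unsigned long *need_tlb_flush, int pcpu)
{
	int i;

	/* the lhz/clrldi/srdi arithmetic locates the dword and bit; the
	 * ldarx/andc/stdcx. loop clears the bit atomically */
	if (!test_and_clear_bit(pcpu, need_tlb_flush))
		return;

	/* 128 tlbiel operations, one per congruence class, IS field = 0b10 */
	asm volatile("ptesync" : : : "memory");
	for (i = 0; i < 128; i++)
		asm volatile("tlbiel %0" : : "r" (0x800 + (i << 12)));
	asm volatile("ptesync" : : : "memory");
}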
@@ -333,36 +363,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
333 mr r9,r4 363 mr r9,r4
334 blt hdec_soon 364 blt hdec_soon
335 365
336 /*
337 * Invalidate the TLB if we could possibly have stale TLB
338 * entries for this partition on this core due to the use
339 * of tlbiel.
340 * XXX maybe only need this on primary thread?
341 */
342 ld r9,VCPU_KVM(r4) /* pointer to struct kvm */
343 lwz r5,VCPU_VCPUID(r4)
344 lhz r6,PACAPACAINDEX(r13)
345 rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */
346 lhz r8,VCPU_LAST_CPU(r4)
347 sldi r7,r6,1 /* see if this is the same vcpu */
348 add r7,r7,r9 /* as last ran on this pcpu */
349 lhz r0,KVM_LAST_VCPU(r7)
350 cmpw r6,r8 /* on the same cpu core as last time? */
351 bne 3f
352 cmpw r0,r5 /* same vcpu as this core last ran? */
353 beq 1f
354 3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */
355 sth r5,KVM_LAST_VCPU(r7)
356 li r6,128
357 mtctr r6
358 li r7,0x800 /* IS field = 0b10 */
359 ptesync
360 2: tlbiel r7
361 addi r7,r7,0x1000
362 bdnz 2b
363 ptesync
364 1:
365
366 /* Save purr/spurr */ 366 /* Save purr/spurr */
367 mfspr r5,SPRN_PURR 367 mfspr r5,SPRN_PURR
368 mfspr r6,SPRN_SPURR 368 mfspr r6,SPRN_SPURR
@@ -679,8 +679,7 @@ BEGIN_FTR_SECTION
679 1: 679 1:
680END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 680END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
681 681
682nohpte_cont: 682guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
683hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
684 /* Save DEC */ 683 /* Save DEC */
685 mfspr r5,SPRN_DEC 684 mfspr r5,SPRN_DEC
686 mftb r6 685 mftb r6
@@ -701,6 +700,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
701 std r6, VCPU_FAULT_DAR(r9) 700 std r6, VCPU_FAULT_DAR(r9)
702 stw r7, VCPU_FAULT_DSISR(r9) 701 stw r7, VCPU_FAULT_DSISR(r9)
703 702
703 /* See if it is a machine check */
704 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
705 beq machine_check_realmode
706mc_cont:
707
704 /* Save guest CTRL register, set runlatch to 1 */ 708 /* Save guest CTRL register, set runlatch to 1 */
705 6: mfspr r6,SPRN_CTRLF 709 6: mfspr r6,SPRN_CTRLF
706 stw r6,VCPU_CTRL(r9) 710 stw r6,VCPU_CTRL(r9)
@@ -1113,38 +1117,41 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
1113 /* 1117 /*
1114 * For external and machine check interrupts, we need 1118 * For external and machine check interrupts, we need
1115 * to call the Linux handler to process the interrupt. 1119 * to call the Linux handler to process the interrupt.
1116 * We do that by jumping to the interrupt vector address 1120 * We do that by jumping to absolute address 0x500 for
1117 * which we have in r12. The [h]rfid at the end of the 1121 * external interrupts, or the machine_check_fwnmi label
1122 * for machine checks (since firmware might have patched
1123 * the vector area at 0x200). The [h]rfid at the end of the
1118 * handler will return to the book3s_hv_interrupts.S code. 1124 * handler will return to the book3s_hv_interrupts.S code.
1119 * For other interrupts we do the rfid to get back 1125 * For other interrupts we do the rfid to get back
1120 * to the book3s_interrupts.S code here. 1126 * to the book3s_hv_interrupts.S code here.
1121 */ 1127 */
1122 ld r8, HSTATE_VMHANDLER(r13) 1128 ld r8, HSTATE_VMHANDLER(r13)
1123 ld r7, HSTATE_HOST_MSR(r13) 1129 ld r7, HSTATE_HOST_MSR(r13)
1124 1130
1131 cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
1125 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1132 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
1133BEGIN_FTR_SECTION
1126 beq 11f 1134 beq 11f
1127 cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1135END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
1128 1136
1129 /* RFI into the highmem handler, or branch to interrupt handler */ 1137 /* RFI into the highmem handler, or branch to interrupt handler */
1130 12: mfmsr r6 1138 mfmsr r6
1131 mtctr r12
1132 li r0, MSR_RI 1139 li r0, MSR_RI
1133 andc r6, r6, r0 1140 andc r6, r6, r0
1134 mtmsrd r6, 1 /* Clear RI in MSR */ 1141 mtmsrd r6, 1 /* Clear RI in MSR */
1135 mtsrr0 r8 1142 mtsrr0 r8
1136 mtsrr1 r7 1143 mtsrr1 r7
1137 beqctr 1144 beqa 0x500 /* external interrupt (PPC970) */
1145 beq cr1, 13f /* machine check */
1138 RFI 1146 RFI
1139 1147
1140 11: 1148 /* On POWER7, we have external interrupts set to use HSRR0/1 */
1141BEGIN_FTR_SECTION 1149 11: mtspr SPRN_HSRR0, r8
1142 b 12b
1143END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
1144 mtspr SPRN_HSRR0, r8
1145 mtspr SPRN_HSRR1, r7 1150 mtspr SPRN_HSRR1, r7
1146 ba 0x500 1151 ba 0x500
1147 1152
1153 13: b machine_check_fwnmi
1154
1148/* 1155/*
1149 * Check whether an HDSI is an HPTE not found fault or something else. 1156 * Check whether an HDSI is an HPTE not found fault or something else.
1150 * If it is an HPTE not found fault that is due to the guest accessing 1157 * If it is an HPTE not found fault that is due to the guest accessing
@@ -1177,7 +1184,7 @@ kvmppc_hdsi:
1177 cmpdi r3, 0 /* retry the instruction */ 1184 cmpdi r3, 0 /* retry the instruction */
1178 beq 6f 1185 beq 6f
1179 cmpdi r3, -1 /* handle in kernel mode */ 1186 cmpdi r3, -1 /* handle in kernel mode */
1180 beq nohpte_cont 1187 beq guest_exit_cont
1181 cmpdi r3, -2 /* MMIO emulation; need instr word */ 1188 cmpdi r3, -2 /* MMIO emulation; need instr word */
1182 beq 2f 1189 beq 2f
1183 1190
@@ -1191,6 +1198,7 @@ kvmppc_hdsi:
1191 li r10, BOOK3S_INTERRUPT_DATA_STORAGE 1198 li r10, BOOK3S_INTERRUPT_DATA_STORAGE
1192 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1199 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1193 rotldi r11, r11, 63 1200 rotldi r11, r11, 63
1201fast_interrupt_c_return:
1194 6: ld r7, VCPU_CTR(r9) 1202 6: ld r7, VCPU_CTR(r9)
1195 lwz r8, VCPU_XER(r9) 1203 lwz r8, VCPU_XER(r9)
1196 mtctr r7 1204 mtctr r7
@@ -1223,7 +1231,7 @@ kvmppc_hdsi:
1223 /* Unset guest mode. */ 1231 /* Unset guest mode. */
1224 li r0, KVM_GUEST_MODE_NONE 1232 li r0, KVM_GUEST_MODE_NONE
1225 stb r0, HSTATE_IN_GUEST(r13) 1233 stb r0, HSTATE_IN_GUEST(r13)
1226 b nohpte_cont 1234 b guest_exit_cont
1227 1235
1228/* 1236/*
1229 * Similarly for an HISI, reflect it to the guest as an ISI unless 1237 * Similarly for an HISI, reflect it to the guest as an ISI unless
@@ -1249,9 +1257,9 @@ kvmppc_hisi:
1249 ld r11, VCPU_MSR(r9) 1257 ld r11, VCPU_MSR(r9)
1250 li r12, BOOK3S_INTERRUPT_H_INST_STORAGE 1258 li r12, BOOK3S_INTERRUPT_H_INST_STORAGE
1251 cmpdi r3, 0 /* retry the instruction */ 1259 cmpdi r3, 0 /* retry the instruction */
1252 beq 6f 1260 beq fast_interrupt_c_return
1253 cmpdi r3, -1 /* handle in kernel mode */ 1261 cmpdi r3, -1 /* handle in kernel mode */
1254 beq nohpte_cont 1262 beq guest_exit_cont
1255 1263
1256 /* Synthesize an ISI for the guest */ 1264 /* Synthesize an ISI for the guest */
1257 mr r11, r3 1265 mr r11, r3
@@ -1260,12 +1268,7 @@ kvmppc_hisi:
1260 li r10, BOOK3S_INTERRUPT_INST_STORAGE 1268 li r10, BOOK3S_INTERRUPT_INST_STORAGE
1261 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1269 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1262 rotldi r11, r11, 63 1270 rotldi r11, r11, 63
1263 6: ld r7, VCPU_CTR(r9) 1271 b fast_interrupt_c_return
1264 lwz r8, VCPU_XER(r9)
1265 mtctr r7
1266 mtxer r8
1267 mr r4, r9
1268 b fast_guest_return
1269 1272
12703: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ 12733: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */
1271 ld r5, KVM_VRMA_SLB_V(r6) 1274 ld r5, KVM_VRMA_SLB_V(r6)
@@ -1281,14 +1284,14 @@ kvmppc_hisi:
1281hcall_try_real_mode: 1284hcall_try_real_mode:
1282 ld r3,VCPU_GPR(R3)(r9) 1285 ld r3,VCPU_GPR(R3)(r9)
1283 andi. r0,r11,MSR_PR 1286 andi. r0,r11,MSR_PR
1284 bne hcall_real_cont 1287 bne guest_exit_cont
1285 clrrdi r3,r3,2 1288 clrrdi r3,r3,2
1286 cmpldi r3,hcall_real_table_end - hcall_real_table 1289 cmpldi r3,hcall_real_table_end - hcall_real_table
1287 bge hcall_real_cont 1290 bge guest_exit_cont
1288 LOAD_REG_ADDR(r4, hcall_real_table) 1291 LOAD_REG_ADDR(r4, hcall_real_table)
1289 lwzx r3,r3,r4 1292 lwzx r3,r3,r4
1290 cmpwi r3,0 1293 cmpwi r3,0
1291 beq hcall_real_cont 1294 beq guest_exit_cont
1292 add r3,r3,r4 1295 add r3,r3,r4
1293 mtctr r3 1296 mtctr r3
1294 mr r3,r9 /* get vcpu pointer */ 1297 mr r3,r9 /* get vcpu pointer */
@@ -1309,7 +1312,7 @@ hcall_real_fallback:
1309 li r12,BOOK3S_INTERRUPT_SYSCALL 1312 li r12,BOOK3S_INTERRUPT_SYSCALL
1310 ld r9, HSTATE_KVM_VCPU(r13) 1313 ld r9, HSTATE_KVM_VCPU(r13)
1311 1314
1312 b hcall_real_cont 1315 b guest_exit_cont
1313 1316
1314 .globl hcall_real_table 1317 .globl hcall_real_table
1315hcall_real_table: 1318hcall_real_table:
@@ -1568,6 +1571,21 @@ kvm_cede_exit:
1568 li r3,H_TOO_HARD 1571 li r3,H_TOO_HARD
1569 blr 1572 blr
1570 1573
1574 /* Try to handle a machine check in real mode */
1575machine_check_realmode:
1576 mr r3, r9 /* get vcpu pointer */
1577 bl .kvmppc_realmode_machine_check
1578 nop
1579 cmpdi r3, 0 /* continue exiting from guest? */
1580 ld r9, HSTATE_KVM_VCPU(r13)
1581 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
1582 beq mc_cont
1583 /* If not, deliver a machine check. SRR0/1 are already set */
1584 li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
1585 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1586 rotldi r11, r11, 63
1587 b fast_interrupt_c_return
1588
1571secondary_too_late: 1589secondary_too_late:
1572 ld r5,HSTATE_KVM_VCORE(r13) 1590 ld r5,HSTATE_KVM_VCORE(r13)
1573 HMT_LOW 1591 HMT_LOW
@@ -1587,6 +1605,10 @@ secondary_too_late:
1587 .endr 1605 .endr
1588 1606
1589secondary_nap: 1607secondary_nap:
1608 /* Clear our vcpu pointer so we don't come back in early */
1609 li r0, 0
1610 std r0, HSTATE_KVM_VCPU(r13)
1611 lwsync
1590 /* Clear any pending IPI - assume we're a secondary thread */ 1612 /* Clear any pending IPI - assume we're a secondary thread */
1591 ld r5, HSTATE_XICS_PHYS(r13) 1613 ld r5, HSTATE_XICS_PHYS(r13)
1592 li r7, XICS_XIRR 1614 li r7, XICS_XIRR
@@ -1612,8 +1634,6 @@ secondary_nap:
1612kvm_no_guest: 1634kvm_no_guest:
1613 li r0, KVM_HWTHREAD_IN_NAP 1635 li r0, KVM_HWTHREAD_IN_NAP
1614 stb r0, HSTATE_HWTHREAD_STATE(r13) 1636 stb r0, HSTATE_HWTHREAD_STATE(r13)
1615 li r0, 0
1616 std r0, HSTATE_KVM_VCPU(r13)
1617 1637
1618 li r3, LPCR_PECE0 1638 li r3, LPCR_PECE0
1619 mfspr r4, SPRN_LPCR 1639 mfspr r4, SPRN_LPCR
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 41cb0017e757..2c86b0d63714 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -114,11 +114,6 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
114 hlist_del_init_rcu(&pte->list_vpte); 114 hlist_del_init_rcu(&pte->list_vpte);
115 hlist_del_init_rcu(&pte->list_vpte_long); 115 hlist_del_init_rcu(&pte->list_vpte_long);
116 116
117 if (pte->pte.may_write)
118 kvm_release_pfn_dirty(pte->pfn);
119 else
120 kvm_release_pfn_clean(pte->pfn);
121
122 spin_unlock(&vcpu3s->mmu_lock); 117 spin_unlock(&vcpu3s->mmu_lock);
123 118
124 vcpu3s->hpte_cache_count--; 119 vcpu3s->hpte_cache_count--;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 05c28f59f77f..28d38adeca73 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -52,8 +52,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
52#define MSR_USER32 MSR_USER 52#define MSR_USER32 MSR_USER
53#define MSR_USER64 MSR_USER 53#define MSR_USER64 MSR_USER
54#define HW_PAGE_SIZE PAGE_SIZE 54#define HW_PAGE_SIZE PAGE_SIZE
55#define __hard_irq_disable local_irq_disable
56#define __hard_irq_enable local_irq_enable
57#endif 55#endif
58 56
59void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 57void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -66,7 +64,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
66 svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; 64 svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
67 svcpu_put(svcpu); 65 svcpu_put(svcpu);
68#endif 66#endif
69 67 vcpu->cpu = smp_processor_id();
70#ifdef CONFIG_PPC_BOOK3S_32 68#ifdef CONFIG_PPC_BOOK3S_32
71 current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; 69 current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
72#endif 70#endif
@@ -83,17 +81,71 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
83 svcpu_put(svcpu); 81 svcpu_put(svcpu);
84#endif 82#endif
85 83
86 kvmppc_giveup_ext(vcpu, MSR_FP); 84 kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
87 kvmppc_giveup_ext(vcpu, MSR_VEC); 85 vcpu->cpu = -1;
88 kvmppc_giveup_ext(vcpu, MSR_VSX); 86}
87
88int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
89{
90 int r = 1; /* Indicate we want to get back into the guest */
91
92 /* We misuse TLB_FLUSH to indicate that we want to clear
93 all shadow cache entries */
94 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
95 kvmppc_mmu_pte_flush(vcpu, 0, 0);
96
97 return r;
98}
99
100/************* MMU Notifiers *************/
101
102int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
103{
104 trace_kvm_unmap_hva(hva);
105
106 /*
107 * Flush all shadow TLB entries everywhere. This is slow, but
108 * it guarantees that we catch the page being unmapped.
109 */
110 kvm_flush_remote_tlbs(kvm);
111
112 return 0;
113}
114
115int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
116{
117 /* kvm_unmap_hva flushes everything anyway */
118 kvm_unmap_hva(kvm, start);
119
120 return 0;
121}
122
123int kvm_age_hva(struct kvm *kvm, unsigned long hva)
124{
125 /* XXX could be more clever ;) */
126 return 0;
127}
128
129int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
130{
131 /* XXX could be more clever ;) */
132 return 0;
89} 133}
90 134
135void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
136{
137 /* The page will get remapped properly on its next fault */
138 kvm_unmap_hva(kvm, hva);
139}
140
141/*****************************************/
142
91static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) 143static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
92{ 144{
93 ulong smsr = vcpu->arch.shared->msr; 145 ulong smsr = vcpu->arch.shared->msr;
94 146
95 /* Guest MSR values */ 147 /* Guest MSR values */
96 smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; 148 smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
97 /* Process MSR values */ 149 /* Process MSR values */
98 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; 150 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
99 /* External providers the guest reserved */ 151 /* External providers the guest reserved */
@@ -379,10 +431,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
379 431
380static inline int get_fpr_index(int i) 432static inline int get_fpr_index(int i)
381{ 433{
382#ifdef CONFIG_VSX 434 return i * TS_FPRWIDTH;
383 i *= 2;
384#endif
385 return i;
386} 435}
387 436
388/* Give up external provider (FPU, Altivec, VSX) */ 437/* Give up external provider (FPU, Altivec, VSX) */
@@ -396,41 +445,49 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
396 u64 *thread_fpr = (u64*)t->fpr; 445 u64 *thread_fpr = (u64*)t->fpr;
397 int i; 446 int i;
398 447
399 if (!(vcpu->arch.guest_owned_ext & msr)) 448 /*
449 * VSX instructions can access FP and vector registers, so if
450 * we are giving up VSX, make sure we give up FP and VMX as well.
451 */
452 if (msr & MSR_VSX)
453 msr |= MSR_FP | MSR_VEC;
454
455 msr &= vcpu->arch.guest_owned_ext;
456 if (!msr)
400 return; 457 return;
401 458
402#ifdef DEBUG_EXT 459#ifdef DEBUG_EXT
403 printk(KERN_INFO "Giving up ext 0x%lx\n", msr); 460 printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
404#endif 461#endif
405 462
406 switch (msr) { 463 if (msr & MSR_FP) {
407 case MSR_FP: 464 /*
465 * Note that on CPUs with VSX, giveup_fpu stores
466 * both the traditional FP registers and the added VSX
467 * registers into thread.fpr[].
468 */
408 giveup_fpu(current); 469 giveup_fpu(current);
409 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 470 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
410 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; 471 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
411 472
412 vcpu->arch.fpscr = t->fpscr.val; 473 vcpu->arch.fpscr = t->fpscr.val;
413 break; 474
414 case MSR_VEC: 475#ifdef CONFIG_VSX
476 if (cpu_has_feature(CPU_FTR_VSX))
477 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
478 vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
479#endif
480 }
481
415#ifdef CONFIG_ALTIVEC 482#ifdef CONFIG_ALTIVEC
483 if (msr & MSR_VEC) {
416 giveup_altivec(current); 484 giveup_altivec(current);
417 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); 485 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
418 vcpu->arch.vscr = t->vscr; 486 vcpu->arch.vscr = t->vscr;
419#endif
420 break;
421 case MSR_VSX:
422#ifdef CONFIG_VSX
423 __giveup_vsx(current);
424 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
425 vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
426#endif
427 break;
428 default:
429 BUG();
430 } 487 }
488#endif
431 489
432 vcpu->arch.guest_owned_ext &= ~msr; 490 vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX);
433 current->thread.regs->msr &= ~msr;
434 kvmppc_recalc_shadow_msr(vcpu); 491 kvmppc_recalc_shadow_msr(vcpu);
435} 492}
436 493
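The get_fpr_index() change and the "+ 1" accesses above rely on how the host thread_struct lays out FP/VSX state when TS_FPRWIDTH is 2: each architected FPR occupies the first doubleword of a 128-bit slot and the low doubleword of the corresponding VSR sits right after it. A tiny sketch of that indexing (assumed layout, for illustration only):

/* Assumed layout of thread.fpr[] viewed as u64 slots on a VSX-capable
 * build (TS_FPRWIDTH == 2); illustrative only.
 *
 *   fpr_slot(i):    FPR i, which is also the high doubleword of VSR i
 *   vsr_lo_slot(i): the low doubleword of VSR i
 */
static inline int fpr_slot(int i)    { return i * 2;     }	/* get_fpr_index(i)     */
static inline int vsr_lo_slot(int i) { return i * 2 + 1; }	/* get_fpr_index(i) + 1 */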
@@ -490,47 +547,56 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
490 return RESUME_GUEST; 547 return RESUME_GUEST;
491 } 548 }
492 549
493 /* We already own the ext */ 550 if (msr == MSR_VSX) {
494 if (vcpu->arch.guest_owned_ext & msr) { 551 /* No VSX? Give an illegal instruction interrupt */
495 return RESUME_GUEST; 552#ifdef CONFIG_VSX
553 if (!cpu_has_feature(CPU_FTR_VSX))
554#endif
555 {
556 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
557 return RESUME_GUEST;
558 }
559
560 /*
561 * We have to load up all the FP and VMX registers before
562 * we can let the guest use VSX instructions.
563 */
564 msr = MSR_FP | MSR_VEC | MSR_VSX;
496 } 565 }
497 566
567 /* See if we already own all the ext(s) needed */
568 msr &= ~vcpu->arch.guest_owned_ext;
569 if (!msr)
570 return RESUME_GUEST;
571
498#ifdef DEBUG_EXT 572#ifdef DEBUG_EXT
499 printk(KERN_INFO "Loading up ext 0x%lx\n", msr); 573 printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
500#endif 574#endif
501 575
502 current->thread.regs->msr |= msr; 576 current->thread.regs->msr |= msr;
503 577
504 switch (msr) { 578 if (msr & MSR_FP) {
505 case MSR_FP:
506 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 579 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
507 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; 580 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
508 581#ifdef CONFIG_VSX
582 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
583 thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
584#endif
509 t->fpscr.val = vcpu->arch.fpscr; 585 t->fpscr.val = vcpu->arch.fpscr;
510 t->fpexc_mode = 0; 586 t->fpexc_mode = 0;
511 kvmppc_load_up_fpu(); 587 kvmppc_load_up_fpu();
512 break; 588 }
513 case MSR_VEC: 589
590 if (msr & MSR_VEC) {
514#ifdef CONFIG_ALTIVEC 591#ifdef CONFIG_ALTIVEC
515 memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); 592 memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
516 t->vscr = vcpu->arch.vscr; 593 t->vscr = vcpu->arch.vscr;
517 t->vrsave = -1; 594 t->vrsave = -1;
518 kvmppc_load_up_altivec(); 595 kvmppc_load_up_altivec();
519#endif 596#endif
520 break;
521 case MSR_VSX:
522#ifdef CONFIG_VSX
523 for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
524 thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
525 kvmppc_load_up_vsx();
526#endif
527 break;
528 default:
529 BUG();
530 } 597 }
531 598
532 vcpu->arch.guest_owned_ext |= msr; 599 vcpu->arch.guest_owned_ext |= msr;
533
534 kvmppc_recalc_shadow_msr(vcpu); 600 kvmppc_recalc_shadow_msr(vcpu);
535 601
536 return RESUME_GUEST; 602 return RESUME_GUEST;
@@ -540,18 +606,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
540 unsigned int exit_nr) 606 unsigned int exit_nr)
541{ 607{
542 int r = RESUME_HOST; 608 int r = RESUME_HOST;
609 int s;
543 610
544 vcpu->stat.sum_exits++; 611 vcpu->stat.sum_exits++;
545 612
546 run->exit_reason = KVM_EXIT_UNKNOWN; 613 run->exit_reason = KVM_EXIT_UNKNOWN;
547 run->ready_for_interrupt_injection = 1; 614 run->ready_for_interrupt_injection = 1;
548 615
549 /* We get here with MSR.EE=0, so enable it to be a nice citizen */ 616 /* We get here with MSR.EE=1 */
550 __hard_irq_enable(); 617
618 trace_kvm_exit(exit_nr, vcpu);
619 kvm_guest_exit();
551 620
552 trace_kvm_book3s_exit(exit_nr, vcpu);
553 preempt_enable();
554 kvm_resched(vcpu);
555 switch (exit_nr) { 621 switch (exit_nr) {
556 case BOOK3S_INTERRUPT_INST_STORAGE: 622 case BOOK3S_INTERRUPT_INST_STORAGE:
557 { 623 {
@@ -802,7 +868,6 @@ program_interrupt:
802 } 868 }
803 } 869 }
804 870
805 preempt_disable();
806 if (!(r & RESUME_HOST)) { 871 if (!(r & RESUME_HOST)) {
807 /* To avoid clobbering exit_reason, only check for signals if 872 /* To avoid clobbering exit_reason, only check for signals if
808 * we aren't already exiting to userspace for some other 873 * we aren't already exiting to userspace for some other
@@ -814,20 +879,13 @@ program_interrupt:
814 * and if we really did time things so badly, then we just exit 879 * and if we really did time things so badly, then we just exit
815 * again due to a host external interrupt. 880 * again due to a host external interrupt.
816 */ 881 */
817 __hard_irq_disable(); 882 local_irq_disable();
818 if (signal_pending(current)) { 883 s = kvmppc_prepare_to_enter(vcpu);
819 __hard_irq_enable(); 884 if (s <= 0) {
820#ifdef EXIT_DEBUG 885 local_irq_enable();
821 printk(KERN_EMERG "KVM: Going back to host\n"); 886 r = s;
822#endif
823 vcpu->stat.signal_exits++;
824 run->exit_reason = KVM_EXIT_INTR;
825 r = -EINTR;
826 } else { 887 } else {
827 /* In case an interrupt came in that was triggered 888 kvmppc_lazy_ee_enable();
828 * from userspace (like DEC), we need to check what
829 * to inject now! */
830 kvmppc_core_prepare_to_enter(vcpu);
831 } 889 }
832 } 890 }
833 891
@@ -899,34 +957,59 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
899 return 0; 957 return 0;
900} 958}
901 959
902int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 960int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
903{ 961{
904 int r = -EINVAL; 962 int r = 0;
905 963
906 switch (reg->id) { 964 switch (id) {
907 case KVM_REG_PPC_HIOR: 965 case KVM_REG_PPC_HIOR:
908 r = copy_to_user((u64 __user *)(long)reg->addr, 966 *val = get_reg_val(id, to_book3s(vcpu)->hior);
909 &to_book3s(vcpu)->hior, sizeof(u64));
910 break; 967 break;
968#ifdef CONFIG_VSX
969 case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
970 long int i = id - KVM_REG_PPC_VSR0;
971
972 if (!cpu_has_feature(CPU_FTR_VSX)) {
973 r = -ENXIO;
974 break;
975 }
976 val->vsxval[0] = vcpu->arch.fpr[i];
977 val->vsxval[1] = vcpu->arch.vsr[i];
978 break;
979 }
980#endif /* CONFIG_VSX */
911 default: 981 default:
982 r = -EINVAL;
912 break; 983 break;
913 } 984 }
914 985
915 return r; 986 return r;
916} 987}
917 988
918int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 989int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
919{ 990{
920 int r = -EINVAL; 991 int r = 0;
921 992
922 switch (reg->id) { 993 switch (id) {
923 case KVM_REG_PPC_HIOR: 994 case KVM_REG_PPC_HIOR:
924 r = copy_from_user(&to_book3s(vcpu)->hior, 995 to_book3s(vcpu)->hior = set_reg_val(id, *val);
925 (u64 __user *)(long)reg->addr, sizeof(u64)); 996 to_book3s(vcpu)->hior_explicit = true;
926 if (!r) 997 break;
927 to_book3s(vcpu)->hior_explicit = true; 998#ifdef CONFIG_VSX
999 case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
1000 long int i = id - KVM_REG_PPC_VSR0;
1001
1002 if (!cpu_has_feature(CPU_FTR_VSX)) {
1003 r = -ENXIO;
1004 break;
1005 }
1006 vcpu->arch.fpr[i] = val->vsxval[0];
1007 vcpu->arch.vsr[i] = val->vsxval[1];
928 break; 1008 break;
1009 }
1010#endif /* CONFIG_VSX */
929 default: 1011 default:
1012 r = -EINVAL;
930 break; 1013 break;
931 } 1014 }
932 1015
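With the switch to kvmppc_get_one_reg()/kvmppc_set_one_reg() returning values through union kvmppc_one_reg, each VSR is exposed to userspace as one 128-bit register: two u64 halves, the FPR/high doubleword and the VSX low doubleword. A minimal userspace sketch of reading one through the generic ONE_REG ioctl, assuming a powerpc build where the KVM_REG_PPC_VSR0 ID added by this series is visible; error handling is left to the caller:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: fetch guest VSR0 as two 64-bit halves via KVM_GET_ONE_REG. */
static int read_vsr0(int vcpu_fd, uint64_t halves[2])
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_VSR0,	/* 128-bit register ID */
		.addr = (uintptr_t)halves,	/* kernel writes 16 bytes here */
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}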
@@ -1020,8 +1103,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1020#endif 1103#endif
1021 ulong ext_msr; 1104 ulong ext_msr;
1022 1105
1023 preempt_disable();
1024
1025 /* Check if we can run the vcpu at all */ 1106 /* Check if we can run the vcpu at all */
1026 if (!vcpu->arch.sane) { 1107 if (!vcpu->arch.sane) {
1027 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 1108 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1029,21 +1110,16 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1029 goto out; 1110 goto out;
1030 } 1111 }
1031 1112
1032 kvmppc_core_prepare_to_enter(vcpu);
1033
1034 /* 1113 /*
1035 * Interrupts could be timers for the guest which we have to inject 1114 * Interrupts could be timers for the guest which we have to inject
1036 * again, so let's postpone them until we're in the guest and if we 1115 * again, so let's postpone them until we're in the guest and if we
1037 * really did time things so badly, then we just exit again due to 1116 * really did time things so badly, then we just exit again due to
1038 * a host external interrupt. 1117 * a host external interrupt.
1039 */ 1118 */
1040 __hard_irq_disable(); 1119 local_irq_disable();
1041 1120 ret = kvmppc_prepare_to_enter(vcpu);
1042 /* No need to go into the guest when all we do is going out */ 1121 if (ret <= 0) {
1043 if (signal_pending(current)) { 1122 local_irq_enable();
1044 __hard_irq_enable();
1045 kvm_run->exit_reason = KVM_EXIT_INTR;
1046 ret = -EINTR;
1047 goto out; 1123 goto out;
1048 } 1124 }
1049 1125
@@ -1070,7 +1146,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1070 /* Save VSX state in stack */ 1146 /* Save VSX state in stack */
1071 used_vsr = current->thread.used_vsr; 1147 used_vsr = current->thread.used_vsr;
1072 if (used_vsr && (current->thread.regs->msr & MSR_VSX)) 1148 if (used_vsr && (current->thread.regs->msr & MSR_VSX))
1073 __giveup_vsx(current); 1149 __giveup_vsx(current);
1074#endif 1150#endif
1075 1151
1076 /* Remember the MSR with disabled extensions */ 1152 /* Remember the MSR with disabled extensions */
@@ -1080,20 +1156,19 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1080 if (vcpu->arch.shared->msr & MSR_FP) 1156 if (vcpu->arch.shared->msr & MSR_FP)
1081 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 1157 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
1082 1158
1083 kvm_guest_enter(); 1159 kvmppc_lazy_ee_enable();
1084 1160
1085 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 1161 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
1086 1162
1087 kvm_guest_exit(); 1163 /* No need for kvm_guest_exit. It's done in handle_exit.
1088 1164 We also get here with interrupts enabled. */
1089 current->thread.regs->msr = ext_msr;
1090 1165
1091 /* Make sure we save the guest FPU/Altivec/VSX state */ 1166 /* Make sure we save the guest FPU/Altivec/VSX state */
1092 kvmppc_giveup_ext(vcpu, MSR_FP); 1167 kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
1093 kvmppc_giveup_ext(vcpu, MSR_VEC); 1168
1094 kvmppc_giveup_ext(vcpu, MSR_VSX); 1169 current->thread.regs->msr = ext_msr;
1095 1170
1096 /* Restore FPU state from stack */ 1171 /* Restore FPU/VSX state from stack */
1097 memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); 1172 memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
1098 current->thread.fpscr.val = fpscr; 1173 current->thread.fpscr.val = fpscr;
1099 current->thread.fpexc_mode = fpexc_mode; 1174 current->thread.fpexc_mode = fpexc_mode;
@@ -1113,7 +1188,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1113#endif 1188#endif
1114 1189
1115out: 1190out:
1116 preempt_enable(); 1191 vcpu->mode = OUTSIDE_GUEST_MODE;
1117 return ret; 1192 return ret;
1118} 1193}
1119 1194
@@ -1181,14 +1256,31 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
1181} 1256}
1182#endif /* CONFIG_PPC64 */ 1257#endif /* CONFIG_PPC64 */
1183 1258
1259void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
1260 struct kvm_memory_slot *dont)
1261{
1262}
1263
1264int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
1265 unsigned long npages)
1266{
1267 return 0;
1268}
1269
1184int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1270int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1271 struct kvm_memory_slot *memslot,
1185 struct kvm_userspace_memory_region *mem) 1272 struct kvm_userspace_memory_region *mem)
1186{ 1273{
1187 return 0; 1274 return 0;
1188} 1275}
1189 1276
1190void kvmppc_core_commit_memory_region(struct kvm *kvm, 1277void kvmppc_core_commit_memory_region(struct kvm *kvm,
1191 struct kvm_userspace_memory_region *mem) 1278 struct kvm_userspace_memory_region *mem,
1279 struct kvm_memory_slot old)
1280{
1281}
1282
1283void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
1192{ 1284{
1193} 1285}
1194 1286
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 9ecf6e35cd8d..8f7633e3afb8 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -170,20 +170,21 @@ kvmppc_handler_skip_ins:
170 * Call kvmppc_handler_trampoline_enter in real mode 170 * Call kvmppc_handler_trampoline_enter in real mode
171 * 171 *
172 * On entry, r4 contains the guest shadow MSR 172 * On entry, r4 contains the guest shadow MSR
173 * MSR.EE has to be 0 when calling this function
173 */ 174 */
174_GLOBAL(kvmppc_entry_trampoline) 175_GLOBAL(kvmppc_entry_trampoline)
175 mfmsr r5 176 mfmsr r5
176 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) 177 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
177 toreal(r7) 178 toreal(r7)
178 179
179 li r9, MSR_RI
180 ori r9, r9, MSR_EE
181 andc r9, r5, r9 /* Clear EE and RI in MSR value */
182 li r6, MSR_IR | MSR_DR 180 li r6, MSR_IR | MSR_DR
183 ori r6, r6, MSR_EE 181 andc r6, r5, r6 /* Clear DR and IR in MSR value */
184 andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ 182 /*
185 MTMSR_EERI(r9) /* Clear EE and RI in MSR */ 183 * Set EE in HOST_MSR so that it's enabled when we get into our
186 mtsrr0 r7 /* before we set srr0/1 */ 184 * C exit handler function
185 */
186 ori r5, r5, MSR_EE
187 mtsrr0 r7
187 mtsrr1 r6 188 mtsrr1 r6
188 RFI 189 RFI
189 190
@@ -233,8 +234,5 @@ define_load_up(fpu)
233#ifdef CONFIG_ALTIVEC 234#ifdef CONFIG_ALTIVEC
234define_load_up(altivec) 235define_load_up(altivec)
235#endif 236#endif
236#ifdef CONFIG_VSX
237define_load_up(vsx)
238#endif
239 237
240#include "book3s_segment.S" 238#include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097c852b..69f114015780 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -36,9 +36,11 @@
36#include <asm/dbell.h> 36#include <asm/dbell.h>
37#include <asm/hw_irq.h> 37#include <asm/hw_irq.h>
38#include <asm/irq.h> 38#include <asm/irq.h>
39#include <asm/time.h>
39 40
40#include "timing.h" 41#include "timing.h"
41#include "booke.h" 42#include "booke.h"
43#include "trace.h"
42 44
43unsigned long kvmppc_booke_handlers; 45unsigned long kvmppc_booke_handlers;
44 46
@@ -62,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
62 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 64 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63 { "doorbell", VCPU_STAT(dbell_exits) }, 65 { "doorbell", VCPU_STAT(dbell_exits) },
64 { "guest doorbell", VCPU_STAT(gdbell_exits) }, 66 { "guest doorbell", VCPU_STAT(gdbell_exits) },
67 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
65 { NULL } 68 { NULL }
66}; 69};
67 70
@@ -120,6 +123,16 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
120} 123}
121#endif 124#endif
122 125
126static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu)
127{
128#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV)
129 /* We always treat the FP bit as enabled from the host
130 perspective, so we only need to adjust the shadow MSR */
131 vcpu->arch.shadow_msr &= ~MSR_FP;
132 vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP;
133#endif
134}
135
123/* 136/*
124 * Helper function for "full" MSR writes. No need to call this if only 137 * Helper function for "full" MSR writes. No need to call this if only
125 * EE/CE/ME/DE/RI are changing. 138 * EE/CE/ME/DE/RI are changing.
@@ -136,11 +149,13 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
136 149
137 kvmppc_mmu_msr_notify(vcpu, old_msr); 150 kvmppc_mmu_msr_notify(vcpu, old_msr);
138 kvmppc_vcpu_sync_spe(vcpu); 151 kvmppc_vcpu_sync_spe(vcpu);
152 kvmppc_vcpu_sync_fpu(vcpu);
139} 153}
140 154
141static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, 155static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
142 unsigned int priority) 156 unsigned int priority)
143{ 157{
158 trace_kvm_booke_queue_irqprio(vcpu, priority);
144 set_bit(priority, &vcpu->arch.pending_exceptions); 159 set_bit(priority, &vcpu->arch.pending_exceptions);
145} 160}
146 161
@@ -206,6 +221,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
206 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); 221 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
207} 222}
208 223
224static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
225{
226 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
227}
228
229static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
230{
231 clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
232}
233
209static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) 234static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
210{ 235{
211#ifdef CONFIG_KVM_BOOKE_HV 236#ifdef CONFIG_KVM_BOOKE_HV
@@ -287,6 +312,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
287 bool crit; 312 bool crit;
288 bool keep_irq = false; 313 bool keep_irq = false;
289 enum int_class int_class; 314 enum int_class int_class;
315 ulong new_msr = vcpu->arch.shared->msr;
290 316
291 /* Truncate crit indicators in 32 bit mode */ 317 /* Truncate crit indicators in 32 bit mode */
292 if (!(vcpu->arch.shared->msr & MSR_SF)) { 318 if (!(vcpu->arch.shared->msr & MSR_SF)) {
@@ -325,6 +351,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
325 msr_mask = MSR_CE | MSR_ME | MSR_DE; 351 msr_mask = MSR_CE | MSR_ME | MSR_DE;
326 int_class = INT_CLASS_NONCRIT; 352 int_class = INT_CLASS_NONCRIT;
327 break; 353 break;
354 case BOOKE_IRQPRIO_WATCHDOG:
328 case BOOKE_IRQPRIO_CRITICAL: 355 case BOOKE_IRQPRIO_CRITICAL:
329 case BOOKE_IRQPRIO_DBELL_CRIT: 356 case BOOKE_IRQPRIO_DBELL_CRIT:
330 allowed = vcpu->arch.shared->msr & MSR_CE; 357 allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -381,7 +408,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
381 set_guest_esr(vcpu, vcpu->arch.queued_esr); 408 set_guest_esr(vcpu, vcpu->arch.queued_esr);
382 if (update_dear == true) 409 if (update_dear == true)
383 set_guest_dear(vcpu, vcpu->arch.queued_dear); 410 set_guest_dear(vcpu, vcpu->arch.queued_dear);
384 kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); 411
412 new_msr &= msr_mask;
413#if defined(CONFIG_64BIT)
414 if (vcpu->arch.epcr & SPRN_EPCR_ICM)
415 new_msr |= MSR_CM;
416#endif
417 kvmppc_set_msr(vcpu, new_msr);
385 418
386 if (!keep_irq) 419 if (!keep_irq)
387 clear_bit(priority, &vcpu->arch.pending_exceptions); 420 clear_bit(priority, &vcpu->arch.pending_exceptions);
@@ -404,12 +437,121 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
404 return allowed; 437 return allowed;
405} 438}
406 439
440/*
441 * Return the number of jiffies until the next timeout. If the timeout is
442 * longer than NEXT_TIMER_MAX_DELTA, return NEXT_TIMER_MAX_DELTA instead,
443 * because larger values can break the timer APIs.
444 */
445static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
446{
447 u64 tb, wdt_tb, wdt_ticks = 0;
448 u64 nr_jiffies = 0;
449 u32 period = TCR_GET_WP(vcpu->arch.tcr);
450
451 wdt_tb = 1ULL << (63 - period);
452 tb = get_tb();
453 /*
454 * The watchdog timeout happens when the TB bit corresponding
455 * to the watchdog period toggles from 0 to 1.
456 */
457 if (tb & wdt_tb)
458 wdt_ticks = wdt_tb;
459
460 wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
461
462 /* Convert timebase ticks to jiffies */
463 nr_jiffies = wdt_ticks;
464
465 if (do_div(nr_jiffies, tb_ticks_per_jiffy))
466 nr_jiffies++;
467
468 return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
469}
470
471static void arm_next_watchdog(struct kvm_vcpu *vcpu)
472{
473 unsigned long nr_jiffies;
474 unsigned long flags;
475
476 /*
477 * If TSR_ENW and TSR_WIS are not both set, there is no need to exit
478 * to userspace, so clear the KVM_REQ_WATCHDOG request.
479 */
480 if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS))
481 clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests);
482
483 spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
484 nr_jiffies = watchdog_next_timeout(vcpu);
485 /*
486 * If the watchdog timeout in jiffies is >= NEXT_TIMER_MAX_DELTA,
487 * do not arm the watchdog timer, as such values can break the timer APIs.
488 */
489 if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
490 mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
491 else
492 del_timer(&vcpu->arch.wdt_timer);
493 spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags);
494}
495
496void kvmppc_watchdog_func(unsigned long data)
497{
498 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
499 u32 tsr, new_tsr;
500 int final;
501
502 do {
503 new_tsr = tsr = vcpu->arch.tsr;
504 final = 0;
505
506 /* Time out event */
507 if (tsr & TSR_ENW) {
508 if (tsr & TSR_WIS)
509 final = 1;
510 else
511 new_tsr = tsr | TSR_WIS;
512 } else {
513 new_tsr = tsr | TSR_ENW;
514 }
515 } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
516
517 if (new_tsr & TSR_WIS) {
518 smp_wmb();
519 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
520 kvm_vcpu_kick(vcpu);
521 }
522
523 /*
524 * If this is final watchdog expiry and some action is required
525 * then exit to userspace.
526 */
527 if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
528 vcpu->arch.watchdog_enabled) {
529 smp_wmb();
530 kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
531 kvm_vcpu_kick(vcpu);
532 }
533
534 /*
535 * Stop running the watchdog timer after final expiration to
536 * prevent the host from being flooded with timers if the
537 * guest sets a short period.
538 * Timers will resume when TSR/TCR is updated next time.
539 */
540 if (!final)
541 arm_next_watchdog(vcpu);
542}
543
407static void update_timer_ints(struct kvm_vcpu *vcpu) 544static void update_timer_ints(struct kvm_vcpu *vcpu)
408{ 545{
409 if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) 546 if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
410 kvmppc_core_queue_dec(vcpu); 547 kvmppc_core_queue_dec(vcpu);
411 else 548 else
412 kvmppc_core_dequeue_dec(vcpu); 549 kvmppc_core_dequeue_dec(vcpu);
550
551 if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
552 kvmppc_core_queue_watchdog(vcpu);
553 else
554 kvmppc_core_dequeue_watchdog(vcpu);
413} 555}
414 556
415static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) 557static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
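watchdog_next_timeout() above computes how many timebase ticks remain until the watched TB bit next goes from 0 to 1, then converts that to jiffies, rounding up. The same arithmetic as a standalone example with made-up numbers (the 512 MHz timebase, HZ=250 and the period value are assumptions for illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example values only, not taken from any real machine. */
	const uint64_t tb_ticks_per_jiffy = 512000000ULL / 250;  /* 512 MHz TB, HZ = 250 */
	const uint32_t period = 33;		/* watched bit has weight 1 << (63 - 33) */
	const uint64_t wdt_tb = 1ULL << (63 - period);
	const uint64_t tb = 0x12345678ULL;	/* pretend current timebase value */

	/* Ticks until the watched bit next flips from 0 to 1: */
	uint64_t wdt_ticks = (tb & wdt_tb) ? wdt_tb : 0;
	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));

	/* Convert to jiffies, rounding up as the do_div() remainder check does. */
	uint64_t nr_jiffies = wdt_ticks / tb_ticks_per_jiffy;
	if (wdt_ticks % tb_ticks_per_jiffy)
		nr_jiffies++;

	printf("arm the timer for %llu jiffies\n", (unsigned long long)nr_jiffies);
	return 0;
}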
@@ -417,13 +559,6 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
417 unsigned long *pending = &vcpu->arch.pending_exceptions; 559 unsigned long *pending = &vcpu->arch.pending_exceptions;
418 unsigned int priority; 560 unsigned int priority;
419 561
420 if (vcpu->requests) {
421 if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
422 smp_mb();
423 update_timer_ints(vcpu);
424 }
425 }
426
427 priority = __ffs(*pending); 562 priority = __ffs(*pending);
428 while (priority < BOOKE_IRQPRIO_MAX) { 563 while (priority < BOOKE_IRQPRIO_MAX) {
429 if (kvmppc_booke_irqprio_deliver(vcpu, priority)) 564 if (kvmppc_booke_irqprio_deliver(vcpu, priority))
@@ -459,37 +594,20 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
459 return r; 594 return r;
460} 595}
461 596
462/* 597int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
463 * Common checks before entering the guest world. Call with interrupts
464 * disabled.
465 *
466 * returns !0 if a signal is pending and check_signal is true
467 */
468static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
469{ 598{
470 int r = 0; 599 int r = 1; /* Indicate we want to get back into the guest */
471 600
472 WARN_ON_ONCE(!irqs_disabled()); 601 if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu))
473 while (true) { 602 update_timer_ints(vcpu);
474 if (need_resched()) { 603#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
475 local_irq_enable(); 604 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
476 cond_resched(); 605 kvmppc_core_flush_tlb(vcpu);
477 local_irq_disable(); 606#endif
478 continue;
479 }
480
481 if (signal_pending(current)) {
482 r = 1;
483 break;
484 }
485
486 if (kvmppc_core_prepare_to_enter(vcpu)) {
487 /* interrupts got enabled in between, so we
488 are back at square 1 */
489 continue;
490 }
491 607
492 break; 608 if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) {
609 vcpu->run->exit_reason = KVM_EXIT_WATCHDOG;
610 r = 0;
493 } 611 }
494 612
495 return r; 613 return r;
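On the final expiry path, kvmppc_core_check_requests() returns 0 with run->exit_reason set to KVM_EXIT_WATCHDOG, handing the decision to userspace. A hedged sketch of the VMM side; handle_guest_watchdog() is a hypothetical policy hook, and what the "watchdog action" means is entirely up to the VMM:

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

extern void handle_guest_watchdog(void);	/* hypothetical VMM policy hook */

/* Sketch: vcpu_fd is the vcpu fd, run the mmap'ed struct kvm_run. */
static void run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno != EINTR)
			return;

		switch (run->exit_reason) {
		case KVM_EXIT_WATCHDOG:
			/* final expiry with TCR[WRC] set: reboot, stop,
			 * or log, as the VMM sees fit */
			handle_guest_watchdog();
			break;
		default:
			break;	/* other exit reasons elided */
		}
	}
}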
@@ -497,7 +615,7 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
497 615
498int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 616int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
499{ 617{
500 int ret; 618 int ret, s;
501#ifdef CONFIG_PPC_FPU 619#ifdef CONFIG_PPC_FPU
502 unsigned int fpscr; 620 unsigned int fpscr;
503 int fpexc_mode; 621 int fpexc_mode;
@@ -510,11 +628,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
510 } 628 }
511 629
512 local_irq_disable(); 630 local_irq_disable();
513 if (kvmppc_prepare_to_enter(vcpu)) { 631 s = kvmppc_prepare_to_enter(vcpu);
514 kvm_run->exit_reason = KVM_EXIT_INTR; 632 if (s <= 0) {
515 ret = -EINTR; 633 local_irq_enable();
634 ret = s;
516 goto out; 635 goto out;
517 } 636 }
637 kvmppc_lazy_ee_enable();
518 638
519 kvm_guest_enter(); 639 kvm_guest_enter();
520 640
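This is the same entry convention adopted in book3s_pr.c earlier in this diff: with interrupts disabled, kvmppc_prepare_to_enter() returns s <= 0 to mean "do not enter the guest, propagate s" (the exit reason is already set) and s > 0 to mean "go ahead", after which kvmppc_lazy_ee_enable() takes over interrupt state. Condensed, the shape is roughly:

/* Condensed restatement of the entry sequence used above (not new code). */
local_irq_disable();
s = kvmppc_prepare_to_enter(vcpu);	/* signals, resched, vcpu->requests */
if (s <= 0) {
	local_irq_enable();
	ret = s;			/* e.g. -EINTR; run->exit_reason set */
	goto out;
}
kvmppc_lazy_ee_enable();		/* EE handled lazily from here on */

ret = __kvmppc_vcpu_run(kvm_run, vcpu);	/* returns with interrupts enabled */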
@@ -542,6 +662,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
542 662
543 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 663 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
544 664
665 /* No need for kvm_guest_exit. It's done in handle_exit.
666 We also get here with interrupts enabled. */
667
545#ifdef CONFIG_PPC_FPU 668#ifdef CONFIG_PPC_FPU
546 kvmppc_save_guest_fp(vcpu); 669 kvmppc_save_guest_fp(vcpu);
547 670
@@ -557,10 +680,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
557 current->thread.fpexc_mode = fpexc_mode; 680 current->thread.fpexc_mode = fpexc_mode;
558#endif 681#endif
559 682
560 kvm_guest_exit();
561
562out: 683out:
563 local_irq_enable(); 684 vcpu->mode = OUTSIDE_GUEST_MODE;
564 return ret; 685 return ret;
565} 686}
566 687
@@ -668,6 +789,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
668 unsigned int exit_nr) 789 unsigned int exit_nr)
669{ 790{
670 int r = RESUME_HOST; 791 int r = RESUME_HOST;
792 int s;
671 793
672 /* update before a new last_exit_type is rewritten */ 794 /* update before a new last_exit_type is rewritten */
673 kvmppc_update_timing_stats(vcpu); 795 kvmppc_update_timing_stats(vcpu);
@@ -677,6 +799,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
677 799
678 local_irq_enable(); 800 local_irq_enable();
679 801
802 trace_kvm_exit(exit_nr, vcpu);
803 kvm_guest_exit();
804
680 run->exit_reason = KVM_EXIT_UNKNOWN; 805 run->exit_reason = KVM_EXIT_UNKNOWN;
681 run->ready_for_interrupt_injection = 1; 806 run->ready_for_interrupt_injection = 1;
682 807
@@ -971,10 +1096,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
971 */ 1096 */
972 if (!(r & RESUME_HOST)) { 1097 if (!(r & RESUME_HOST)) {
973 local_irq_disable(); 1098 local_irq_disable();
974 if (kvmppc_prepare_to_enter(vcpu)) { 1099 s = kvmppc_prepare_to_enter(vcpu);
975 run->exit_reason = KVM_EXIT_INTR; 1100 if (s <= 0) {
976 r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); 1101 local_irq_enable();
977 kvmppc_account_exit(vcpu, SIGNAL_EXITS); 1102 r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
1103 } else {
1104 kvmppc_lazy_ee_enable();
978 } 1105 }
979 } 1106 }
980 1107
@@ -1011,6 +1138,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
1011 return r; 1138 return r;
1012} 1139}
1013 1140
1141int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
1142{
1143 /* setup watchdog timer once */
1144 spin_lock_init(&vcpu->arch.wdt_lock);
1145 setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
1146 (unsigned long)vcpu);
1147
1148 return 0;
1149}
1150
1151void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
1152{
1153 del_timer_sync(&vcpu->arch.wdt_timer);
1154}
1155
1014int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 1156int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1015{ 1157{
1016 int i; 1158 int i;
@@ -1106,7 +1248,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
1106 } 1248 }
1107 1249
1108 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { 1250 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
1251 u32 old_tsr = vcpu->arch.tsr;
1252
1109 vcpu->arch.tsr = sregs->u.e.tsr; 1253 vcpu->arch.tsr = sregs->u.e.tsr;
1254
1255 if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
1256 arm_next_watchdog(vcpu);
1257
1110 update_timer_ints(vcpu); 1258 update_timer_ints(vcpu);
1111 } 1259 }
1112 1260
@@ -1221,12 +1369,70 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1221 1369
1222int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1370int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1223{ 1371{
1224 return -EINVAL; 1372 int r = -EINVAL;
1373
1374 switch (reg->id) {
1375 case KVM_REG_PPC_IAC1:
1376 case KVM_REG_PPC_IAC2:
1377 case KVM_REG_PPC_IAC3:
1378 case KVM_REG_PPC_IAC4: {
1379 int iac = reg->id - KVM_REG_PPC_IAC1;
1380 r = copy_to_user((u64 __user *)(long)reg->addr,
1381 &vcpu->arch.dbg_reg.iac[iac], sizeof(u64));
1382 break;
1383 }
1384 case KVM_REG_PPC_DAC1:
1385 case KVM_REG_PPC_DAC2: {
1386 int dac = reg->id - KVM_REG_PPC_DAC1;
1387 r = copy_to_user((u64 __user *)(long)reg->addr,
1388 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
1389 break;
1390 }
1391#if defined(CONFIG_64BIT)
1392 case KVM_REG_PPC_EPCR:
1393 r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
1394 break;
1395#endif
1396 default:
1397 break;
1398 }
1399 return r;
1225} 1400}
1226 1401
1227int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1402int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1228{ 1403{
1229 return -EINVAL; 1404 int r = -EINVAL;
1405
1406 switch (reg->id) {
1407 case KVM_REG_PPC_IAC1:
1408 case KVM_REG_PPC_IAC2:
1409 case KVM_REG_PPC_IAC3:
1410 case KVM_REG_PPC_IAC4: {
1411 int iac = reg->id - KVM_REG_PPC_IAC1;
1412 r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac],
1413 (u64 __user *)(long)reg->addr, sizeof(u64));
1414 break;
1415 }
1416 case KVM_REG_PPC_DAC1:
1417 case KVM_REG_PPC_DAC2: {
1418 int dac = reg->id - KVM_REG_PPC_DAC1;
1419 r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac],
1420 (u64 __user *)(long)reg->addr, sizeof(u64));
1421 break;
1422 }
1423#if defined(CONFIG_64BIT)
1424 case KVM_REG_PPC_EPCR: {
1425 u32 new_epcr;
1426 r = get_user(new_epcr, (u32 __user *)(long)reg->addr);
1427 if (r == 0)
1428 kvmppc_set_epcr(vcpu, new_epcr);
1429 break;
1430 }
1431#endif
1432 default:
1433 break;
1434 }
1435 return r;
1230} 1436}
1231 1437
1232int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 1438int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
@@ -1253,20 +1459,50 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1253 return -ENOTSUPP; 1459 return -ENOTSUPP;
1254} 1460}
1255 1461
1462void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
1463 struct kvm_memory_slot *dont)
1464{
1465}
1466
1467int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
1468 unsigned long npages)
1469{
1470 return 0;
1471}
1472
1256int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1473int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1474 struct kvm_memory_slot *memslot,
1257 struct kvm_userspace_memory_region *mem) 1475 struct kvm_userspace_memory_region *mem)
1258{ 1476{
1259 return 0; 1477 return 0;
1260} 1478}
1261 1479
1262void kvmppc_core_commit_memory_region(struct kvm *kvm, 1480void kvmppc_core_commit_memory_region(struct kvm *kvm,
1263 struct kvm_userspace_memory_region *mem) 1481 struct kvm_userspace_memory_region *mem,
1482 struct kvm_memory_slot old)
1483{
1484}
1485
1486void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
1487{
1488}
1489
1490void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr)
1264{ 1491{
1492#if defined(CONFIG_64BIT)
1493 vcpu->arch.epcr = new_epcr;
1494#ifdef CONFIG_KVM_BOOKE_HV
1495 vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM;
1496 if (vcpu->arch.epcr & SPRN_EPCR_ICM)
1497 vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM;
1498#endif
1499#endif
1265} 1500}
1266 1501
1267void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) 1502void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
1268{ 1503{
1269 vcpu->arch.tcr = new_tcr; 1504 vcpu->arch.tcr = new_tcr;
1505 arm_next_watchdog(vcpu);
1270 update_timer_ints(vcpu); 1506 update_timer_ints(vcpu);
1271} 1507}
1272 1508
@@ -1281,6 +1517,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
1281void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) 1517void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
1282{ 1518{
1283 clear_bits(tsr_bits, &vcpu->arch.tsr); 1519 clear_bits(tsr_bits, &vcpu->arch.tsr);
1520
1521 /*
1522 * We may have stopped the watchdog due to
1523 * being stuck on final expiration.
1524 */
1525 if (tsr_bits & (TSR_ENW | TSR_WIS))
1526 arm_next_watchdog(vcpu);
1527
1284 update_timer_ints(vcpu); 1528 update_timer_ints(vcpu);
1285} 1529}
1286 1530
@@ -1298,12 +1542,14 @@ void kvmppc_decrementer_func(unsigned long data)
1298 1542
1299void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1543void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1300{ 1544{
1545 vcpu->cpu = smp_processor_id();
1301 current->thread.kvm_vcpu = vcpu; 1546 current->thread.kvm_vcpu = vcpu;
1302} 1547}
1303 1548
1304void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) 1549void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
1305{ 1550{
1306 current->thread.kvm_vcpu = NULL; 1551 current->thread.kvm_vcpu = NULL;
1552 vcpu->cpu = -1;
1307} 1553}
1308 1554
1309int __init kvmppc_booke_init(void) 1555int __init kvmppc_booke_init(void)
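
A small stand-alone illustration of the TSR update check added to set_sregs_base() above: the watchdog is only re-armed when the ENW or WIS bits actually change between the old and new TSR values. This is a minimal user-space sketch; the bit positions below are placeholders, not the real SPR layout.

#include <stdint.h>
#include <stdio.h>

#define TSR_ENW (1u << 31)      /* placeholder bit, assumed for illustration */
#define TSR_WIS (1u << 30)      /* placeholder bit, assumed for illustration */

static int watchdog_needs_rearm(uint32_t old_tsr, uint32_t new_tsr)
{
        /* XOR exposes the bits that changed; the mask keeps only ENW/WIS */
        return ((old_tsr ^ new_tsr) & (TSR_ENW | TSR_WIS)) != 0;
}

int main(void)
{
        printf("%d\n", watchdog_needs_rearm(0, TSR_ENW));        /* 1: re-arm needed */
        printf("%d\n", watchdog_needs_rearm(TSR_ENW, TSR_ENW));  /* 0: nothing changed */
        return 0;
}
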
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index ba61974c1e20..e9b88e433f64 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -69,6 +69,7 @@ extern unsigned long kvmppc_booke_handlers;
69void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); 69void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
70void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); 70void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
71 71
72void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr);
72void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); 73void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr);
73void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); 74void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
74void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); 75void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb608ab..4685b8cf2249 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -133,10 +133,10 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
133 vcpu->arch.csrr1 = spr_val; 133 vcpu->arch.csrr1 = spr_val;
134 break; 134 break;
135 case SPRN_DBCR0: 135 case SPRN_DBCR0:
136 vcpu->arch.dbcr0 = spr_val; 136 vcpu->arch.dbg_reg.dbcr0 = spr_val;
137 break; 137 break;
138 case SPRN_DBCR1: 138 case SPRN_DBCR1:
139 vcpu->arch.dbcr1 = spr_val; 139 vcpu->arch.dbg_reg.dbcr1 = spr_val;
140 break; 140 break;
141 case SPRN_DBSR: 141 case SPRN_DBSR:
142 vcpu->arch.dbsr &= ~spr_val; 142 vcpu->arch.dbsr &= ~spr_val;
@@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
145 kvmppc_clr_tsr_bits(vcpu, spr_val); 145 kvmppc_clr_tsr_bits(vcpu, spr_val);
146 break; 146 break;
147 case SPRN_TCR: 147 case SPRN_TCR:
148 /*
149 * WRC is a 2-bit field that is supposed to preserve its
150 * value once written to non-zero.
151 */
152 if (vcpu->arch.tcr & TCR_WRC_MASK) {
153 spr_val &= ~TCR_WRC_MASK;
154 spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
155 }
148 kvmppc_set_tcr(vcpu, spr_val); 156 kvmppc_set_tcr(vcpu, spr_val);
149 break; 157 break;
150 158
@@ -229,7 +237,17 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
229 case SPRN_IVOR15: 237 case SPRN_IVOR15:
230 vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; 238 vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;
231 break; 239 break;
232 240 case SPRN_MCSR:
241 vcpu->arch.mcsr &= ~spr_val;
242 break;
243#if defined(CONFIG_64BIT)
244 case SPRN_EPCR:
245 kvmppc_set_epcr(vcpu, spr_val);
246#ifdef CONFIG_KVM_BOOKE_HV
247 mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
248#endif
249 break;
250#endif
233 default: 251 default:
234 emulated = EMULATE_FAIL; 252 emulated = EMULATE_FAIL;
235 } 253 }
@@ -258,10 +276,10 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
258 *spr_val = vcpu->arch.csrr1; 276 *spr_val = vcpu->arch.csrr1;
259 break; 277 break;
260 case SPRN_DBCR0: 278 case SPRN_DBCR0:
261 *spr_val = vcpu->arch.dbcr0; 279 *spr_val = vcpu->arch.dbg_reg.dbcr0;
262 break; 280 break;
263 case SPRN_DBCR1: 281 case SPRN_DBCR1:
264 *spr_val = vcpu->arch.dbcr1; 282 *spr_val = vcpu->arch.dbg_reg.dbcr1;
265 break; 283 break;
266 case SPRN_DBSR: 284 case SPRN_DBSR:
267 *spr_val = vcpu->arch.dbsr; 285 *spr_val = vcpu->arch.dbsr;
@@ -321,6 +339,14 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
321 case SPRN_IVOR15: 339 case SPRN_IVOR15:
322 *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; 340 *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
323 break; 341 break;
342 case SPRN_MCSR:
343 *spr_val = vcpu->arch.mcsr;
344 break;
345#if defined(CONFIG_64BIT)
346 case SPRN_EPCR:
347 *spr_val = vcpu->arch.epcr;
348 break;
349#endif
324 350
325 default: 351 default:
326 emulated = EMULATE_FAIL; 352 emulated = EMULATE_FAIL;
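
The SPRN_TCR hunk above makes the 2-bit WRC field sticky: once the guest has written it non-zero, later mtspr writes may not clear or change it. Below is a minimal stand-alone sketch of that masking; TCR_WRC_MASK is an assumed placeholder value used only for illustration.

#include <stdint.h>
#include <stdio.h>

#define TCR_WRC_MASK 0x30000000u        /* placeholder mask, not the real field position */

static uint32_t apply_guest_tcr_write(uint32_t cur_tcr, uint32_t spr_val)
{
        if (cur_tcr & TCR_WRC_MASK) {
                spr_val &= ~TCR_WRC_MASK;               /* drop the guest's new WRC bits  */
                spr_val |= cur_tcr & TCR_WRC_MASK;      /* keep the already-latched value */
        }
        return spr_val;
}

int main(void)
{
        uint32_t tcr = 0x10000000u;                             /* WRC already latched */
        printf("0x%08x\n", apply_guest_tcr_write(tcr, 0));      /* WRC bits survive    */
        return 0;
}
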
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 099fe8272b57..e8ed7d659c55 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -16,6 +16,7 @@
16 * 16 *
17 * Author: Varun Sethi <varun.sethi@freescale.com> 17 * Author: Varun Sethi <varun.sethi@freescale.com>
18 * Author: Scott Wood <scotwood@freescale.com> 18 * Author: Scott Wood <scotwood@freescale.com>
19 * Author: Mihai Caraman <mihai.caraman@freescale.com>
19 * 20 *
20 * This file is derived from arch/powerpc/kvm/booke_interrupts.S 21 * This file is derived from arch/powerpc/kvm/booke_interrupts.S
21 */ 22 */
@@ -30,31 +31,33 @@
30#include <asm/bitsperlong.h> 31#include <asm/bitsperlong.h>
31#include <asm/thread_info.h> 32#include <asm/thread_info.h>
32 33
34#ifdef CONFIG_64BIT
35#include <asm/exception-64e.h>
36#else
33#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ 37#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
34 38#endif
35#define GET_VCPU(vcpu, thread) \
36 PPC_LL vcpu, THREAD_KVM_VCPU(thread)
37 39
38#define LONGBYTES (BITS_PER_LONG / 8) 40#define LONGBYTES (BITS_PER_LONG / 8)
39 41
40#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) 42#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES))
41 43
42/* The host stack layout: */ 44/* The host stack layout: */
43#define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */ 45#define HOST_R1 0 /* Implied by stwu. */
44#define HOST_CALLEE_LR (1 * LONGBYTES) 46#define HOST_CALLEE_LR PPC_LR_STKOFF
45#define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */ 47#define HOST_RUN (HOST_CALLEE_LR + LONGBYTES)
46/* 48/*
47 * r2 is special: it holds 'current', and it made nonvolatile in the 49 * r2 is special: it holds 'current', and it made nonvolatile in the
48 * kernel with the -ffixed-r2 gcc option. 50 * kernel with the -ffixed-r2 gcc option.
49 */ 51 */
50#define HOST_R2 (3 * LONGBYTES) 52#define HOST_R2 (HOST_RUN + LONGBYTES)
51#define HOST_CR (4 * LONGBYTES) 53#define HOST_CR (HOST_R2 + LONGBYTES)
52#define HOST_NV_GPRS (5 * LONGBYTES) 54#define HOST_NV_GPRS (HOST_CR + LONGBYTES)
53#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) 55#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
54#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) 56#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n)
55#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES) 57#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES)
56#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ 58#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
57#define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */ 59/* LR in caller stack frame. */
60#define HOST_STACK_LR (HOST_STACK_SIZE + PPC_LR_STKOFF)
58 61
59#define NEED_EMU 0x00000001 /* emulation -- save nv regs */ 62#define NEED_EMU 0x00000001 /* emulation -- save nv regs */
60#define NEED_DEAR 0x00000002 /* save faulting DEAR */ 63#define NEED_DEAR 0x00000002 /* save faulting DEAR */
@@ -201,12 +204,128 @@
201 b kvmppc_resume_host 204 b kvmppc_resume_host
202.endm 205.endm
203 206
207#ifdef CONFIG_64BIT
208/* Exception types */
209#define EX_GEN 1
210#define EX_GDBELL 2
211#define EX_DBG 3
212#define EX_MC 4
213#define EX_CRIT 5
214#define EX_TLB 6
215
216/*
217 * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
218 */
219.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags
220 _GLOBAL(kvmppc_handler_\intno\()_\srr1)
221 mr r11, r4
222 /*
223 * Get vcpu from Paca: paca->__current.thread->kvm_vcpu
224 */
225 PPC_LL r4, PACACURRENT(r13)
226 PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4)
227 stw r10, VCPU_CR(r4)
228 PPC_STL r11, VCPU_GPR(R4)(r4)
229 PPC_STL r5, VCPU_GPR(R5)(r4)
230 .if \type == EX_CRIT
231 PPC_LL r5, (\paca_ex + EX_R13)(r13)
232 .else
233 mfspr r5, \scratch
234 .endif
235 PPC_STL r6, VCPU_GPR(R6)(r4)
236 PPC_STL r8, VCPU_GPR(R8)(r4)
237 PPC_STL r9, VCPU_GPR(R9)(r4)
238 PPC_STL r5, VCPU_GPR(R13)(r4)
239 PPC_LL r6, (\paca_ex + \ex_r10)(r13)
240 PPC_LL r8, (\paca_ex + \ex_r11)(r13)
241 PPC_STL r3, VCPU_GPR(R3)(r4)
242 PPC_STL r7, VCPU_GPR(R7)(r4)
243 PPC_STL r12, VCPU_GPR(R12)(r4)
244 PPC_STL r6, VCPU_GPR(R10)(r4)
245 PPC_STL r8, VCPU_GPR(R11)(r4)
246 mfctr r5
247 PPC_STL r5, VCPU_CTR(r4)
248 mfspr r5, \srr0
249 mfspr r6, \srr1
250 kvm_handler_common \intno, \srr0, \flags
251.endm
252
253#define EX_PARAMS(type) \
254 EX_##type, \
255 SPRN_SPRG_##type##_SCRATCH, \
256 PACA_EX##type, \
257 EX_R10, \
258 EX_R11
259
260#define EX_PARAMS_TLB \
261 EX_TLB, \
262 SPRN_SPRG_GEN_SCRATCH, \
263 PACA_EXTLB, \
264 EX_TLB_R10, \
265 EX_TLB_R11
266
267kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \
268 SPRN_CSRR0, SPRN_CSRR1, 0
269kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \
270 SPRN_MCSRR0, SPRN_MCSRR1, 0
271kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \
272 SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR)
273kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \
274 SPRN_SRR0, SPRN_SRR1, NEED_ESR
275kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \
276 SPRN_SRR0, SPRN_SRR1, 0
277kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \
278 SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR)
279kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \
280 SPRN_SRR0, SPRN_SRR1,NEED_ESR
281kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \
282 SPRN_SRR0, SPRN_SRR1, 0
283kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \
284 SPRN_SRR0, SPRN_SRR1, 0
285kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \
286 SPRN_SRR0, SPRN_SRR1, 0
287kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \
288 SPRN_SRR0, SPRN_SRR1, 0
289kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\
290 SPRN_CSRR0, SPRN_CSRR1, 0
291/*
292 * Only bolted TLB miss exception handlers are supported for now
293 */
294kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \
295 SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
296kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \
297 SPRN_SRR0, SPRN_SRR1, 0
298kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \
299 SPRN_SRR0, SPRN_SRR1, 0
300kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \
301 SPRN_SRR0, SPRN_SRR1, 0
302kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \
303 SPRN_SRR0, SPRN_SRR1, 0
304kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \
305 SPRN_SRR0, SPRN_SRR1, 0
306kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \
307 SPRN_SRR0, SPRN_SRR1, 0
308kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \
309 SPRN_CSRR0, SPRN_CSRR1, 0
310kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \
311 SPRN_SRR0, SPRN_SRR1, NEED_EMU
312kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \
313 SPRN_SRR0, SPRN_SRR1, 0
314kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \
315 SPRN_GSRR0, SPRN_GSRR1, 0
316kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \
317 SPRN_CSRR0, SPRN_CSRR1, 0
318kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
319 SPRN_DSRR0, SPRN_DSRR1, 0
320kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
321 SPRN_CSRR0, SPRN_CSRR1, 0
322#else
204/* 323/*
205 * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h 324 * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
206 */ 325 */
207.macro kvm_handler intno srr0, srr1, flags 326.macro kvm_handler intno srr0, srr1, flags
208_GLOBAL(kvmppc_handler_\intno\()_\srr1) 327_GLOBAL(kvmppc_handler_\intno\()_\srr1)
209 GET_VCPU(r11, r10) 328 PPC_LL r11, THREAD_KVM_VCPU(r10)
210 PPC_STL r3, VCPU_GPR(R3)(r11) 329 PPC_STL r3, VCPU_GPR(R3)(r11)
211 mfspr r3, SPRN_SPRG_RSCRATCH0 330 mfspr r3, SPRN_SPRG_RSCRATCH0
212 PPC_STL r4, VCPU_GPR(R4)(r11) 331 PPC_STL r4, VCPU_GPR(R4)(r11)
@@ -233,7 +352,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
233.macro kvm_lvl_handler intno scratch srr0, srr1, flags 352.macro kvm_lvl_handler intno scratch srr0, srr1, flags
234_GLOBAL(kvmppc_handler_\intno\()_\srr1) 353_GLOBAL(kvmppc_handler_\intno\()_\srr1)
235 mfspr r10, SPRN_SPRG_THREAD 354 mfspr r10, SPRN_SPRG_THREAD
236 GET_VCPU(r11, r10) 355 PPC_LL r11, THREAD_KVM_VCPU(r10)
237 PPC_STL r3, VCPU_GPR(R3)(r11) 356 PPC_STL r3, VCPU_GPR(R3)(r11)
238 mfspr r3, \scratch 357 mfspr r3, \scratch
239 PPC_STL r4, VCPU_GPR(R4)(r11) 358 PPC_STL r4, VCPU_GPR(R4)(r11)
@@ -295,7 +414,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
295 SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 414 SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
296kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ 415kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
297 SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 416 SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
298 417#endif
299 418
300/* Registers: 419/* Registers:
301 * SPRG_SCRATCH0: guest r10 420 * SPRG_SCRATCH0: guest r10
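
The EX_PARAMS() macro above builds the per-type scratch/save-area arguments for kvm_handler by token pasting. The C-preprocessor sketch below only demonstrates the shape of that expansion, with strings standing in for the kernel symbols.

#include <stdio.h>

/* stringified stand-in for the assembler macro's token pasting */
#define EX_PARAMS(type) \
        "EX_" #type ", SPRN_SPRG_" #type "_SCRATCH, PACA_EX" #type ", EX_R10, EX_R11"

int main(void)
{
        /* e.g. kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), SPRN_SRR0, SPRN_SRR1, 0 */
        puts(EX_PARAMS(GEN));   /* EX_GEN, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN, EX_R10, EX_R11 */
        return 0;
}
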
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index aa8b81428bf4..c70d37ed770a 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -27,8 +27,7 @@
27#define E500_TLB_NUM 2 27#define E500_TLB_NUM 2
28 28
29#define E500_TLB_VALID 1 29#define E500_TLB_VALID 1
30#define E500_TLB_DIRTY 2 30#define E500_TLB_BITMAP 2
31#define E500_TLB_BITMAP 4
32 31
33struct tlbe_ref { 32struct tlbe_ref {
34 pfn_t pfn; 33 pfn_t pfn;
@@ -130,9 +129,9 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
130 ulong value); 129 ulong value);
131int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); 130int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
132int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); 131int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
133int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb); 132int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea);
134int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb); 133int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea);
135int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb); 134int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea);
136int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); 135int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
137void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); 136void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
138 137
@@ -155,7 +154,7 @@ get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
155 154
156static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) 155static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
157{ 156{
158 return tlbe->mas2 & 0xfffff000; 157 return tlbe->mas2 & MAS2_EPN;
159} 158}
160 159
161static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) 160static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
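
get_tlb_eaddr() above switches from a hard-coded 0xfffff000 mask to MAS2_EPN so the upper 32 bits of a 64-bit effective page number are no longer dropped. A quick sketch of the difference; the MAS2_EPN value below is an assumption used only for illustration.

#include <stdint.h>
#include <stdio.h>

#define MAS2_EPN_GUESS 0xfffffffffffff000ULL    /* assumed: everything above the 4K page offset */

int main(void)
{
        uint64_t mas2 = 0x0000000123456000ULL;

        printf("old: 0x%llx\n", (unsigned long long)(mas2 & 0xfffff000));       /* high bits lost */
        printf("new: 0x%llx\n", (unsigned long long)(mas2 & MAS2_EPN_GUESS));   /* high bits kept */
        return 0;
}
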
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index e04b0ef55ce0..e78f353a836a 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -89,6 +89,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
89 int ra = get_ra(inst); 89 int ra = get_ra(inst);
90 int rb = get_rb(inst); 90 int rb = get_rb(inst);
91 int rt = get_rt(inst); 91 int rt = get_rt(inst);
92 gva_t ea;
92 93
93 switch (get_op(inst)) { 94 switch (get_op(inst)) {
94 case 31: 95 case 31:
@@ -113,15 +114,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
113 break; 114 break;
114 115
115 case XOP_TLBSX: 116 case XOP_TLBSX:
116 emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); 117 ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
118 emulated = kvmppc_e500_emul_tlbsx(vcpu, ea);
117 break; 119 break;
118 120
119 case XOP_TLBILX: 121 case XOP_TLBILX: {
120 emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb); 122 int type = rt & 0x3;
123 ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
124 emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea);
121 break; 125 break;
126 }
122 127
123 case XOP_TLBIVAX: 128 case XOP_TLBIVAX:
124 emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); 129 ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
130 emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
125 break; 131 break;
126 132
127 default: 133 default:
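
The tlbsx/tlbilx/tlbivax cases above now compute the effective address once through kvmppc_get_ea_indexed() instead of passing raw register numbers down. The helper's body is not part of this hunk, so the sketch below is an assumption based on the open-coded versions removed from e500_tlb.c further down: ra == 0 means no base register, and a 32-bit guest (MSR.CM clear) gets the upper half masked off.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gva_t;

static gva_t get_ea_indexed(const uint64_t *gpr, int ra, int rb, int msr_cm)
{
        gva_t ea = (ra ? gpr[ra] : 0) + gpr[rb];

        if (!msr_cm)
                ea &= 0xffffffffULL;    /* 32-bit computation mode */
        return ea;
}

int main(void)
{
        uint64_t gpr[32] = { 0 };

        gpr[3] = 0x100000000ULL;
        gpr[4] = 0x2000;
        printf("0x%llx\n", (unsigned long long)get_ea_indexed(gpr, 3, 4, 0));   /* 0x2000 */
        return 0;
}
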
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index ff38b664195d..cf3f18012371 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -304,17 +304,13 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
304 ref->flags = E500_TLB_VALID; 304 ref->flags = E500_TLB_VALID;
305 305
306 if (tlbe_is_writable(gtlbe)) 306 if (tlbe_is_writable(gtlbe))
307 ref->flags |= E500_TLB_DIRTY; 307 kvm_set_pfn_dirty(pfn);
308} 308}
309 309
310static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) 310static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
311{ 311{
312 if (ref->flags & E500_TLB_VALID) { 312 if (ref->flags & E500_TLB_VALID) {
313 if (ref->flags & E500_TLB_DIRTY) 313 trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
314 kvm_release_pfn_dirty(ref->pfn);
315 else
316 kvm_release_pfn_clean(ref->pfn);
317
318 ref->flags = 0; 314 ref->flags = 0;
319 } 315 }
320} 316}
@@ -357,6 +353,13 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
357 clear_tlb_privs(vcpu_e500); 353 clear_tlb_privs(vcpu_e500);
358} 354}
359 355
356void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
357{
358 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
359 clear_tlb_refs(vcpu_e500);
360 clear_tlb1_bitmap(vcpu_e500);
361}
362
360static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, 363static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
361 unsigned int eaddr, int as) 364 unsigned int eaddr, int as)
362{ 365{
@@ -412,7 +415,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
412 struct tlbe_ref *ref) 415 struct tlbe_ref *ref)
413{ 416{
414 struct kvm_memory_slot *slot; 417 struct kvm_memory_slot *slot;
415 unsigned long pfn, hva; 418 unsigned long pfn = 0; /* silence GCC warning */
419 unsigned long hva;
416 int pfnmap = 0; 420 int pfnmap = 0;
417 int tsize = BOOK3E_PAGESZ_4K; 421 int tsize = BOOK3E_PAGESZ_4K;
418 422
@@ -521,7 +525,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
521 if (likely(!pfnmap)) { 525 if (likely(!pfnmap)) {
522 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); 526 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
523 pfn = gfn_to_pfn_memslot(slot, gfn); 527 pfn = gfn_to_pfn_memslot(slot, gfn);
524 if (is_error_pfn(pfn)) { 528 if (is_error_noslot_pfn(pfn)) {
525 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", 529 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
526 (long)gfn); 530 (long)gfn);
527 return; 531 return;
@@ -541,6 +545,9 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
541 545
542 /* Clear i-cache for new pages */ 546 /* Clear i-cache for new pages */
543 kvmppc_mmu_flush_icache(pfn); 547 kvmppc_mmu_flush_icache(pfn);
548
549 /* Drop refcount on page, so that mmu notifiers can clear it */
550 kvm_release_pfn_clean(pfn);
544} 551}
545 552
546/* XXX only map the one-one case, for now use TLB0 */ 553/* XXX only map the one-one case, for now use TLB0 */
@@ -682,14 +689,11 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
682 return EMULATE_DONE; 689 return EMULATE_DONE;
683} 690}
684 691
685int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) 692int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
686{ 693{
687 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 694 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
688 unsigned int ia; 695 unsigned int ia;
689 int esel, tlbsel; 696 int esel, tlbsel;
690 gva_t ea;
691
692 ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb);
693 697
694 ia = (ea >> 2) & 0x1; 698 ia = (ea >> 2) & 0x1;
695 699
@@ -716,7 +720,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
716} 720}
717 721
718static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, 722static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
719 int pid, int rt) 723 int pid, int type)
720{ 724{
721 struct kvm_book3e_206_tlb_entry *tlbe; 725 struct kvm_book3e_206_tlb_entry *tlbe;
722 int tid, esel; 726 int tid, esel;
@@ -725,7 +729,7 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
725 for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { 729 for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) {
726 tlbe = get_entry(vcpu_e500, tlbsel, esel); 730 tlbe = get_entry(vcpu_e500, tlbsel, esel);
727 tid = get_tlb_tid(tlbe); 731 tid = get_tlb_tid(tlbe);
728 if (rt == 0 || tid == pid) { 732 if (type == 0 || tid == pid) {
729 inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); 733 inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
730 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); 734 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
731 } 735 }
@@ -733,14 +737,9 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
733} 737}
734 738
735static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, 739static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
736 int ra, int rb) 740 gva_t ea)
737{ 741{
738 int tlbsel, esel; 742 int tlbsel, esel;
739 gva_t ea;
740
741 ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb);
742 if (ra)
743 ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra);
744 743
745 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 744 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
746 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); 745 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1);
@@ -752,16 +751,16 @@ static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
752 } 751 }
753} 752}
754 753
755int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb) 754int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea)
756{ 755{
757 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 756 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
758 int pid = get_cur_spid(vcpu); 757 int pid = get_cur_spid(vcpu);
759 758
760 if (rt == 0 || rt == 1) { 759 if (type == 0 || type == 1) {
761 tlbilx_all(vcpu_e500, 0, pid, rt); 760 tlbilx_all(vcpu_e500, 0, pid, type);
762 tlbilx_all(vcpu_e500, 1, pid, rt); 761 tlbilx_all(vcpu_e500, 1, pid, type);
763 } else if (rt == 3) { 762 } else if (type == 3) {
764 tlbilx_one(vcpu_e500, pid, ra, rb); 763 tlbilx_one(vcpu_e500, pid, ea);
765 } 764 }
766 765
767 return EMULATE_DONE; 766 return EMULATE_DONE;
@@ -786,16 +785,13 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
786 return EMULATE_DONE; 785 return EMULATE_DONE;
787} 786}
788 787
789int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) 788int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
790{ 789{
791 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 790 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
792 int as = !!get_cur_sas(vcpu); 791 int as = !!get_cur_sas(vcpu);
793 unsigned int pid = get_cur_spid(vcpu); 792 unsigned int pid = get_cur_spid(vcpu);
794 int esel, tlbsel; 793 int esel, tlbsel;
795 struct kvm_book3e_206_tlb_entry *gtlbe = NULL; 794 struct kvm_book3e_206_tlb_entry *gtlbe = NULL;
796 gva_t ea;
797
798 ea = kvmppc_get_gpr(vcpu, rb);
799 795
800 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 796 for (tlbsel = 0; tlbsel < 2; tlbsel++) {
801 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); 797 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
@@ -875,6 +871,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
875 871
876 gtlbe->mas1 = vcpu->arch.shared->mas1; 872 gtlbe->mas1 = vcpu->arch.shared->mas1;
877 gtlbe->mas2 = vcpu->arch.shared->mas2; 873 gtlbe->mas2 = vcpu->arch.shared->mas2;
874 if (!(vcpu->arch.shared->msr & MSR_CM))
875 gtlbe->mas2 &= 0xffffffffUL;
878 gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; 876 gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
879 877
880 trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, 878 trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
@@ -1039,8 +1037,12 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
1039 sesel = 0; /* unused */ 1037 sesel = 0; /* unused */
1040 priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; 1038 priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
1041 1039
1042 kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, 1040 /* Only triggers after clear_tlb_refs */
1043 &priv->ref, eaddr, &stlbe); 1041 if (unlikely(!(priv->ref.flags & E500_TLB_VALID)))
1042 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
1043 else
1044 kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
1045 &priv->ref, eaddr, &stlbe);
1044 break; 1046 break;
1045 1047
1046 case 1: { 1048 case 1: {
@@ -1060,6 +1062,49 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
1060 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); 1062 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
1061} 1063}
1062 1064
1065/************* MMU Notifiers *************/
1066
1067int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1068{
1069 trace_kvm_unmap_hva(hva);
1070
1071 /*
 1072 * Flush all shadow TLB entries everywhere. This is slow, but
 1073 * it guarantees that we catch the page about to be unmapped.
1074 */
1075 kvm_flush_remote_tlbs(kvm);
1076
1077 return 0;
1078}
1079
1080int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1081{
 1082 /* kvm_unmap_hva flushes everything anyway */
1083 kvm_unmap_hva(kvm, start);
1084
1085 return 0;
1086}
1087
1088int kvm_age_hva(struct kvm *kvm, unsigned long hva)
1089{
1090 /* XXX could be more clever ;) */
1091 return 0;
1092}
1093
1094int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1095{
1096 /* XXX could be more clever ;) */
1097 return 0;
1098}
1099
1100void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1101{
1102 /* The page will get remapped properly on its next fault */
1103 kvm_unmap_hva(kvm, hva);
1104}
1105
1106/*****************************************/
1107
1063static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) 1108static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
1064{ 1109{
1065 int i; 1110 int i;
@@ -1081,6 +1126,8 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
1081 } 1126 }
1082 1127
1083 vcpu_e500->num_shared_tlb_pages = 0; 1128 vcpu_e500->num_shared_tlb_pages = 0;
1129
1130 kfree(vcpu_e500->shared_tlb_pages);
1084 vcpu_e500->shared_tlb_pages = NULL; 1131 vcpu_e500->shared_tlb_pages = NULL;
1085 } else { 1132 } else {
1086 kfree(vcpu_e500->gtlb_arch); 1133 kfree(vcpu_e500->gtlb_arch);
@@ -1178,21 +1225,27 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
1178 } 1225 }
1179 1226
1180 virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); 1227 virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
1181 if (!virt) 1228 if (!virt) {
1229 ret = -ENOMEM;
1182 goto err_put_page; 1230 goto err_put_page;
1231 }
1183 1232
1184 privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], 1233 privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
1185 GFP_KERNEL); 1234 GFP_KERNEL);
1186 privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], 1235 privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
1187 GFP_KERNEL); 1236 GFP_KERNEL);
1188 1237
1189 if (!privs[0] || !privs[1]) 1238 if (!privs[0] || !privs[1]) {
1190 goto err_put_page; 1239 ret = -ENOMEM;
1240 goto err_privs;
1241 }
1191 1242
1192 g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], 1243 g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
1193 GFP_KERNEL); 1244 GFP_KERNEL);
1194 if (!g2h_bitmap) 1245 if (!g2h_bitmap) {
1195 goto err_put_page; 1246 ret = -ENOMEM;
1247 goto err_privs;
1248 }
1196 1249
1197 free_gtlb(vcpu_e500); 1250 free_gtlb(vcpu_e500);
1198 1251
@@ -1232,10 +1285,11 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
1232 kvmppc_recalc_tlb1map_range(vcpu_e500); 1285 kvmppc_recalc_tlb1map_range(vcpu_e500);
1233 return 0; 1286 return 0;
1234 1287
1235err_put_page: 1288err_privs:
1236 kfree(privs[0]); 1289 kfree(privs[0]);
1237 kfree(privs[1]); 1290 kfree(privs[1]);
1238 1291
1292err_put_page:
1239 for (i = 0; i < num_pages; i++) 1293 for (i = 0; i < num_pages; i++)
1240 put_page(pages[i]); 1294 put_page(pages[i]);
1241 1295
@@ -1332,7 +1386,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1332 if (!vcpu_e500->gtlb_priv[1]) 1386 if (!vcpu_e500->gtlb_priv[1])
1333 goto err; 1387 goto err;
1334 1388
1335 vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) * 1389 vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
1336 vcpu_e500->gtlb_params[1].entries, 1390 vcpu_e500->gtlb_params[1].entries,
1337 GFP_KERNEL); 1391 GFP_KERNEL);
1338 if (!vcpu_e500->g2h_tlb1_map) 1392 if (!vcpu_e500->g2h_tlb1_map)
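
The kvm_vcpu_ioctl_config_tlb() error paths above are reworked so each failed allocation returns -ENOMEM and unwinds only what already exists (the new err_privs label sits before the existing err_put_page). A stand-alone sketch of that unwind ordering, with malloc/free standing in for the kernel allocations and page references:

#include <errno.h>
#include <stdlib.h>

static int configure(void **out)
{
        void *pages, *privs, *bitmap;
        int ret;

        pages = malloc(64);
        if (!pages)
                return -ENOMEM;

        privs = malloc(64);
        if (!privs) {
                ret = -ENOMEM;
                goto err_put_page;              /* only the pages exist so far */
        }

        bitmap = malloc(64);
        if (!bitmap) {
                ret = -ENOMEM;
                goto err_privs;                 /* free privs, then the pages  */
        }

        out[0] = pages; out[1] = privs; out[2] = bitmap;
        return 0;

err_privs:
        free(privs);
err_put_page:
        free(pages);
        return ret;
}

int main(void)
{
        void *bufs[3];

        if (configure(bufs) == 0) {
                free(bufs[2]);
                free(bufs[1]);
                free(bufs[0]);
        }
        return 0;
}
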
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index ee04abaefe23..b0855e5d8905 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -131,6 +131,125 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
131 return vcpu->arch.dec - jd; 131 return vcpu->arch.dec - jd;
132} 132}
133 133
134static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
135{
136 enum emulation_result emulated = EMULATE_DONE;
137 ulong spr_val = kvmppc_get_gpr(vcpu, rs);
138
139 switch (sprn) {
140 case SPRN_SRR0:
141 vcpu->arch.shared->srr0 = spr_val;
142 break;
143 case SPRN_SRR1:
144 vcpu->arch.shared->srr1 = spr_val;
145 break;
146
147 /* XXX We need to context-switch the timebase for
148 * watchdog and FIT. */
149 case SPRN_TBWL: break;
150 case SPRN_TBWU: break;
151
152 case SPRN_MSSSR0: break;
153
154 case SPRN_DEC:
155 vcpu->arch.dec = spr_val;
156 kvmppc_emulate_dec(vcpu);
157 break;
158
159 case SPRN_SPRG0:
160 vcpu->arch.shared->sprg0 = spr_val;
161 break;
162 case SPRN_SPRG1:
163 vcpu->arch.shared->sprg1 = spr_val;
164 break;
165 case SPRN_SPRG2:
166 vcpu->arch.shared->sprg2 = spr_val;
167 break;
168 case SPRN_SPRG3:
169 vcpu->arch.shared->sprg3 = spr_val;
170 break;
171
172 default:
173 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
174 spr_val);
175 if (emulated == EMULATE_FAIL)
176 printk(KERN_INFO "mtspr: unknown spr "
177 "0x%x\n", sprn);
178 break;
179 }
180
181 kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
182
183 return emulated;
184}
185
186static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
187{
188 enum emulation_result emulated = EMULATE_DONE;
189 ulong spr_val = 0;
190
191 switch (sprn) {
192 case SPRN_SRR0:
193 spr_val = vcpu->arch.shared->srr0;
194 break;
195 case SPRN_SRR1:
196 spr_val = vcpu->arch.shared->srr1;
197 break;
198 case SPRN_PVR:
199 spr_val = vcpu->arch.pvr;
200 break;
201 case SPRN_PIR:
202 spr_val = vcpu->vcpu_id;
203 break;
204 case SPRN_MSSSR0:
205 spr_val = 0;
206 break;
207
208 /* Note: mftb and TBRL/TBWL are user-accessible, so
 209 * the guest can always access the real TB anyway.
 210 * In fact, we will probably never see these traps. */
211 case SPRN_TBWL:
212 spr_val = get_tb() >> 32;
213 break;
214 case SPRN_TBWU:
215 spr_val = get_tb();
216 break;
217
218 case SPRN_SPRG0:
219 spr_val = vcpu->arch.shared->sprg0;
220 break;
221 case SPRN_SPRG1:
222 spr_val = vcpu->arch.shared->sprg1;
223 break;
224 case SPRN_SPRG2:
225 spr_val = vcpu->arch.shared->sprg2;
226 break;
227 case SPRN_SPRG3:
228 spr_val = vcpu->arch.shared->sprg3;
229 break;
230 /* Note: SPRG4-7 are user-readable, so we don't get
231 * a trap. */
232
233 case SPRN_DEC:
234 spr_val = kvmppc_get_dec(vcpu, get_tb());
235 break;
236 default:
237 emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
238 &spr_val);
239 if (unlikely(emulated == EMULATE_FAIL)) {
240 printk(KERN_INFO "mfspr: unknown spr "
241 "0x%x\n", sprn);
242 }
243 break;
244 }
245
246 if (emulated == EMULATE_DONE)
247 kvmppc_set_gpr(vcpu, rt, spr_val);
248 kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
249
250 return emulated;
251}
252
134/* XXX to do: 253/* XXX to do:
135 * lhax 254 * lhax
136 * lhaux 255 * lhaux
@@ -156,7 +275,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
156 int sprn = get_sprn(inst); 275 int sprn = get_sprn(inst);
157 enum emulation_result emulated = EMULATE_DONE; 276 enum emulation_result emulated = EMULATE_DONE;
158 int advance = 1; 277 int advance = 1;
159 ulong spr_val = 0;
160 278
161 /* this default type might be overwritten by subcategories */ 279 /* this default type might be overwritten by subcategories */
162 kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); 280 kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
@@ -236,62 +354,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
236 break; 354 break;
237 355
238 case OP_31_XOP_MFSPR: 356 case OP_31_XOP_MFSPR:
239 switch (sprn) { 357 emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);
240 case SPRN_SRR0:
241 spr_val = vcpu->arch.shared->srr0;
242 break;
243 case SPRN_SRR1:
244 spr_val = vcpu->arch.shared->srr1;
245 break;
246 case SPRN_PVR:
247 spr_val = vcpu->arch.pvr;
248 break;
249 case SPRN_PIR:
250 spr_val = vcpu->vcpu_id;
251 break;
252 case SPRN_MSSSR0:
253 spr_val = 0;
254 break;
255
256 /* Note: mftb and TBRL/TBWL are user-accessible, so
257 * the guest can always access the real TB anyways.
258 * In fact, we probably will never see these traps. */
259 case SPRN_TBWL:
260 spr_val = get_tb() >> 32;
261 break;
262 case SPRN_TBWU:
263 spr_val = get_tb();
264 break;
265
266 case SPRN_SPRG0:
267 spr_val = vcpu->arch.shared->sprg0;
268 break;
269 case SPRN_SPRG1:
270 spr_val = vcpu->arch.shared->sprg1;
271 break;
272 case SPRN_SPRG2:
273 spr_val = vcpu->arch.shared->sprg2;
274 break;
275 case SPRN_SPRG3:
276 spr_val = vcpu->arch.shared->sprg3;
277 break;
278 /* Note: SPRG4-7 are user-readable, so we don't get
279 * a trap. */
280
281 case SPRN_DEC:
282 spr_val = kvmppc_get_dec(vcpu, get_tb());
283 break;
284 default:
285 emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
286 &spr_val);
287 if (unlikely(emulated == EMULATE_FAIL)) {
288 printk(KERN_INFO "mfspr: unknown spr "
289 "0x%x\n", sprn);
290 }
291 break;
292 }
293 kvmppc_set_gpr(vcpu, rt, spr_val);
294 kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
295 break; 358 break;
296 359
297 case OP_31_XOP_STHX: 360 case OP_31_XOP_STHX:
@@ -308,49 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
308 break; 371 break;
309 372
310 case OP_31_XOP_MTSPR: 373 case OP_31_XOP_MTSPR:
311 spr_val = kvmppc_get_gpr(vcpu, rs); 374 emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
312 switch (sprn) {
313 case SPRN_SRR0:
314 vcpu->arch.shared->srr0 = spr_val;
315 break;
316 case SPRN_SRR1:
317 vcpu->arch.shared->srr1 = spr_val;
318 break;
319
320 /* XXX We need to context-switch the timebase for
321 * watchdog and FIT. */
322 case SPRN_TBWL: break;
323 case SPRN_TBWU: break;
324
325 case SPRN_MSSSR0: break;
326
327 case SPRN_DEC:
328 vcpu->arch.dec = spr_val;
329 kvmppc_emulate_dec(vcpu);
330 break;
331
332 case SPRN_SPRG0:
333 vcpu->arch.shared->sprg0 = spr_val;
334 break;
335 case SPRN_SPRG1:
336 vcpu->arch.shared->sprg1 = spr_val;
337 break;
338 case SPRN_SPRG2:
339 vcpu->arch.shared->sprg2 = spr_val;
340 break;
341 case SPRN_SPRG3:
342 vcpu->arch.shared->sprg3 = spr_val;
343 break;
344
345 default:
346 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
347 spr_val);
348 if (emulated == EMULATE_FAIL)
349 printk(KERN_INFO "mtspr: unknown spr "
350 "0x%x\n", sprn);
351 break;
352 }
353 kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
354 break; 375 break;
355 376
356 case OP_31_XOP_DCBI: 377 case OP_31_XOP_DCBI:
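
The emulate.c change above is a pure refactor: the large OP_31_XOP_MFSPR/MTSPR switch bodies move into kvmppc_emulate_mfspr()/kvmppc_emulate_mtspr(), each handling the generic SPRs and falling back to the per-core hook for everything else. Below is a compact model of that two-level dispatch; the names and SPR numbers are illustrative, not the kernel API.

#include <stdio.h>

enum emu_result { EMULATE_DONE, EMULATE_FAIL };

/* stand-in for the subarch hook (kvmppc_core_emulate_mfspr) */
static enum emu_result core_emulate_mfspr(int sprn, unsigned long *val)
{
        (void)sprn;
        (void)val;
        return EMULATE_FAIL;
}

static enum emu_result emulate_mfspr(int sprn, unsigned long *val)
{
        switch (sprn) {
        case 26:                                /* a generic SPR, e.g. SRR0 */
                *val = 0x1234;
                return EMULATE_DONE;
        default:
                if (core_emulate_mfspr(sprn, val) == EMULATE_FAIL) {
                        printf("mfspr: unknown spr 0x%x\n", sprn);
                        return EMULATE_FAIL;
                }
                return EMULATE_DONE;
        }
}

int main(void)
{
        unsigned long v = 0;

        emulate_mfspr(26, &v);          /* handled generically             */
        emulate_mfspr(999, &v);         /* falls through to the core hook  */
        return 0;
}
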
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4d213b8b0fb5..70739a089560 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -30,6 +30,7 @@
30#include <asm/kvm_ppc.h> 30#include <asm/kvm_ppc.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/cputhreads.h> 32#include <asm/cputhreads.h>
33#include <asm/irqflags.h>
33#include "timing.h" 34#include "timing.h"
34#include "../mm/mmu_decl.h" 35#include "../mm/mmu_decl.h"
35 36
@@ -38,8 +39,7 @@
38 39
39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 40int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
40{ 41{
41 return !(v->arch.shared->msr & MSR_WE) || 42 return !!(v->arch.pending_exceptions) ||
42 !!(v->arch.pending_exceptions) ||
43 v->requests; 43 v->requests;
44} 44}
45 45
@@ -48,6 +48,85 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
48 return 1; 48 return 1;
49} 49}
50 50
51#ifndef CONFIG_KVM_BOOK3S_64_HV
52/*
53 * Common checks before entering the guest world. Call with interrupts
54 * disabled.
55 *
56 * returns:
57 *
58 * == 1 if we're ready to go into guest state
59 * <= 0 if we need to go back to the host with return value
60 */
61int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
62{
63 int r = 1;
64
65 WARN_ON_ONCE(!irqs_disabled());
66 while (true) {
67 if (need_resched()) {
68 local_irq_enable();
69 cond_resched();
70 local_irq_disable();
71 continue;
72 }
73
74 if (signal_pending(current)) {
75 kvmppc_account_exit(vcpu, SIGNAL_EXITS);
76 vcpu->run->exit_reason = KVM_EXIT_INTR;
77 r = -EINTR;
78 break;
79 }
80
81 vcpu->mode = IN_GUEST_MODE;
82
83 /*
84 * Reading vcpu->requests must happen after setting vcpu->mode,
85 * so we don't miss a request because the requester sees
86 * OUTSIDE_GUEST_MODE and assumes we'll be checking requests
87 * before next entering the guest (and thus doesn't IPI).
88 */
89 smp_mb();
90
91 if (vcpu->requests) {
 92 /* Make sure we process requests with preemption enabled */
93 local_irq_enable();
94 trace_kvm_check_requests(vcpu);
95 r = kvmppc_core_check_requests(vcpu);
96 local_irq_disable();
97 if (r > 0)
98 continue;
99 break;
100 }
101
102 if (kvmppc_core_prepare_to_enter(vcpu)) {
103 /* interrupts got enabled in between, so we
104 are back at square 1 */
105 continue;
106 }
107
108#ifdef CONFIG_PPC64
109 /* lazy EE magic */
110 hard_irq_disable();
111 if (lazy_irq_pending()) {
112 /* Got an interrupt in between, try again */
113 local_irq_enable();
114 local_irq_disable();
115 kvm_guest_exit();
116 continue;
117 }
118
119 trace_hardirqs_on();
120#endif
121
122 kvm_guest_enter();
123 break;
124 }
125
126 return r;
127}
128#endif /* CONFIG_KVM_BOOK3S_64_HV */
129
51int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) 130int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
52{ 131{
53 int nr = kvmppc_get_gpr(vcpu, 11); 132 int nr = kvmppc_get_gpr(vcpu, 11);
@@ -67,18 +146,18 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
67 } 146 }
68 147
69 switch (nr) { 148 switch (nr) {
70 case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE: 149 case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):
71 { 150 {
72 vcpu->arch.magic_page_pa = param1; 151 vcpu->arch.magic_page_pa = param1;
73 vcpu->arch.magic_page_ea = param2; 152 vcpu->arch.magic_page_ea = param2;
74 153
75 r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; 154 r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
76 155
77 r = HC_EV_SUCCESS; 156 r = EV_SUCCESS;
78 break; 157 break;
79 } 158 }
80 case HC_VENDOR_KVM | KVM_HC_FEATURES: 159 case KVM_HCALL_TOKEN(KVM_HC_FEATURES):
81 r = HC_EV_SUCCESS; 160 r = EV_SUCCESS;
82#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) 161#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2)
83 /* XXX Missing magic page on 44x */ 162 /* XXX Missing magic page on 44x */
84 r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); 163 r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
@@ -86,8 +165,13 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
86 165
87 /* Second return value is in r4 */ 166 /* Second return value is in r4 */
88 break; 167 break;
168 case EV_HCALL_TOKEN(EV_IDLE):
169 r = EV_SUCCESS;
170 kvm_vcpu_block(vcpu);
171 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
172 break;
89 default: 173 default:
90 r = HC_EV_UNIMPLEMENTED; 174 r = EV_UNIMPLEMENTED;
91 break; 175 break;
92 } 176 }
93 177
@@ -220,6 +304,7 @@ int kvm_dev_ioctl_check_extension(long ext)
220 switch (ext) { 304 switch (ext) {
221#ifdef CONFIG_BOOKE 305#ifdef CONFIG_BOOKE
222 case KVM_CAP_PPC_BOOKE_SREGS: 306 case KVM_CAP_PPC_BOOKE_SREGS:
307 case KVM_CAP_PPC_BOOKE_WATCHDOG:
223#else 308#else
224 case KVM_CAP_PPC_SEGSTATE: 309 case KVM_CAP_PPC_SEGSTATE:
225 case KVM_CAP_PPC_HIOR: 310 case KVM_CAP_PPC_HIOR:
@@ -229,6 +314,7 @@ int kvm_dev_ioctl_check_extension(long ext)
229 case KVM_CAP_PPC_IRQ_LEVEL: 314 case KVM_CAP_PPC_IRQ_LEVEL:
230 case KVM_CAP_ENABLE_CAP: 315 case KVM_CAP_ENABLE_CAP:
231 case KVM_CAP_ONE_REG: 316 case KVM_CAP_ONE_REG:
317 case KVM_CAP_IOEVENTFD:
232 r = 1; 318 r = 1;
233 break; 319 break;
234#ifndef CONFIG_KVM_BOOK3S_64_HV 320#ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -260,10 +346,22 @@ int kvm_dev_ioctl_check_extension(long ext)
260 if (cpu_has_feature(CPU_FTR_ARCH_201)) 346 if (cpu_has_feature(CPU_FTR_ARCH_201))
261 r = 2; 347 r = 2;
262 break; 348 break;
349#endif
263 case KVM_CAP_SYNC_MMU: 350 case KVM_CAP_SYNC_MMU:
351#ifdef CONFIG_KVM_BOOK3S_64_HV
264 r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; 352 r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
353#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER)
354 r = 1;
355#else
356 r = 0;
357 break;
358#endif
359#ifdef CONFIG_KVM_BOOK3S_64_HV
360 case KVM_CAP_PPC_HTAB_FD:
361 r = 1;
265 break; 362 break;
266#endif 363#endif
364 break;
267 case KVM_CAP_NR_VCPUS: 365 case KVM_CAP_NR_VCPUS:
268 /* 366 /*
269 * Recommending a number of CPUs is somewhat arbitrary; we 367 * Recommending a number of CPUs is somewhat arbitrary; we
@@ -302,19 +400,12 @@ long kvm_arch_dev_ioctl(struct file *filp,
302void kvm_arch_free_memslot(struct kvm_memory_slot *free, 400void kvm_arch_free_memslot(struct kvm_memory_slot *free,
303 struct kvm_memory_slot *dont) 401 struct kvm_memory_slot *dont)
304{ 402{
305 if (!dont || free->arch.rmap != dont->arch.rmap) { 403 kvmppc_core_free_memslot(free, dont);
306 vfree(free->arch.rmap);
307 free->arch.rmap = NULL;
308 }
309} 404}
310 405
311int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) 406int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
312{ 407{
313 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); 408 return kvmppc_core_create_memslot(slot, npages);
314 if (!slot->arch.rmap)
315 return -ENOMEM;
316
317 return 0;
318} 409}
319 410
320int kvm_arch_prepare_memory_region(struct kvm *kvm, 411int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -323,7 +414,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
323 struct kvm_userspace_memory_region *mem, 414 struct kvm_userspace_memory_region *mem,
324 int user_alloc) 415 int user_alloc)
325{ 416{
326 return kvmppc_core_prepare_memory_region(kvm, mem); 417 return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
327} 418}
328 419
329void kvm_arch_commit_memory_region(struct kvm *kvm, 420void kvm_arch_commit_memory_region(struct kvm *kvm,
@@ -331,7 +422,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
331 struct kvm_memory_slot old, 422 struct kvm_memory_slot old,
332 int user_alloc) 423 int user_alloc)
333{ 424{
334 kvmppc_core_commit_memory_region(kvm, mem); 425 kvmppc_core_commit_memory_region(kvm, mem, old);
335} 426}
336 427
337void kvm_arch_flush_shadow_all(struct kvm *kvm) 428void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -341,6 +432,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
341void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 432void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
342 struct kvm_memory_slot *slot) 433 struct kvm_memory_slot *slot)
343{ 434{
435 kvmppc_core_flush_memslot(kvm, slot);
344} 436}
345 437
346struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) 438struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
@@ -354,6 +446,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
354 return vcpu; 446 return vcpu;
355} 447}
356 448
449int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
450{
451 return 0;
452}
453
357void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 454void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
358{ 455{
359 /* Make sure we're not using the vcpu anymore */ 456 /* Make sure we're not using the vcpu anymore */
@@ -390,6 +487,8 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
390 487
391int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 488int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
392{ 489{
490 int ret;
491
393 hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 492 hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
394 tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); 493 tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
395 vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; 494 vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
@@ -398,13 +497,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
398#ifdef CONFIG_KVM_EXIT_TIMING 497#ifdef CONFIG_KVM_EXIT_TIMING
399 mutex_init(&vcpu->arch.exit_timing_lock); 498 mutex_init(&vcpu->arch.exit_timing_lock);
400#endif 499#endif
401 500 ret = kvmppc_subarch_vcpu_init(vcpu);
402 return 0; 501 return ret;
403} 502}
404 503
405void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 504void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
406{ 505{
407 kvmppc_mmu_destroy(vcpu); 506 kvmppc_mmu_destroy(vcpu);
507 kvmppc_subarch_vcpu_uninit(vcpu);
408} 508}
409 509
410void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 510void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -420,7 +520,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
420 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); 520 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
421#endif 521#endif
422 kvmppc_core_vcpu_load(vcpu, cpu); 522 kvmppc_core_vcpu_load(vcpu, cpu);
423 vcpu->cpu = smp_processor_id();
424} 523}
425 524
426void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 525void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -429,7 +528,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
429#ifdef CONFIG_BOOKE 528#ifdef CONFIG_BOOKE
430 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); 529 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
431#endif 530#endif
432 vcpu->cpu = -1;
433} 531}
434 532
435int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 533int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -527,6 +625,13 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
527 vcpu->mmio_is_write = 0; 625 vcpu->mmio_is_write = 0;
528 vcpu->arch.mmio_sign_extend = 0; 626 vcpu->arch.mmio_sign_extend = 0;
529 627
628 if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
629 bytes, &run->mmio.data)) {
630 kvmppc_complete_mmio_load(vcpu, run);
631 vcpu->mmio_needed = 0;
632 return EMULATE_DONE;
633 }
634
530 return EMULATE_DO_MMIO; 635 return EMULATE_DO_MMIO;
531} 636}
532 637
@@ -536,8 +641,8 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
536{ 641{
537 int r; 642 int r;
538 643
539 r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
540 vcpu->arch.mmio_sign_extend = 1; 644 vcpu->arch.mmio_sign_extend = 1;
645 r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
541 646
542 return r; 647 return r;
543} 648}
@@ -575,6 +680,13 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
575 } 680 }
576 } 681 }
577 682
683 if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
684 bytes, &run->mmio.data)) {
685 kvmppc_complete_mmio_load(vcpu, run);
686 vcpu->mmio_needed = 0;
687 return EMULATE_DONE;
688 }
689
578 return EMULATE_DO_MMIO; 690 return EMULATE_DO_MMIO;
579} 691}
580 692
@@ -649,6 +761,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
649 r = 0; 761 r = 0;
650 vcpu->arch.papr_enabled = true; 762 vcpu->arch.papr_enabled = true;
651 break; 763 break;
764#ifdef CONFIG_BOOKE
765 case KVM_CAP_PPC_BOOKE_WATCHDOG:
766 r = 0;
767 vcpu->arch.watchdog_enabled = true;
768 break;
769#endif
652#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) 770#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
653 case KVM_CAP_SW_TLB: { 771 case KVM_CAP_SW_TLB: {
654 struct kvm_config_tlb cfg; 772 struct kvm_config_tlb cfg;
@@ -751,9 +869,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
751 869
752static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) 870static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
753{ 871{
872 u32 inst_nop = 0x60000000;
873#ifdef CONFIG_KVM_BOOKE_HV
874 u32 inst_sc1 = 0x44000022;
875 pvinfo->hcall[0] = inst_sc1;
876 pvinfo->hcall[1] = inst_nop;
877 pvinfo->hcall[2] = inst_nop;
878 pvinfo->hcall[3] = inst_nop;
879#else
754 u32 inst_lis = 0x3c000000; 880 u32 inst_lis = 0x3c000000;
755 u32 inst_ori = 0x60000000; 881 u32 inst_ori = 0x60000000;
756 u32 inst_nop = 0x60000000;
757 u32 inst_sc = 0x44000002; 882 u32 inst_sc = 0x44000002;
758 u32 inst_imm_mask = 0xffff; 883 u32 inst_imm_mask = 0xffff;
759 884
@@ -770,6 +895,9 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
770 pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); 895 pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask);
771 pvinfo->hcall[2] = inst_sc; 896 pvinfo->hcall[2] = inst_sc;
772 pvinfo->hcall[3] = inst_nop; 897 pvinfo->hcall[3] = inst_nop;
898#endif
899
900 pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;
773 901
774 return 0; 902 return 0;
775} 903}
@@ -832,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
832 r = 0; 960 r = 0;
833 break; 961 break;
834 } 962 }
963
964 case KVM_PPC_GET_HTAB_FD: {
965 struct kvm *kvm = filp->private_data;
966 struct kvm_get_htab_fd ghf;
967
968 r = -EFAULT;
969 if (copy_from_user(&ghf, argp, sizeof(ghf)))
970 break;
971 r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
972 break;
973 }
835#endif /* CONFIG_KVM_BOOK3S_64_HV */ 974#endif /* CONFIG_KVM_BOOK3S_64_HV */
836 975
837#ifdef CONFIG_PPC_BOOK3S_64 976#ifdef CONFIG_PPC_BOOK3S_64
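
Two related powerpc.c changes above are worth connecting: kvmppc_handle_load() can now complete an MMIO access in-kernel via kvm_io_bus_read(), which is why kvmppc_handle_loads() must set mmio_sign_extend before calling it rather than after. The stand-alone sketch below models that ordering constraint with simplified stand-in types; it is not the KVM API.

#include <stdint.h>
#include <stdio.h>

struct fake_vcpu {
        int mmio_sign_extend;
        int64_t gpr;
};

static void complete_load(struct fake_vcpu *v, uint32_t raw, int bytes)
{
        if (v->mmio_sign_extend && bytes == 2)
                v->gpr = (int16_t)raw;          /* sign-extend a halfword load */
        else
                v->gpr = raw;
}

static void handle_load(struct fake_vcpu *v, uint32_t raw, int bytes)
{
        /* fast path: data already available in-kernel, complete right away */
        complete_load(v, raw, bytes);
}

int main(void)
{
        struct fake_vcpu v = { 0, 0 };

        v.mmio_sign_extend = 1;                 /* must be set before handle_load() */
        handle_load(&v, 0xffff, 2);
        printf("%lld\n", (long long)v.gpr);     /* -1: the flag was honoured */
        return 0;
}
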
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index ddb6a2149d44..e326489a5420 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -31,6 +31,126 @@ TRACE_EVENT(kvm_ppc_instr,
31 __entry->inst, __entry->pc, __entry->emulate) 31 __entry->inst, __entry->pc, __entry->emulate)
32); 32);
33 33
34#ifdef CONFIG_PPC_BOOK3S
35#define kvm_trace_symbol_exit \
36 {0x100, "SYSTEM_RESET"}, \
37 {0x200, "MACHINE_CHECK"}, \
38 {0x300, "DATA_STORAGE"}, \
39 {0x380, "DATA_SEGMENT"}, \
40 {0x400, "INST_STORAGE"}, \
41 {0x480, "INST_SEGMENT"}, \
42 {0x500, "EXTERNAL"}, \
43 {0x501, "EXTERNAL_LEVEL"}, \
44 {0x502, "EXTERNAL_HV"}, \
45 {0x600, "ALIGNMENT"}, \
46 {0x700, "PROGRAM"}, \
47 {0x800, "FP_UNAVAIL"}, \
48 {0x900, "DECREMENTER"}, \
49 {0x980, "HV_DECREMENTER"}, \
50 {0xc00, "SYSCALL"}, \
51 {0xd00, "TRACE"}, \
52 {0xe00, "H_DATA_STORAGE"}, \
53 {0xe20, "H_INST_STORAGE"}, \
54 {0xe40, "H_EMUL_ASSIST"}, \
55 {0xf00, "PERFMON"}, \
56 {0xf20, "ALTIVEC"}, \
57 {0xf40, "VSX"}
58#else
59#define kvm_trace_symbol_exit \
60 {0, "CRITICAL"}, \
61 {1, "MACHINE_CHECK"}, \
62 {2, "DATA_STORAGE"}, \
63 {3, "INST_STORAGE"}, \
64 {4, "EXTERNAL"}, \
65 {5, "ALIGNMENT"}, \
66 {6, "PROGRAM"}, \
67 {7, "FP_UNAVAIL"}, \
68 {8, "SYSCALL"}, \
69 {9, "AP_UNAVAIL"}, \
70 {10, "DECREMENTER"}, \
71 {11, "FIT"}, \
72 {12, "WATCHDOG"}, \
73 {13, "DTLB_MISS"}, \
74 {14, "ITLB_MISS"}, \
75 {15, "DEBUG"}, \
76 {32, "SPE_UNAVAIL"}, \
77 {33, "SPE_FP_DATA"}, \
78 {34, "SPE_FP_ROUND"}, \
79 {35, "PERFORMANCE_MONITOR"}, \
80 {36, "DOORBELL"}, \
81 {37, "DOORBELL_CRITICAL"}, \
82 {38, "GUEST_DBELL"}, \
83 {39, "GUEST_DBELL_CRIT"}, \
84 {40, "HV_SYSCALL"}, \
85 {41, "HV_PRIV"}
86#endif
87
88TRACE_EVENT(kvm_exit,
89 TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
90 TP_ARGS(exit_nr, vcpu),
91
92 TP_STRUCT__entry(
93 __field( unsigned int, exit_nr )
94 __field( unsigned long, pc )
95 __field( unsigned long, msr )
96 __field( unsigned long, dar )
97#ifdef CONFIG_KVM_BOOK3S_PR
98 __field( unsigned long, srr1 )
99#endif
100 __field( unsigned long, last_inst )
101 ),
102
103 TP_fast_assign(
104#ifdef CONFIG_KVM_BOOK3S_PR
105 struct kvmppc_book3s_shadow_vcpu *svcpu;
106#endif
107 __entry->exit_nr = exit_nr;
108 __entry->pc = kvmppc_get_pc(vcpu);
109 __entry->dar = kvmppc_get_fault_dar(vcpu);
110 __entry->msr = vcpu->arch.shared->msr;
111#ifdef CONFIG_KVM_BOOK3S_PR
112 svcpu = svcpu_get(vcpu);
113 __entry->srr1 = svcpu->shadow_srr1;
114 svcpu_put(svcpu);
115#endif
116 __entry->last_inst = vcpu->arch.last_inst;
117 ),
118
119 TP_printk("exit=%s"
120 " | pc=0x%lx"
121 " | msr=0x%lx"
122 " | dar=0x%lx"
123#ifdef CONFIG_KVM_BOOK3S_PR
124 " | srr1=0x%lx"
125#endif
126 " | last_inst=0x%lx"
127 ,
128 __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
129 __entry->pc,
130 __entry->msr,
131 __entry->dar,
132#ifdef CONFIG_KVM_BOOK3S_PR
133 __entry->srr1,
134#endif
135 __entry->last_inst
136 )
137);
138
139TRACE_EVENT(kvm_unmap_hva,
140 TP_PROTO(unsigned long hva),
141 TP_ARGS(hva),
142
143 TP_STRUCT__entry(
144 __field( unsigned long, hva )
145 ),
146
147 TP_fast_assign(
148 __entry->hva = hva;
149 ),
150
151 TP_printk("unmap hva 0x%lx\n", __entry->hva)
152);
153
34TRACE_EVENT(kvm_stlb_inval, 154TRACE_EVENT(kvm_stlb_inval,
35 TP_PROTO(unsigned int stlb_index), 155 TP_PROTO(unsigned int stlb_index),
36 TP_ARGS(stlb_index), 156 TP_ARGS(stlb_index),
@@ -98,41 +218,31 @@ TRACE_EVENT(kvm_gtlb_write,
98 __entry->word1, __entry->word2) 218 __entry->word1, __entry->word2)
99); 219);
100 220
101 221TRACE_EVENT(kvm_check_requests,
102/************************************************************************* 222 TP_PROTO(struct kvm_vcpu *vcpu),
103 * Book3S trace points * 223 TP_ARGS(vcpu),
104 *************************************************************************/
105
106#ifdef CONFIG_KVM_BOOK3S_PR
107
108TRACE_EVENT(kvm_book3s_exit,
109 TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
110 TP_ARGS(exit_nr, vcpu),
111 224
112 TP_STRUCT__entry( 225 TP_STRUCT__entry(
113 __field( unsigned int, exit_nr ) 226 __field( __u32, cpu_nr )
114 __field( unsigned long, pc ) 227 __field( __u32, requests )
115 __field( unsigned long, msr )
116 __field( unsigned long, dar )
117 __field( unsigned long, srr1 )
118 ), 228 ),
119 229
120 TP_fast_assign( 230 TP_fast_assign(
121 struct kvmppc_book3s_shadow_vcpu *svcpu; 231 __entry->cpu_nr = vcpu->vcpu_id;
122 __entry->exit_nr = exit_nr; 232 __entry->requests = vcpu->requests;
123 __entry->pc = kvmppc_get_pc(vcpu);
124 __entry->dar = kvmppc_get_fault_dar(vcpu);
125 __entry->msr = vcpu->arch.shared->msr;
126 svcpu = svcpu_get(vcpu);
127 __entry->srr1 = svcpu->shadow_srr1;
128 svcpu_put(svcpu);
129 ), 233 ),
130 234
131 TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", 235 TP_printk("vcpu=%x requests=%x",
132 __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar, 236 __entry->cpu_nr, __entry->requests)
133 __entry->srr1)
134); 237);
135 238
239
240/*************************************************************************
241 * Book3S trace points *
242 *************************************************************************/
243
244#ifdef CONFIG_KVM_BOOK3S_PR
245
136TRACE_EVENT(kvm_book3s_reenter, 246TRACE_EVENT(kvm_book3s_reenter,
137 TP_PROTO(int r, struct kvm_vcpu *vcpu), 247 TP_PROTO(int r, struct kvm_vcpu *vcpu),
138 TP_ARGS(r, vcpu), 248 TP_ARGS(r, vcpu),
@@ -395,6 +505,44 @@ TRACE_EVENT(kvm_booke206_gtlb_write,
395 __entry->mas2, __entry->mas7_3) 505 __entry->mas2, __entry->mas7_3)
396); 506);
397 507
508TRACE_EVENT(kvm_booke206_ref_release,
509 TP_PROTO(__u64 pfn, __u32 flags),
510 TP_ARGS(pfn, flags),
511
512 TP_STRUCT__entry(
513 __field( __u64, pfn )
514 __field( __u32, flags )
515 ),
516
517 TP_fast_assign(
518 __entry->pfn = pfn;
519 __entry->flags = flags;
520 ),
521
522 TP_printk("pfn=%llx flags=%x",
523 __entry->pfn, __entry->flags)
524);
525
526TRACE_EVENT(kvm_booke_queue_irqprio,
527 TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
528 TP_ARGS(vcpu, priority),
529
530 TP_STRUCT__entry(
531 __field( __u32, cpu_nr )
532 __field( __u32, priority )
533 __field( unsigned long, pending )
534 ),
535
536 TP_fast_assign(
537 __entry->cpu_nr = vcpu->vcpu_id;
538 __entry->priority = priority;
539 __entry->pending = vcpu->arch.pending_exceptions;
540 ),
541
542 TP_printk("vcpu=%x prio=%x pending=%lx",
543 __entry->cpu_nr, __entry->priority, __entry->pending)
544);
545
398#endif 546#endif
399 547
400#endif /* _TRACE_KVM_H */ 548#endif /* _TRACE_KVM_H */
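The kvm_exit event added above relies on __print_symbolic() to turn the numeric exit reason into a name using the kvm_trace_symbol_exit table (the macro falls back to printing the raw value when no entry matches). A rough, self-contained userspace equivalent of that lookup, with the table abbreviated:

#include <stdio.h>

struct sym { unsigned int val; const char *name; };

/* Abbreviated copy of the Book3S table above. */
static const struct sym exit_syms[] = {
        {0x100, "SYSTEM_RESET"}, {0x300, "DATA_STORAGE"},
        {0x700, "PROGRAM"},      {0xc00, "SYSCALL"},
};

static const char *exit_name(unsigned int exit_nr)
{
        for (unsigned int i = 0; i < sizeof(exit_syms) / sizeof(exit_syms[0]); i++)
                if (exit_syms[i].val == exit_nr)
                        return exit_syms[i].name;
        return "UNKNOWN";       /* the real macro prints the raw value here */
}

int main(void)
{
        printf("exit=%s\n", exit_name(0xc00));   /* prints exit=SYSCALL */
        return 0;
}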
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index e7a896acd982..48a920d51489 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -90,6 +90,7 @@ config MPIC
90config PPC_EPAPR_HV_PIC 90config PPC_EPAPR_HV_PIC
91 bool 91 bool
92 default n 92 default n
93 select EPAPR_PARAVIRT
93 94
94config MPIC_WEIRD 95config MPIC_WEIRD
95 bool 96 bool
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 51ffafae561e..63c5f04ea580 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -236,7 +236,6 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
236 u32 intr_index; 236 u32 intr_index;
237 u32 have_shift = 0; 237 u32 have_shift = 0;
238 struct fsl_msi_cascade_data *cascade_data; 238 struct fsl_msi_cascade_data *cascade_data;
239 unsigned int ret;
240 239
241 cascade_data = irq_get_handler_data(irq); 240 cascade_data = irq_get_handler_data(irq);
242 msi_data = cascade_data->msi_data; 241 msi_data = cascade_data->msi_data;
@@ -268,7 +267,9 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
268 case FSL_PIC_IP_IPIC: 267 case FSL_PIC_IP_IPIC:
269 msir_value = fsl_msi_read(msi_data->msi_regs, msir_index * 0x4); 268 msir_value = fsl_msi_read(msi_data->msi_regs, msir_index * 0x4);
270 break; 269 break;
271 case FSL_PIC_IP_VMPIC: 270#ifdef CONFIG_EPAPR_PARAVIRT
271 case FSL_PIC_IP_VMPIC: {
272 unsigned int ret;
272 ret = fh_vmpic_get_msir(virq_to_hw(irq), &msir_value); 273 ret = fh_vmpic_get_msir(virq_to_hw(irq), &msir_value);
273 if (ret) { 274 if (ret) {
274 pr_err("fsl-msi: fh_vmpic_get_msir() failed for " 275 pr_err("fsl-msi: fh_vmpic_get_msir() failed for "
@@ -277,6 +278,8 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
277 } 278 }
278 break; 279 break;
279 } 280 }
281#endif
282 }
280 283
281 while (msir_value) { 284 while (msir_value) {
282 intr_index = ffs(msir_value) - 1; 285 intr_index = ffs(msir_value) - 1;
@@ -508,10 +511,12 @@ static const struct of_device_id fsl_of_msi_ids[] = {
508 .compatible = "fsl,ipic-msi", 511 .compatible = "fsl,ipic-msi",
509 .data = &ipic_msi_feature, 512 .data = &ipic_msi_feature,
510 }, 513 },
514#ifdef CONFIG_EPAPR_PARAVIRT
511 { 515 {
512 .compatible = "fsl,vmpic-msi", 516 .compatible = "fsl,vmpic-msi",
513 .data = &vmpic_msi_feature, 517 .data = &vmpic_msi_feature,
514 }, 518 },
519#endif
515 {} 520 {}
516}; 521};
517 522
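The cascade handler touched above dispatches MSIs by repeatedly taking the lowest set bit of msir_value with ffs(). A standalone sketch of that bit-walk (the real handler also tracks a shift offset, omitted here):

#include <stdio.h>
#include <strings.h>            /* ffs() */

int main(void)
{
        unsigned int msir_value = 0x29;          /* example pending mask */

        /* Walk every set bit, lowest first, as the cascade loop does. */
        while (msir_value) {
                int intr_index = ffs(msir_value) - 1;
                printf("dispatch MSI source %d\n", intr_index);
                msir_value &= ~(1u << intr_index);   /* retire this source */
        }
        return 0;
}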
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index c449dbd1c938..97118dc3d285 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -253,6 +253,7 @@ struct platform_diu_data_ops diu_ops;
253EXPORT_SYMBOL(diu_ops); 253EXPORT_SYMBOL(diu_ops);
254#endif 254#endif
255 255
256#ifdef CONFIG_EPAPR_PARAVIRT
256/* 257/*
257 * Restart the current partition 258 * Restart the current partition
258 * 259 *
@@ -278,3 +279,4 @@ void fsl_hv_halt(void)
278 pr_info("hv exit\n"); 279 pr_info("hv exit\n");
279 fh_partition_stop(-1); 280 fh_partition_stop(-1);
280} 281}
282#endif
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index ff1e2f8ef94a..c30615e605ac 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -629,10 +629,27 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
629 break; 629 break;
630 case KVM_S390_SIGP_STOP: 630 case KVM_S390_SIGP_STOP:
631 case KVM_S390_RESTART: 631 case KVM_S390_RESTART:
632 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
633 inti->type = s390int->type;
634 break;
632 case KVM_S390_INT_EXTERNAL_CALL: 635 case KVM_S390_INT_EXTERNAL_CALL:
636 if (s390int->parm & 0xffff0000) {
637 kfree(inti);
638 return -EINVAL;
639 }
640 VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u",
641 s390int->parm);
642 inti->type = s390int->type;
643 inti->extcall.code = s390int->parm;
644 break;
633 case KVM_S390_INT_EMERGENCY: 645 case KVM_S390_INT_EMERGENCY:
634 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); 646 if (s390int->parm & 0xffff0000) {
647 kfree(inti);
648 return -EINVAL;
649 }
650 VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", s390int->parm);
635 inti->type = s390int->type; 651 inti->type = s390int->type;
652 inti->emerg.code = s390int->parm;
636 break; 653 break;
637 case KVM_S390_INT_VIRTIO: 654 case KVM_S390_INT_VIRTIO:
638 case KVM_S390_INT_SERVICE: 655 case KVM_S390_INT_SERVICE:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d91a95568002..c9011bfaabbe 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -355,6 +355,11 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
355 atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 355 atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
356} 356}
357 357
358int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
359{
360 return 0;
361}
362
358int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 363int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
359{ 364{
360 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | 365 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
@@ -993,7 +998,7 @@ static int __init kvm_s390_init(void)
993 } 998 }
994 memcpy(facilities, S390_lowcore.stfle_fac_list, 16); 999 memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
995 facilities[0] &= 0xff00fff3f47c0000ULL; 1000 facilities[0] &= 0xff00fff3f47c0000ULL;
996 facilities[1] &= 0x201c000000000000ULL; 1001 facilities[1] &= 0x001c000000000000ULL;
997 return 0; 1002 return 0;
998} 1003}
999 1004
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3b9ce7..16a57f4ed64d 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -8,6 +8,7 @@
8#define VCLOCK_NONE 0 /* No vDSO clock available. */ 8#define VCLOCK_NONE 0 /* No vDSO clock available. */
9#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ 9#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
10#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ 10#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
11#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
11 12
12struct arch_clocksource_data { 13struct arch_clocksource_data {
13 int vclock_mode; 14 int vclock_mode;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index da40b1e2228e..2d9075e863a0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -202,6 +202,7 @@
202 202
203/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 203/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
204#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 204#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
205#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */
205#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ 206#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
206#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ 207#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */
207#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ 208#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c4c974..a09c28571064 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,6 +19,7 @@
19#include <asm/acpi.h> 19#include <asm/acpi.h>
20#include <asm/apicdef.h> 20#include <asm/apicdef.h>
21#include <asm/page.h> 21#include <asm/page.h>
22#include <asm/pvclock.h>
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23#include <linux/threads.h> 24#include <linux/threads.h>
24#include <asm/kmap_types.h> 25#include <asm/kmap_types.h>
@@ -81,6 +82,10 @@ enum fixed_addresses {
81 VVAR_PAGE, 82 VVAR_PAGE,
82 VSYSCALL_HPET, 83 VSYSCALL_HPET,
83#endif 84#endif
85#ifdef CONFIG_PARAVIRT_CLOCK
86 PVCLOCK_FIXMAP_BEGIN,
87 PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
88#endif
84 FIX_DBGP_BASE, 89 FIX_DBGP_BASE,
85 FIX_EARLYCON_MEM_BASE, 90 FIX_EARLYCON_MEM_BASE,
86#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT 91#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff1703d0b..6080d2694bad 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,9 @@ struct kimage_arch {
163}; 163};
164#endif 164#endif
165 165
166typedef void crash_vmclear_fn(void);
167extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
168
166#endif /* __ASSEMBLY__ */ 169#endif /* __ASSEMBLY__ */
167 170
168#endif /* _ASM_X86_KEXEC_H */ 171#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
new file mode 100644
index 000000000000..a92b1763c419
--- /dev/null
+++ b/arch/x86/include/asm/kvm_guest.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_X86_KVM_GUEST_H
2#define _ASM_X86_KVM_GUEST_H
3
4int kvm_setup_vsyscall_timeinfo(void);
5
6#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2e11f452435..dc87b65e9c3a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
22#include <linux/kvm_para.h> 22#include <linux/kvm_para.h>
23#include <linux/kvm_types.h> 23#include <linux/kvm_types.h>
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/pvclock_gtod.h>
26#include <linux/clocksource.h>
25 27
26#include <asm/pvclock-abi.h> 28#include <asm/pvclock-abi.h>
27#include <asm/desc.h> 29#include <asm/desc.h>
@@ -442,6 +444,7 @@ struct kvm_vcpu_arch {
442 s8 virtual_tsc_shift; 444 s8 virtual_tsc_shift;
443 u32 virtual_tsc_mult; 445 u32 virtual_tsc_mult;
444 u32 virtual_tsc_khz; 446 u32 virtual_tsc_khz;
447 s64 ia32_tsc_adjust_msr;
445 448
446 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ 449 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
447 unsigned nmi_pending; /* NMI queued after currently running handler */ 450 unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -559,6 +562,12 @@ struct kvm_arch {
559 u64 cur_tsc_write; 562 u64 cur_tsc_write;
560 u64 cur_tsc_offset; 563 u64 cur_tsc_offset;
561 u8 cur_tsc_generation; 564 u8 cur_tsc_generation;
565 int nr_vcpus_matched_tsc;
566
567 spinlock_t pvclock_gtod_sync_lock;
568 bool use_master_clock;
569 u64 master_kernel_ns;
570 cycle_t master_cycle_now;
562 571
563 struct kvm_xen_hvm_config xen_hvm_config; 572 struct kvm_xen_hvm_config xen_hvm_config;
564 573
@@ -612,6 +621,12 @@ struct kvm_vcpu_stat {
612 621
613struct x86_instruction_info; 622struct x86_instruction_info;
614 623
624struct msr_data {
625 bool host_initiated;
626 u32 index;
627 u64 data;
628};
629
615struct kvm_x86_ops { 630struct kvm_x86_ops {
616 int (*cpu_has_kvm_support)(void); /* __init */ 631 int (*cpu_has_kvm_support)(void); /* __init */
617 int (*disabled_by_bios)(void); /* __init */ 632 int (*disabled_by_bios)(void); /* __init */
@@ -634,7 +649,7 @@ struct kvm_x86_ops {
634 649
635 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); 650 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
636 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 651 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
637 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 652 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
638 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 653 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
639 void (*get_segment)(struct kvm_vcpu *vcpu, 654 void (*get_segment)(struct kvm_vcpu *vcpu,
640 struct kvm_segment *var, int seg); 655 struct kvm_segment *var, int seg);
@@ -697,10 +712,11 @@ struct kvm_x86_ops {
697 bool (*has_wbinvd_exit)(void); 712 bool (*has_wbinvd_exit)(void);
698 713
699 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); 714 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
715 u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
700 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 716 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
701 717
702 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); 718 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
703 u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); 719 u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
704 720
705 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 721 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
706 722
@@ -785,7 +801,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
785 801
786void kvm_enable_efer_bits(u64); 802void kvm_enable_efer_bits(u64);
787int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 803int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
788int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 804int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
789 805
790struct x86_emulate_ctxt; 806struct x86_emulate_ctxt;
791 807
@@ -812,7 +828,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
812int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); 828int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
813 829
814int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 830int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
815int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 831int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
816 832
817unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); 833unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
818void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); 834void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
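The new struct msr_data changes the set_msr calling convention above: instead of passing index and value separately, callers bundle them together with a flag saying whether the write originated from the host ioctl path or from a guest WRMSR. A hypothetical helper showing the expected usage (a sketch, not code from this tree, and not a standalone program):

/* Illustrative only: wrap an ioctl-initiated MSR write in msr_data. */
static int set_msr_from_userspace(struct kvm_vcpu *vcpu, u32 index, u64 value)
{
        struct msr_data msr = {
                .host_initiated = true,   /* ioctl path, not a guest WRMSR */
                .index          = index,
                .data           = value,
        };

        return kvm_set_msr(vcpu, &msr);
}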
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e400cdb2dd65..6e930b218724 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,7 @@
236#define MSR_IA32_EBL_CR_POWERON 0x0000002a 236#define MSR_IA32_EBL_CR_POWERON 0x0000002a
237#define MSR_EBC_FREQUENCY_ID 0x0000002c 237#define MSR_EBC_FREQUENCY_ID 0x0000002c
238#define MSR_IA32_FEATURE_CONTROL 0x0000003a 238#define MSR_IA32_FEATURE_CONTROL 0x0000003a
239#define MSR_IA32_TSC_ADJUST 0x0000003b
239 240
240#define FEATURE_CONTROL_LOCKED (1<<0) 241#define FEATURE_CONTROL_LOCKED (1<<0)
241#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) 242#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index c59cc97fe6c1..109a9dd5d454 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
6 6
7/* some helper functions for xen and kvm pv clock sources */ 7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); 8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
9void pvclock_set_flags(u8 flags); 10void pvclock_set_flags(u8 flags);
10unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); 11unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
11void pvclock_read_wallclock(struct pvclock_wall_clock *wall, 12void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
@@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
56 return product; 57 return product;
57} 58}
58 59
60static __always_inline
61u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
62{
63 u64 delta = __native_read_tsc() - src->tsc_timestamp;
64 return pvclock_scale_delta(delta, src->tsc_to_system_mul,
65 src->tsc_shift);
66}
67
68static __always_inline
69unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
70 cycle_t *cycles, u8 *flags)
71{
72 unsigned version;
73 cycle_t ret, offset;
74 u8 ret_flags;
75
76 version = src->version;
77 /* Note: emulated platforms which do not advertise SSE2 support
78 * result in kvmclock not using the necessary RDTSC barriers.
79 * Without barriers, it is possible that RDTSC instruction reads from
80 * the time stamp counter outside rdtsc_barrier protected section
81 * below, resulting in violation of monotonicity.
82 */
83 rdtsc_barrier();
84 offset = pvclock_get_nsec_offset(src);
85 ret = src->system_time + offset;
86 ret_flags = src->flags;
87 rdtsc_barrier();
88
89 *cycles = ret;
90 *flags = ret_flags;
91 return version;
92}
93
94struct pvclock_vsyscall_time_info {
95 struct pvclock_vcpu_time_info pvti;
96 u32 migrate_count;
97} __attribute__((__aligned__(SMP_CACHE_BYTES)));
98
99#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
100#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
101
102int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
103 int size);
104struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
105
59#endif /* _ASM_X86_PVCLOCK_H */ 106#endif /* _ASM_X86_PVCLOCK_H */
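The PVCLOCK_VSYSCALL_NR_PAGES arithmetic above sizes the fixmap area: with the per-vcpu record padded to the cache-line size, it computes how many whole pages NR_CPUS entries occupy. A small sanity-check sketch, assuming 4 KiB pages and a 64-byte padded record (both values are assumptions, not taken from this tree):

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096;   /* assumed PAGE_SIZE */
        unsigned long pvti_size = 64;     /* assumed padded record size */
        unsigned long per_page  = page_size / pvti_size;

        for (unsigned long nr_cpus = 1; nr_cpus <= 256; nr_cpus *= 4) {
                unsigned long pages = ((nr_cpus - 1) / per_page) + 1;
                printf("NR_CPUS=%3lu -> %lu page(s)\n", nr_cpus, pages);
        }
        return 0;   /* 1..64 CPUs fit in one page; 256 CPUs need four */
}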
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 36ec21c36d68..c2d56b34830d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -445,8 +445,7 @@ enum vmcs_field {
445#define VMX_EPTP_WB_BIT (1ull << 14) 445#define VMX_EPTP_WB_BIT (1ull << 14)
446#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 446#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
447#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 447#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
448#define VMX_EPT_AD_BIT (1ull << 21) 448#define VMX_EPT_AD_BIT (1ull << 21)
449#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
450#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 449#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
451#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 450#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
452 451
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index eaea1d31f753..80f80955cfd8 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -33,6 +33,26 @@ extern void map_vsyscall(void);
33 */ 33 */
34extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); 34extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
35 35
36#ifdef CONFIG_X86_64
37
38#define VGETCPU_CPU_MASK 0xfff
39
40static inline unsigned int __getcpu(void)
41{
42 unsigned int p;
43
44 if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
45 /* Load per CPU data from RDTSCP */
46 native_read_tscp(&p);
47 } else {
48 /* Load per CPU data from GDT */
49 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
50 }
51
52 return p;
53}
54#endif /* CONFIG_X86_64 */
55
36#endif /* __KERNEL__ */ 56#endif /* __KERNEL__ */
37 57
38#endif /* _ASM_X86_VSYSCALL_H */ 58#endif /* _ASM_X86_VSYSCALL_H */
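The new __getcpu() helper packs the CPU number into the low 12 bits of the value read via RDTSCP (TSC_AUX) or the per-CPU GDT limit (LSL). A tiny sketch of how a caller unpacks it; treating the upper bits as the NUMA node follows the usual vdso convention and is an assumption here, not something stated in this hunk:

#include <stdio.h>

#define VGETCPU_CPU_MASK 0xfff

int main(void)
{
        unsigned int p = (2u << 12) | 37;          /* pretend __getcpu() result */
        unsigned int cpu  = p & VGETCPU_CPU_MASK;  /* low 12 bits: CPU number   */
        unsigned int node = p >> 12;               /* upper bits: NUMA node     */

        printf("cpu=%u node=%u\n", cpu, node);     /* cpu=37 node=2 */
        return 0;
}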
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d47..74467feb4dc5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/elf.h> 17#include <linux/elf.h>
18#include <linux/elfcore.h> 18#include <linux/elfcore.h>
19#include <linux/module.h>
19 20
20#include <asm/processor.h> 21#include <asm/processor.h>
21#include <asm/hardirq.h> 22#include <asm/hardirq.h>
@@ -30,6 +31,27 @@
30 31
31int in_crash_kexec; 32int in_crash_kexec;
32 33
34/*
35 * This is used to VMCLEAR all VMCSs loaded on the
36 * processor. And when loading kvm_intel module, the
37 * callback function pointer will be assigned.
38 *
39 * protected by rcu.
40 */
41crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
42EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
43
44static inline void cpu_crash_vmclear_loaded_vmcss(void)
45{
46 crash_vmclear_fn *do_vmclear_operation = NULL;
47
48 rcu_read_lock();
49 do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
50 if (do_vmclear_operation)
51 do_vmclear_operation();
52 rcu_read_unlock();
53}
54
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 55#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 56
35static void kdump_nmi_callback(int cpu, struct pt_regs *regs) 57static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
46#endif 68#endif
47 crash_save_cpu(regs, cpu); 69 crash_save_cpu(regs, cpu);
48 70
71 /*
72 * VMCLEAR VMCSs loaded on all cpus if needed.
73 */
74 cpu_crash_vmclear_loaded_vmcss();
75
49 /* Disable VMX or SVM if needed. 76 /* Disable VMX or SVM if needed.
50 * 77 *
51 * We need to disable virtualization on all CPUs. 78 * We need to disable virtualization on all CPUs.
@@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
88 115
89 kdump_nmi_shootdown_cpus(); 116 kdump_nmi_shootdown_cpus();
90 117
118 /*
119 * VMCLEAR VMCSs loaded on this cpu if needed.
120 */
121 cpu_crash_vmclear_loaded_vmcss();
122
91 /* Booting kdump kernel with VMX or SVM enabled won't work, 123 /* Booting kdump kernel with VMX or SVM enabled won't work,
92 * because (among other limitations) we can't disable paging 124 * because (among other limitations) we can't disable paging
93 * with the virt flags. 125 * with the virt flags.
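crash_vmclear_loaded_vmcss is an __rcu function pointer: the crash path dereferences it under rcu_read_lock() as shown above, while kvm_intel is expected to publish and later retract its handler on the write side. A kernel-style sketch of that registration pairing; this is illustrative and not the actual vmx.c code, and my_vmclear_all_loaded_vmcss is a hypothetical handler name.

/* Illustrative module-side pairing for the read path above. */
static void my_vmclear_all_loaded_vmcss(void);      /* hypothetical handler */

static void register_crash_vmclear(void)
{
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           my_vmclear_all_loaded_vmcss);
}

static void unregister_crash_vmclear(void)
{
        rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
        synchronize_rcu();      /* wait for in-flight readers to finish */
}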
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4180a874c764..08b973f64032 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,6 +42,7 @@
42#include <asm/apic.h> 42#include <asm/apic.h>
43#include <asm/apicdef.h> 43#include <asm/apicdef.h>
44#include <asm/hypervisor.h> 44#include <asm/hypervisor.h>
45#include <asm/kvm_guest.h>
45 46
46static int kvmapf = 1; 47static int kvmapf = 1;
47 48
@@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg)
62 63
63early_param("no-steal-acc", parse_no_stealacc); 64early_param("no-steal-acc", parse_no_stealacc);
64 65
66static int kvmclock_vsyscall = 1;
67static int parse_no_kvmclock_vsyscall(char *arg)
68{
69 kvmclock_vsyscall = 0;
70 return 0;
71}
72
73early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
74
65static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 75static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
66static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); 76static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
67static int has_steal_clock = 0; 77static int has_steal_clock = 0;
@@ -110,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token)
110 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 120 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
111 struct kvm_task_sleep_node n, *e; 121 struct kvm_task_sleep_node n, *e;
112 DEFINE_WAIT(wait); 122 DEFINE_WAIT(wait);
113 int cpu, idle;
114
115 cpu = get_cpu();
116 idle = idle_cpu(cpu);
117 put_cpu();
118 123
119 spin_lock(&b->lock); 124 spin_lock(&b->lock);
120 e = _find_apf_task(b, token); 125 e = _find_apf_task(b, token);
@@ -128,7 +133,7 @@ void kvm_async_pf_task_wait(u32 token)
128 133
129 n.token = token; 134 n.token = token;
130 n.cpu = smp_processor_id(); 135 n.cpu = smp_processor_id();
131 n.halted = idle || preempt_count() > 1; 136 n.halted = is_idle_task(current) || preempt_count() > 1;
132 init_waitqueue_head(&n.wq); 137 init_waitqueue_head(&n.wq);
133 hlist_add_head(&n.link, &b->list); 138 hlist_add_head(&n.link, &b->list);
134 spin_unlock(&b->lock); 139 spin_unlock(&b->lock);
@@ -471,6 +476,9 @@ void __init kvm_guest_init(void)
471 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 476 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
472 apic_set_eoi_write(kvm_guest_apic_eoi_write); 477 apic_set_eoi_write(kvm_guest_apic_eoi_write);
473 478
479 if (kvmclock_vsyscall)
480 kvm_setup_vsyscall_timeinfo();
481
474#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
475 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 483 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
476 register_cpu_notifier(&kvm_cpu_notifier); 484 register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f1b42b3a186c..220a360010f8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/hardirq.h> 25#include <linux/hardirq.h>
26#include <linux/memblock.h>
26 27
27#include <asm/x86_init.h> 28#include <asm/x86_init.h>
28#include <asm/reboot.h> 29#include <asm/reboot.h>
@@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
39early_param("no-kvmclock", parse_no_kvmclock); 40early_param("no-kvmclock", parse_no_kvmclock);
40 41
41/* The hypervisor will put information about time periodically here */ 42/* The hypervisor will put information about time periodically here */
42static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); 43static struct pvclock_vsyscall_time_info *hv_clock;
43static struct pvclock_wall_clock wall_clock; 44static struct pvclock_wall_clock wall_clock;
44 45
45/* 46/*
@@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void)
52 struct pvclock_vcpu_time_info *vcpu_time; 53 struct pvclock_vcpu_time_info *vcpu_time;
53 struct timespec ts; 54 struct timespec ts;
54 int low, high; 55 int low, high;
56 int cpu;
55 57
56 low = (int)__pa_symbol(&wall_clock); 58 low = (int)__pa_symbol(&wall_clock);
57 high = ((u64)__pa_symbol(&wall_clock) >> 32); 59 high = ((u64)__pa_symbol(&wall_clock) >> 32);
58 60
59 native_write_msr(msr_kvm_wall_clock, low, high); 61 native_write_msr(msr_kvm_wall_clock, low, high);
60 62
61 vcpu_time = &get_cpu_var(hv_clock); 63 preempt_disable();
64 cpu = smp_processor_id();
65
66 vcpu_time = &hv_clock[cpu].pvti;
62 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 67 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
63 put_cpu_var(hv_clock); 68
69 preempt_enable();
64 70
65 return ts.tv_sec; 71 return ts.tv_sec;
66} 72}
@@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void)
74{ 80{
75 struct pvclock_vcpu_time_info *src; 81 struct pvclock_vcpu_time_info *src;
76 cycle_t ret; 82 cycle_t ret;
83 int cpu;
77 84
78 preempt_disable_notrace(); 85 preempt_disable_notrace();
79 src = &__get_cpu_var(hv_clock); 86 cpu = smp_processor_id();
87 src = &hv_clock[cpu].pvti;
80 ret = pvclock_clocksource_read(src); 88 ret = pvclock_clocksource_read(src);
81 preempt_enable_notrace(); 89 preempt_enable_notrace();
82 return ret; 90 return ret;
@@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
99static unsigned long kvm_get_tsc_khz(void) 107static unsigned long kvm_get_tsc_khz(void)
100{ 108{
101 struct pvclock_vcpu_time_info *src; 109 struct pvclock_vcpu_time_info *src;
102 src = &per_cpu(hv_clock, 0); 110 int cpu;
103 return pvclock_tsc_khz(src); 111 unsigned long tsc_khz;
112
113 preempt_disable();
114 cpu = smp_processor_id();
115 src = &hv_clock[cpu].pvti;
116 tsc_khz = pvclock_tsc_khz(src);
117 preempt_enable();
118 return tsc_khz;
104} 119}
105 120
106static void kvm_get_preset_lpj(void) 121static void kvm_get_preset_lpj(void)
@@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void)
119{ 134{
120 bool ret = false; 135 bool ret = false;
121 struct pvclock_vcpu_time_info *src; 136 struct pvclock_vcpu_time_info *src;
137 int cpu = smp_processor_id();
122 138
123 src = &__get_cpu_var(hv_clock); 139 if (!hv_clock)
140 return ret;
141
142 src = &hv_clock[cpu].pvti;
124 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { 143 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
125 __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); 144 src->flags &= ~PVCLOCK_GUEST_STOPPED;
126 ret = true; 145 ret = true;
127 } 146 }
128 147
@@ -141,9 +160,10 @@ int kvm_register_clock(char *txt)
141{ 160{
142 int cpu = smp_processor_id(); 161 int cpu = smp_processor_id();
143 int low, high, ret; 162 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
144 164
145 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 165 low = (int)__pa(src) | 1;
146 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 166 high = ((u64)__pa(src) >> 32);
147 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 167 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
148 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
149 cpu, high, low, txt); 169 cpu, high, low, txt);
@@ -197,6 +217,8 @@ static void kvm_shutdown(void)
197 217
198void __init kvmclock_init(void) 218void __init kvmclock_init(void)
199{ 219{
220 unsigned long mem;
221
200 if (!kvm_para_available()) 222 if (!kvm_para_available())
201 return; 223 return;
202 224
@@ -209,8 +231,18 @@ void __init kvmclock_init(void)
209 printk(KERN_INFO "kvm-clock: Using msrs %x and %x", 231 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
210 msr_kvm_system_time, msr_kvm_wall_clock); 232 msr_kvm_system_time, msr_kvm_wall_clock);
211 233
212 if (kvm_register_clock("boot clock")) 234 mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
235 PAGE_SIZE);
236 if (!mem)
237 return;
238 hv_clock = __va(mem);
239
240 if (kvm_register_clock("boot clock")) {
241 hv_clock = NULL;
242 memblock_free(mem,
243 sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
213 return; 244 return;
245 }
214 pv_time_ops.sched_clock = kvm_clock_read; 246 pv_time_ops.sched_clock = kvm_clock_read;
215 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 247 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
216 x86_platform.get_wallclock = kvm_get_wallclock; 248 x86_platform.get_wallclock = kvm_get_wallclock;
@@ -233,3 +265,37 @@ void __init kvmclock_init(void)
233 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) 265 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
234 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); 266 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
235} 267}
268
269int __init kvm_setup_vsyscall_timeinfo(void)
270{
271#ifdef CONFIG_X86_64
272 int cpu;
273 int ret;
274 u8 flags;
275 struct pvclock_vcpu_time_info *vcpu_time;
276 unsigned int size;
277
278 size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
279
280 preempt_disable();
281 cpu = smp_processor_id();
282
283 vcpu_time = &hv_clock[cpu].pvti;
284 flags = pvclock_read_flags(vcpu_time);
285
286 if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
287 preempt_enable();
288 return 1;
289 }
290
291 if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
292 preempt_enable();
293 return ret;
294 }
295
296 preempt_enable();
297
298 kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
299#endif
300 return 0;
301}
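kvm_register_clock() above hands the per-cpu pvti's physical address to the hypervisor through a 64-bit MSR value split into two 32-bit halves, with bit 0 of the low half acting as the enable flag. A self-contained sketch of that encoding (the address below is made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical physical address of one hv_clock[cpu].pvti entry. */
        uint64_t pa = 0x12345f000ull;

        unsigned int low  = (unsigned int)pa | 1;    /* bit 0 enables the clock */
        unsigned int high = (unsigned int)(pa >> 32);

        printf("wrmsr low=0x%08x high=0x%08x\n", low, high);
        return 0;
}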
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc6..85c39590c1a4 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
17 17
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/notifier.h>
21#include <linux/sched.h>
22#include <linux/gfp.h>
23#include <linux/bootmem.h>
24#include <asm/fixmap.h>
20#include <asm/pvclock.h> 25#include <asm/pvclock.h>
21 26
22/*
23 * These are perodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34 u8 flags;
35};
36
37static u8 valid_flags __read_mostly = 0; 27static u8 valid_flags __read_mostly = 0;
38 28
39void pvclock_set_flags(u8 flags) 29void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
41 valid_flags = flags; 31 valid_flags = flags;
42} 32}
43 33
44static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
45{
46 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
47 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
48 shadow->tsc_shift);
49}
50
51/*
52 * Reads a consistent set of time-base values from hypervisor,
53 * into a shadow data area.
54 */
55static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
56 struct pvclock_vcpu_time_info *src)
57{
58 do {
59 dst->version = src->version;
60 rmb(); /* fetch version before data */
61 dst->tsc_timestamp = src->tsc_timestamp;
62 dst->system_timestamp = src->system_time;
63 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
64 dst->tsc_shift = src->tsc_shift;
65 dst->flags = src->flags;
66 rmb(); /* test version after fetching data */
67 } while ((src->version & 1) || (dst->version != src->version));
68
69 return dst->version;
70}
71
72unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) 34unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
73{ 35{
74 u64 pv_tsc_khz = 1000000ULL << 32; 36 u64 pv_tsc_khz = 1000000ULL << 32;
@@ -88,23 +50,32 @@ void pvclock_resume(void)
88 atomic64_set(&last_value, 0); 50 atomic64_set(&last_value, 0);
89} 51}
90 52
53u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
54{
55 unsigned version;
56 cycle_t ret;
57 u8 flags;
58
59 do {
60 version = __pvclock_read_cycles(src, &ret, &flags);
61 } while ((src->version & 1) || version != src->version);
62
63 return flags & valid_flags;
64}
65
91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 66cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
92{ 67{
93 struct pvclock_shadow_time shadow;
94 unsigned version; 68 unsigned version;
95 cycle_t ret, offset; 69 cycle_t ret;
96 u64 last; 70 u64 last;
71 u8 flags;
97 72
98 do { 73 do {
99 version = pvclock_get_time_values(&shadow, src); 74 version = __pvclock_read_cycles(src, &ret, &flags);
100 barrier(); 75 } while ((src->version & 1) || version != src->version);
101 offset = pvclock_get_nsec_offset(&shadow);
102 ret = shadow.system_timestamp + offset;
103 barrier();
104 } while (version != src->version);
105 76
106 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && 77 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
107 (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) 78 (flags & PVCLOCK_TSC_STABLE_BIT))
108 return ret; 79 return ret;
109 80
110 /* 81 /*
@@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
156 127
157 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
158} 129}
130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/*
172 * Initialize the generic pvclock vsyscall state. This will allocate
173 * a/some page(s) for the per-vcpu pvclock information, set up a
174 * fixmap mapping for the page(s)
175 */
176
177int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
178 int size)
179{
180 int idx;
181
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa_symbol(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR);
190 }
191
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0;
196}
197#endif
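The rewritten pvclock readers above keep the same seqlock-style retry protocol: an odd version means the hypervisor is mid-update, and a version that changed during the read means the snapshot raced an update and must be retried. A standalone sketch of just that loop, with the TSC reads, flags, and the rdtsc barriers of the real code elided for brevity:

#include <stdint.h>
#include <stdio.h>

struct timeinfo { volatile uint32_t version; uint64_t ns; };

static uint64_t read_time(struct timeinfo *ti)
{
        uint32_t version;
        uint64_t ns;

        do {
                version = ti->version;
                ns = ti->ns;            /* real code also reads tsc/flags */
        } while ((ti->version & 1) || version != ti->version);

        return ns;
}

int main(void)
{
        struct timeinfo ti = { .version = 2, .ns = 123456789 };
        printf("%llu\n", (unsigned long long)read_time(&ti));
        return 0;
}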
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ec79e773342e..a20ecb5b6cbf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
320 if (index == 0) { 320 if (index == 0) {
321 entry->ebx &= kvm_supported_word9_x86_features; 321 entry->ebx &= kvm_supported_word9_x86_features;
322 cpuid_mask(&entry->ebx, 9); 322 cpuid_mask(&entry->ebx, 9);
323 // TSC_ADJUST is emulated
324 entry->ebx |= F(TSC_ADJUST);
323 } else 325 } else
324 entry->ebx = 0; 326 entry->ebx = 0;
325 entry->eax = 0; 327 entry->eax = 0;
@@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
659 } else 661 } else
660 *eax = *ebx = *ecx = *edx = 0; 662 *eax = *ebx = *ecx = *edx = 0;
661} 663}
664EXPORT_SYMBOL_GPL(kvm_cpuid);
662 665
663void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 666void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
664{ 667{
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 58fc51488828..b7fd07984888 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
31 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 31 return best && (best->ecx & bit(X86_FEATURE_XSAVE));
32} 32}
33 33
34static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
35{
36 struct kvm_cpuid_entry2 *best;
37
38 best = kvm_find_cpuid_entry(vcpu, 7, 0);
39 return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
40}
41
34static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) 42static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
35{ 43{
36 struct kvm_cpuid_entry2 *best; 44 struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bba39bfa1c4b..a27e76371108 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -676,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
676 addr.seg); 676 addr.seg);
677 if (!usable) 677 if (!usable)
678 goto bad; 678 goto bad;
679 /* code segment or read-only data segment */ 679 /* code segment in protected mode or read-only data segment */
680 if (((desc.type & 8) || !(desc.type & 2)) && write) 680 if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
681 || !(desc.type & 2)) && write)
681 goto bad; 682 goto bad;
682 /* unreadable code segment */ 683 /* unreadable code segment */
683 if (!fetch && (desc.type & 8) && !(desc.type & 2)) 684 if (!fetch && (desc.type & 8) && !(desc.type & 2))
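The linearization fix above hinges on the segment descriptor type bits: bit 3 (0x8) marks a code segment and bit 1 (0x2) marks a data segment as writable, and the change stops treating "code segment" as a write fault while the emulator is in real mode. A small standalone sketch of the resulting predicate:

#include <stdbool.h>
#include <stdio.h>

static bool write_faults(unsigned int type, bool real_mode)
{
        /* Fault on writes to code segments (protected mode only after the
         * fix) or to any segment that is not writable. */
        return (!real_mode && (type & 8)) || !(type & 2);
}

int main(void)
{
        printf("%d\n", write_faults(0xb, false)); /* code seg, prot mode -> 1 */
        printf("%d\n", write_faults(0xb, true));  /* code seg, real mode -> 0 */
        printf("%d\n", write_faults(0x1, true));  /* read-only data      -> 1 */
        printf("%d\n", write_faults(0x3, false)); /* writable data       -> 0 */
        return 0;
}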
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43e9fadca5d0..9392f527f107 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
1011 local_irq_save(flags); 1011 local_irq_save(flags);
1012 1012
1013 now = apic->lapic_timer.timer.base->get_time(); 1013 now = apic->lapic_timer.timer.base->get_time();
1014 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 1014 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
1015 if (likely(tscdeadline > guest_tsc)) { 1015 if (likely(tscdeadline > guest_tsc)) {
1016 ns = (tscdeadline - guest_tsc) * 1000000ULL; 1016 ns = (tscdeadline - guest_tsc) * 1000000ULL;
1017 do_div(ns, this_tsc_khz); 1017 do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf958..01d7c2ad05f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2382 || (!vcpu->arch.mmu.direct_map && write_fault 2382 || (!vcpu->arch.mmu.direct_map && write_fault
2383 && !is_write_protection(vcpu) && !user_fault)) { 2383 && !is_write_protection(vcpu) && !user_fault)) {
2384 2384
2385 /*
2386 * There are two cases:
2387 * - the one is other vcpu creates new sp in the window
2388 * between mapping_level() and acquiring mmu-lock.
2389 * - the another case is the new sp is created by itself
2390 * (page-fault path) when guest uses the target gfn as
2391 * its page table.
2392 * Both of these cases can be fixed by allowing guest to
2393 * retry the access, it will refault, then we can establish
2394 * the mapping by using small page.
2395 */
2385 if (level > PT_PAGE_TABLE_LEVEL && 2396 if (level > PT_PAGE_TABLE_LEVEL &&
2386 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2397 has_wrprotected_page(vcpu->kvm, gfn, level))
2387 ret = 1;
2388 drop_spte(vcpu->kvm, sptep);
2389 goto done; 2398 goto done;
2390 }
2391 2399
2392 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 2400 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2393 2401
@@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2505 mmu_free_roots(vcpu); 2513 mmu_free_roots(vcpu);
2506} 2514}
2507 2515
2516static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2517{
2518 int bit7;
2519
2520 bit7 = (gpte >> 7) & 1;
2521 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2522}
2523
2508static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2524static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2509 bool no_dirty_log) 2525 bool no_dirty_log)
2510{ 2526{
@@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2517 return gfn_to_pfn_memslot_atomic(slot, gfn); 2533 return gfn_to_pfn_memslot_atomic(slot, gfn);
2518} 2534}
2519 2535
2536static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
2537 struct kvm_mmu_page *sp, u64 *spte,
2538 u64 gpte)
2539{
2540 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
2541 goto no_present;
2542
2543 if (!is_present_gpte(gpte))
2544 goto no_present;
2545
2546 if (!(gpte & PT_ACCESSED_MASK))
2547 goto no_present;
2548
2549 return false;
2550
2551no_present:
2552 drop_spte(vcpu->kvm, spte);
2553 return true;
2554}
2555
2520static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2556static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2521 struct kvm_mmu_page *sp, 2557 struct kvm_mmu_page *sp,
2522 u64 *start, u64 *end) 2558 u64 *start, u64 *end)
@@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2671 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done 2707 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2672 * here. 2708 * here.
2673 */ 2709 */
2674 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && 2710 if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2675 level == PT_PAGE_TABLE_LEVEL && 2711 level == PT_PAGE_TABLE_LEVEL &&
2676 PageTransCompound(pfn_to_page(pfn)) && 2712 PageTransCompound(pfn_to_page(pfn)) &&
2677 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { 2713 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2699 } 2735 }
2700} 2736}
2701 2737
2702static bool mmu_invalid_pfn(pfn_t pfn)
2703{
2704 return unlikely(is_invalid_pfn(pfn));
2705}
2706
2707static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, 2738static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2708 pfn_t pfn, unsigned access, int *ret_val) 2739 pfn_t pfn, unsigned access, int *ret_val)
2709{ 2740{
2710 bool ret = true; 2741 bool ret = true;
2711 2742
2712 /* The pfn is invalid, report the error! */ 2743 /* The pfn is invalid, report the error! */
2713 if (unlikely(is_invalid_pfn(pfn))) { 2744 if (unlikely(is_error_pfn(pfn))) {
2714 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); 2745 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
2715 goto exit; 2746 goto exit;
2716 } 2747 }
@@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2862 return r; 2893 return r;
2863 2894
2864 spin_lock(&vcpu->kvm->mmu_lock); 2895 spin_lock(&vcpu->kvm->mmu_lock);
2865 if (mmu_notifier_retry(vcpu, mmu_seq)) 2896 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
2866 goto out_unlock; 2897 goto out_unlock;
2867 kvm_mmu_free_some_pages(vcpu); 2898 kvm_mmu_free_some_pages(vcpu);
2868 if (likely(!force_pt_level)) 2899 if (likely(!force_pt_level))
@@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3331 return r; 3362 return r;
3332 3363
3333 spin_lock(&vcpu->kvm->mmu_lock); 3364 spin_lock(&vcpu->kvm->mmu_lock);
3334 if (mmu_notifier_retry(vcpu, mmu_seq)) 3365 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3335 goto out_unlock; 3366 goto out_unlock;
3336 kvm_mmu_free_some_pages(vcpu); 3367 kvm_mmu_free_some_pages(vcpu);
3337 if (likely(!force_pt_level)) 3368 if (likely(!force_pt_level))
@@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
3399 nonpaging_free(vcpu); 3430 nonpaging_free(vcpu);
3400} 3431}
3401 3432
3402static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3403{
3404 int bit7;
3405
3406 bit7 = (gpte >> 7) & 1;
3407 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
3408}
3409
3410static inline void protect_clean_gpte(unsigned *access, unsigned gpte) 3433static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3411{ 3434{
3412 unsigned mask; 3435 unsigned mask;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 714e2c01a6fe..891eb6d93b8b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
305 addr, access); 305 addr, access);
306} 306}
307 307
308static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, 308static bool
309 struct kvm_mmu_page *sp, u64 *spte, 309FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
310 pt_element_t gpte) 310 u64 *spte, pt_element_t gpte, bool no_dirty_log)
311{ 311{
312 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
313 goto no_present;
314
315 if (!is_present_gpte(gpte))
316 goto no_present;
317
318 if (!(gpte & PT_ACCESSED_MASK))
319 goto no_present;
320
321 return false;
322
323no_present:
324 drop_spte(vcpu->kvm, spte);
325 return true;
326}
327
328static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
329 u64 *spte, const void *pte)
330{
331 pt_element_t gpte;
332 unsigned pte_access; 312 unsigned pte_access;
313 gfn_t gfn;
333 pfn_t pfn; 314 pfn_t pfn;
334 315
335 gpte = *(const pt_element_t *)pte; 316 if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
336 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 317 return false;
337 return;
338 318
339 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 319 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
320
321 gfn = gpte_to_gfn(gpte);
340 pte_access = sp->role.access & gpte_access(vcpu, gpte); 322 pte_access = sp->role.access & gpte_access(vcpu, gpte);
341 protect_clean_gpte(&pte_access, gpte); 323 protect_clean_gpte(&pte_access, gpte);
342 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 324 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
343 if (mmu_invalid_pfn(pfn)) 325 no_dirty_log && (pte_access & ACC_WRITE_MASK));
344 return; 326 if (is_error_pfn(pfn))
327 return false;
345 328
346 /* 329 /*
347 * we call mmu_set_spte() with host_writable = true because that 330 * we call mmu_set_spte() with host_writable = true because
348 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 331 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
349 */ 332 */
350 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 333 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
351 NULL, PT_PAGE_TABLE_LEVEL, 334 NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
352 gpte_to_gfn(gpte), pfn, true, true); 335
336 return true;
337}
338
339static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
340 u64 *spte, const void *pte)
341{
342 pt_element_t gpte = *(const pt_element_t *)pte;
343
344 FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
353} 345}
354 346
355static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 347static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
395 spte = sp->spt + i; 387 spte = sp->spt + i;
396 388
397 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 389 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
398 pt_element_t gpte;
399 unsigned pte_access;
400 gfn_t gfn;
401 pfn_t pfn;
402
403 if (spte == sptep) 390 if (spte == sptep)
404 continue; 391 continue;
405 392
406 if (is_shadow_present_pte(*spte)) 393 if (is_shadow_present_pte(*spte))
407 continue; 394 continue;
408 395
409 gpte = gptep[i]; 396 if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
410
411 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
412 continue;
413
414 pte_access = sp->role.access & gpte_access(vcpu, gpte);
415 protect_clean_gpte(&pte_access, gpte);
416 gfn = gpte_to_gfn(gpte);
417 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
418 pte_access & ACC_WRITE_MASK);
419 if (mmu_invalid_pfn(pfn))
420 break; 397 break;
421
422 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
423 NULL, PT_PAGE_TABLE_LEVEL, gfn,
424 pfn, true, true);
425 } 398 }
426} 399}
427 400
428/* 401/*
429 * Fetch a shadow pte for a specific level in the paging hierarchy. 402 * Fetch a shadow pte for a specific level in the paging hierarchy.
403 * If the guest tries to write a write-protected page, we need to
404 * emulate this operation, return 1 to indicate this case.
430 */ 405 */
431static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 406static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
432 struct guest_walker *gw, 407 struct guest_walker *gw,
433 int user_fault, int write_fault, int hlevel, 408 int user_fault, int write_fault, int hlevel,
434 int *emulate, pfn_t pfn, bool map_writable, 409 pfn_t pfn, bool map_writable, bool prefault)
435 bool prefault)
436{ 410{
437 unsigned access = gw->pt_access;
438 struct kvm_mmu_page *sp = NULL; 411 struct kvm_mmu_page *sp = NULL;
439 int top_level;
440 unsigned direct_access;
441 struct kvm_shadow_walk_iterator it; 412 struct kvm_shadow_walk_iterator it;
413 unsigned direct_access, access = gw->pt_access;
414 int top_level, emulate = 0;
442 415
443 if (!is_present_gpte(gw->ptes[gw->level - 1])) 416 if (!is_present_gpte(gw->ptes[gw->level - 1]))
444 return NULL; 417 return 0;
445 418
446 direct_access = gw->pte_access; 419 direct_access = gw->pte_access;
447 420
@@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
505 478
506 clear_sp_write_flooding_count(it.sptep); 479 clear_sp_write_flooding_count(it.sptep);
507 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, 480 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
508 user_fault, write_fault, emulate, it.level, 481 user_fault, write_fault, &emulate, it.level,
509 gw->gfn, pfn, prefault, map_writable); 482 gw->gfn, pfn, prefault, map_writable);
510 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 483 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
511 484
512 return it.sptep; 485 return emulate;
513 486
514out_gpte_changed: 487out_gpte_changed:
515 if (sp) 488 if (sp)
516 kvm_mmu_put_page(sp, it.sptep); 489 kvm_mmu_put_page(sp, it.sptep);
517 kvm_release_pfn_clean(pfn); 490 kvm_release_pfn_clean(pfn);
518 return NULL; 491 return 0;
519} 492}
520 493
521/* 494/*
@@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
538 int write_fault = error_code & PFERR_WRITE_MASK; 511 int write_fault = error_code & PFERR_WRITE_MASK;
539 int user_fault = error_code & PFERR_USER_MASK; 512 int user_fault = error_code & PFERR_USER_MASK;
540 struct guest_walker walker; 513 struct guest_walker walker;
541 u64 *sptep;
542 int emulate = 0;
543 int r; 514 int r;
544 pfn_t pfn; 515 pfn_t pfn;
545 int level = PT_PAGE_TABLE_LEVEL; 516 int level = PT_PAGE_TABLE_LEVEL;
@@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
594 return r; 565 return r;
595 566
596 spin_lock(&vcpu->kvm->mmu_lock); 567 spin_lock(&vcpu->kvm->mmu_lock);
597 if (mmu_notifier_retry(vcpu, mmu_seq)) 568 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
598 goto out_unlock; 569 goto out_unlock;
599 570
600 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 571 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
601 kvm_mmu_free_some_pages(vcpu); 572 kvm_mmu_free_some_pages(vcpu);
602 if (!force_pt_level) 573 if (!force_pt_level)
603 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 574 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
604 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 575 r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
605 level, &emulate, pfn, map_writable, prefault); 576 level, pfn, map_writable, prefault);
606 (void)sptep;
607 pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
608 sptep, *sptep, emulate);
609
610 ++vcpu->stat.pf_fixed; 577 ++vcpu->stat.pf_fixed;
611 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 578 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
612 spin_unlock(&vcpu->kvm->mmu_lock); 579 spin_unlock(&vcpu->kvm->mmu_lock);
613 580
614 return emulate; 581 return r;
615 582
616out_unlock: 583out_unlock:
617 spin_unlock(&vcpu->kvm->mmu_lock); 584 spin_unlock(&vcpu->kvm->mmu_lock);
@@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
757 sizeof(pt_element_t))) 724 sizeof(pt_element_t)))
758 return -EINVAL; 725 return -EINVAL;
759 726
760 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 727 if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
761 vcpu->kvm->tlbs_dirty++; 728 vcpu->kvm->tlbs_dirty++;
762 continue; 729 continue;
763 } 730 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d017df3899ef..d29d3cd1c156 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -20,6 +20,7 @@
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h" 22#include "x86.h"
23#include "cpuid.h"
23 24
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mod_devicetable.h> 26#include <linux/mod_devicetable.h>
@@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage)
630 return -EBUSY; 631 return -EBUSY;
631 632
632 if (!has_svm()) { 633 if (!has_svm()) {
633 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", 634 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
634 me);
635 return -EINVAL; 635 return -EINVAL;
636 } 636 }
637 sd = per_cpu(svm_data, me); 637 sd = per_cpu(svm_data, me);
638
639 if (!sd) { 638 if (!sd) {
640 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", 639 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
641 me);
642 return -EINVAL; 640 return -EINVAL;
643 } 641 }
644 642
@@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1012 svm->tsc_ratio = ratio; 1010 svm->tsc_ratio = ratio;
1013} 1011}
1014 1012
1013static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1014{
1015 struct vcpu_svm *svm = to_svm(vcpu);
1016
1017 return svm->vmcb->control.tsc_offset;
1018}
1019
1015static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1020static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1016{ 1021{
1017 struct vcpu_svm *svm = to_svm(vcpu); 1022 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm)
1189static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 1194static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1190{ 1195{
1191 struct vcpu_svm *svm = to_svm(vcpu); 1196 struct vcpu_svm *svm = to_svm(vcpu);
1197 u32 dummy;
1198 u32 eax = 1;
1192 1199
1193 init_vmcb(svm); 1200 init_vmcb(svm);
1194 1201
@@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1197 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 1204 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
1198 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 1205 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
1199 } 1206 }
1200 vcpu->arch.regs_avail = ~0; 1207
1201 vcpu->arch.regs_dirty = ~0; 1208 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1209 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1202 1210
1203 return 0; 1211 return 0;
1204} 1212}
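
The two added lines implement the architected register state after INIT/RESET: (E)DX holds the processor signature, i.e. the same family/model/stepping word that CPUID leaf 01H returns in EAX. As a reminder (standard x86 encoding, not something introduced by this patch), the value written into RDX decodes as:

	/* RDX at vcpu reset = CPUID.01H:EAX
	 *   bits  3:0   stepping
	 *   bits  7:4   model
	 *   bits 11:8   family
	 *   bits 13:12  processor type
	 *   bits 19:16  extended model
	 *   bits 27:20  extended family
	 */
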
@@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1254 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1262 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1255 svm->asid_generation = 0; 1263 svm->asid_generation = 0;
1256 init_vmcb(svm); 1264 init_vmcb(svm);
1257 kvm_write_tsc(&svm->vcpu, 0);
1258
1259 err = fx_init(&svm->vcpu);
1260 if (err)
1261 goto free_page4;
1262 1265
1263 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 1266 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1264 if (kvm_vcpu_is_bsp(&svm->vcpu)) 1267 if (kvm_vcpu_is_bsp(&svm->vcpu))
@@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1268 1271
1269 return &svm->vcpu; 1272 return &svm->vcpu;
1270 1273
1271free_page4:
1272 __free_page(hsave_page);
1273free_page3: 1274free_page3:
1274 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 1275 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1275free_page2: 1276free_page2:
@@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm)
3008 return 0; 3009 return 0;
3009} 3010}
3010 3011
3011u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) 3012u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3012{ 3013{
3013 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); 3014 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3014 return vmcb->control.tsc_offset + 3015 return vmcb->control.tsc_offset +
3015 svm_scale_tsc(vcpu, native_read_tsc()); 3016 svm_scale_tsc(vcpu, host_tsc);
3016} 3017}
3017 3018
3018static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 3019static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
@@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3131 return 0; 3132 return 0;
3132} 3133}
3133 3134
3134static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 3135static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3135{ 3136{
3136 struct vcpu_svm *svm = to_svm(vcpu); 3137 struct vcpu_svm *svm = to_svm(vcpu);
3137 3138
3139 u32 ecx = msr->index;
3140 u64 data = msr->data;
3138 switch (ecx) { 3141 switch (ecx) {
3139 case MSR_IA32_TSC: 3142 case MSR_IA32_TSC:
3140 kvm_write_tsc(vcpu, data); 3143 kvm_write_tsc(vcpu, msr);
3141 break; 3144 break;
3142 case MSR_STAR: 3145 case MSR_STAR:
3143 svm->vmcb->save.star = data; 3146 svm->vmcb->save.star = data;
@@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3192 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3195 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3193 break; 3196 break;
3194 default: 3197 default:
3195 return kvm_set_msr_common(vcpu, ecx, data); 3198 return kvm_set_msr_common(vcpu, msr);
3196 } 3199 }
3197 return 0; 3200 return 0;
3198} 3201}
3199 3202
3200static int wrmsr_interception(struct vcpu_svm *svm) 3203static int wrmsr_interception(struct vcpu_svm *svm)
3201{ 3204{
3205 struct msr_data msr;
3202 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3206 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3203 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 3207 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
3204 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3208 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3205 3209
3210 msr.data = data;
3211 msr.index = ecx;
3212 msr.host_initiated = false;
3206 3213
3207 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3214 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3208 if (svm_set_msr(&svm->vcpu, ecx, data)) { 3215 if (svm_set_msr(&svm->vcpu, &msr)) {
3209 trace_kvm_msr_write_ex(ecx, data); 3216 trace_kvm_msr_write_ex(ecx, data);
3210 kvm_inject_gp(&svm->vcpu, 0); 3217 kvm_inject_gp(&svm->vcpu, 0);
3211 } else { 3218 } else {
@@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4302 .has_wbinvd_exit = svm_has_wbinvd_exit, 4309 .has_wbinvd_exit = svm_has_wbinvd_exit,
4303 4310
4304 .set_tsc_khz = svm_set_tsc_khz, 4311 .set_tsc_khz = svm_set_tsc_khz,
4312 .read_tsc_offset = svm_read_tsc_offset,
4305 .write_tsc_offset = svm_write_tsc_offset, 4313 .write_tsc_offset = svm_write_tsc_offset,
4306 .adjust_tsc_offset = svm_adjust_tsc_offset, 4314 .adjust_tsc_offset = svm_adjust_tsc_offset,
4307 .compute_tsc_offset = svm_compute_tsc_offset, 4315 .compute_tsc_offset = svm_compute_tsc_offset,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dccb..fe5e00ed7036 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
4#include <linux/tracepoint.h> 4#include <linux/tracepoint.h>
5#include <asm/vmx.h> 5#include <asm/vmx.h>
6#include <asm/svm.h> 6#include <asm/svm.h>
7#include <asm/clocksource.h>
7 8
8#undef TRACE_SYSTEM 9#undef TRACE_SYSTEM
9#define TRACE_SYSTEM kvm 10#define TRACE_SYSTEM kvm
@@ -754,6 +755,68 @@ TRACE_EVENT(
754 __entry->write ? "Write" : "Read", 755 __entry->write ? "Write" : "Read",
755 __entry->gpa_match ? "GPA" : "GVA") 756 __entry->gpa_match ? "GPA" : "GVA")
756); 757);
758
759#ifdef CONFIG_X86_64
760
761#define host_clocks \
762 {VCLOCK_NONE, "none"}, \
763 {VCLOCK_TSC, "tsc"}, \
764 {VCLOCK_HPET, "hpet"} \
765
766TRACE_EVENT(kvm_update_master_clock,
767 TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
768 TP_ARGS(use_master_clock, host_clock, offset_matched),
769
770 TP_STRUCT__entry(
771 __field( bool, use_master_clock )
772 __field( unsigned int, host_clock )
773 __field( bool, offset_matched )
774 ),
775
776 TP_fast_assign(
777 __entry->use_master_clock = use_master_clock;
778 __entry->host_clock = host_clock;
779 __entry->offset_matched = offset_matched;
780 ),
781
782 TP_printk("masterclock %d hostclock %s offsetmatched %u",
783 __entry->use_master_clock,
784 __print_symbolic(__entry->host_clock, host_clocks),
785 __entry->offset_matched)
786);
787
788TRACE_EVENT(kvm_track_tsc,
789 TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
790 unsigned int online_vcpus, bool use_master_clock,
791 unsigned int host_clock),
792 TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
793 host_clock),
794
795 TP_STRUCT__entry(
796 __field( unsigned int, vcpu_id )
797 __field( unsigned int, nr_vcpus_matched_tsc )
798 __field( unsigned int, online_vcpus )
799 __field( bool, use_master_clock )
800 __field( unsigned int, host_clock )
801 ),
802
803 TP_fast_assign(
804 __entry->vcpu_id = vcpu_id;
805 __entry->nr_vcpus_matched_tsc = nr_matched;
806 __entry->online_vcpus = online_vcpus;
807 __entry->use_master_clock = use_master_clock;
808 __entry->host_clock = host_clock;
809 ),
810
811 TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
812 " hostclock %s",
813 __entry->vcpu_id, __entry->use_master_clock,
814 __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
815 __print_symbolic(__entry->host_clock, host_clocks))
816);
817
818#endif /* CONFIG_X86_64 */
819
757#endif /* _TRACE_KVM_H */ 820#endif /* _TRACE_KVM_H */
758 821
759#undef TRACE_INCLUDE_PATH 822#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc6..9120ae1901e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
42#include <asm/i387.h> 42#include <asm/i387.h>
43#include <asm/xcr.h> 43#include <asm/xcr.h>
44#include <asm/perf_event.h> 44#include <asm/perf_event.h>
45#include <asm/kexec.h>
45 46
46#include "trace.h" 47#include "trace.h"
47 48
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
802 return vmx_capability.ept & VMX_EPT_AD_BIT; 803 return vmx_capability.ept & VMX_EPT_AD_BIT;
803} 804}
804 805
805static inline bool cpu_has_vmx_invept_individual_addr(void)
806{
807 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
808}
809
810static inline bool cpu_has_vmx_invept_context(void) 806static inline bool cpu_has_vmx_invept_context(void)
811{ 807{
812 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; 808 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
992 vmcs, phys_addr); 988 vmcs, phys_addr);
993} 989}
994 990
991#ifdef CONFIG_KEXEC
992/*
 993 * This bitmap indicates, per cpu, whether the vmclear
 994 * operation is enabled in the kdump path. All cpus are
 995 * disabled by default.
996 */
997static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
998
999static inline void crash_enable_local_vmclear(int cpu)
1000{
1001 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1002}
1003
1004static inline void crash_disable_local_vmclear(int cpu)
1005{
1006 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1007}
1008
1009static inline int crash_local_vmclear_enabled(int cpu)
1010{
1011 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1012}
1013
1014static void crash_vmclear_local_loaded_vmcss(void)
1015{
1016 int cpu = raw_smp_processor_id();
1017 struct loaded_vmcs *v;
1018
1019 if (!crash_local_vmclear_enabled(cpu))
1020 return;
1021
1022 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1023 loaded_vmcss_on_cpu_link)
1024 vmcs_clear(v->vmcs);
1025}
1026#else
1027static inline void crash_enable_local_vmclear(int cpu) { }
1028static inline void crash_disable_local_vmclear(int cpu) { }
1029#endif /* CONFIG_KEXEC */
1030
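
The helpers above only track, per cpu, whether it is safe to VMCLEAR loaded VMCSs from the crash path; the work itself is done by crash_vmclear_local_loaded_vmcss(), which vmx_init() further down publishes through the crash_vmclear_loaded_vmcss hook with rcu_assign_pointer(). A minimal sketch of how a kdump-side caller might consume that hook; the caller, the crash_vmclear_fn typedef and the extern declaration are assumptions for illustration and are not part of this hunk:

	/* sketch only: hypothetical kdump-side consumer of the hook */
	typedef void (crash_vmclear_fn)(void);
	extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;

	static void cpu_crash_vmclear_loaded_vmcss(void)
	{
		crash_vmclear_fn *do_vmclear;

		rcu_read_lock();
		do_vmclear = rcu_dereference(crash_vmclear_loaded_vmcss);
		if (do_vmclear)
			do_vmclear();	/* VMCLEARs this cpu's loaded_vmcss_on_cpu list */
		rcu_read_unlock();
	}

The rcu_assign_pointer(..., NULL) plus synchronize_rcu() in vmx_exit() below is what lets such a caller run safely against module unload.
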
995static void __loaded_vmcs_clear(void *arg) 1031static void __loaded_vmcs_clear(void *arg)
996{ 1032{
997 struct loaded_vmcs *loaded_vmcs = arg; 1033 struct loaded_vmcs *loaded_vmcs = arg;
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
1001 return; /* vcpu migration can race with cpu offline */ 1037 return; /* vcpu migration can race with cpu offline */
1002 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 1038 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1003 per_cpu(current_vmcs, cpu) = NULL; 1039 per_cpu(current_vmcs, cpu) = NULL;
1040 crash_disable_local_vmclear(cpu);
1004 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 1041 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1042
1043 /*
 1044 * We should ensure that the update of loaded_vmcs->loaded_vmcss_on_cpu_link
 1045 * happens before setting loaded_vmcs->cpu to -1, which is done in
 1046 * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first
 1047 * and then add the vmcs to the percpu list before it is deleted.
1048 */
1049 smp_wmb();
1050
1005 loaded_vmcs_init(loaded_vmcs); 1051 loaded_vmcs_init(loaded_vmcs);
1052 crash_enable_local_vmclear(cpu);
1006} 1053}
1007 1054
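
The smp_wmb() above pairs with the smp_rmb() added to vmx_vcpu_load() later in this patch. Condensed to the essential ordering (a paraphrase of the surrounding code, not new code):

	/* writer, __loaded_vmcs_clear():                                  */
	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
	smp_wmb();			/* publish the list removal first   */
	loaded_vmcs->cpu = -1;		/* done inside loaded_vmcs_init()   */

	/* reader, vmx_vcpu_load():                                        */
	/* ... reads loaded_vmcs->cpu to decide the vmcs must be (re)added */
	smp_rmb();			/* only then touch the per-cpu list */
	list_add(&loaded_vmcs->loaded_vmcss_on_cpu_link,
		 &per_cpu(loaded_vmcss_on_cpu, cpu));
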
1008static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 1055static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1009{ 1056{
1010 if (loaded_vmcs->cpu != -1) 1057 int cpu = loaded_vmcs->cpu;
1011 smp_call_function_single( 1058
1012 loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); 1059 if (cpu != -1)
1060 smp_call_function_single(cpu,
1061 __loaded_vmcs_clear, loaded_vmcs, 1);
1013} 1062}
1014 1063
1015static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 1064static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
1051 } 1100 }
1052} 1101}
1053 1102
1054static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
1055{
1056 if (enable_ept) {
1057 if (cpu_has_vmx_invept_individual_addr())
1058 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
1059 eptp, gpa);
1060 else
1061 ept_sync_context(eptp);
1062 }
1063}
1064
1065static __always_inline unsigned long vmcs_readl(unsigned long field) 1103static __always_inline unsigned long vmcs_readl(unsigned long field)
1066{ 1104{
1067 unsigned long value; 1105 unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1535 1573
1536 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1574 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1537 local_irq_disable(); 1575 local_irq_disable();
1576 crash_disable_local_vmclear(cpu);
1577
1578 /*
 1579 * The read of loaded_vmcs->cpu should happen before fetching
1580 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1581 * See the comments in __loaded_vmcs_clear().
1582 */
1583 smp_rmb();
1584
1538 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1585 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1539 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1586 &per_cpu(loaded_vmcss_on_cpu, cpu));
1587 crash_enable_local_vmclear(cpu);
1540 local_irq_enable(); 1588 local_irq_enable();
1541 1589
1542 /* 1590 /*
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
1839 * Like guest_read_tsc, but always returns L1's notion of the timestamp 1887 * Like guest_read_tsc, but always returns L1's notion of the timestamp
1840 * counter, even if a nested guest (L2) is currently running. 1888 * counter, even if a nested guest (L2) is currently running.
1841 */ 1889 */
1842u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) 1890u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1843{ 1891{
1844 u64 host_tsc, tsc_offset; 1892 u64 tsc_offset;
1845 1893
1846 rdtscll(host_tsc);
1847 tsc_offset = is_guest_mode(vcpu) ? 1894 tsc_offset = is_guest_mode(vcpu) ?
1848 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 1895 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
1849 vmcs_read64(TSC_OFFSET); 1896 vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1866 WARN(1, "user requested TSC rate below hardware speed\n"); 1913 WARN(1, "user requested TSC rate below hardware speed\n");
1867} 1914}
1868 1915
1916static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
1917{
1918 return vmcs_read64(TSC_OFFSET);
1919}
1920
1869/* 1921/*
1870 * writes 'offset' into guest's timestamp counter offset register 1922 * writes 'offset' into guest's timestamp counter offset register
1871 */ 1923 */
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2202 * Returns 0 on success, non-0 otherwise. 2254 * Returns 0 on success, non-0 otherwise.
2203 * Assumes vcpu_load() was already called. 2255 * Assumes vcpu_load() was already called.
2204 */ 2256 */
2205static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2257static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2206{ 2258{
2207 struct vcpu_vmx *vmx = to_vmx(vcpu); 2259 struct vcpu_vmx *vmx = to_vmx(vcpu);
2208 struct shared_msr_entry *msr; 2260 struct shared_msr_entry *msr;
2209 int ret = 0; 2261 int ret = 0;
2262 u32 msr_index = msr_info->index;
2263 u64 data = msr_info->data;
2210 2264
2211 switch (msr_index) { 2265 switch (msr_index) {
2212 case MSR_EFER: 2266 case MSR_EFER:
2213 ret = kvm_set_msr_common(vcpu, msr_index, data); 2267 ret = kvm_set_msr_common(vcpu, msr_info);
2214 break; 2268 break;
2215#ifdef CONFIG_X86_64 2269#ifdef CONFIG_X86_64
2216 case MSR_FS_BASE: 2270 case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2236 vmcs_writel(GUEST_SYSENTER_ESP, data); 2290 vmcs_writel(GUEST_SYSENTER_ESP, data);
2237 break; 2291 break;
2238 case MSR_IA32_TSC: 2292 case MSR_IA32_TSC:
2239 kvm_write_tsc(vcpu, data); 2293 kvm_write_tsc(vcpu, msr_info);
2240 break; 2294 break;
2241 case MSR_IA32_CR_PAT: 2295 case MSR_IA32_CR_PAT:
2242 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2296 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2244 vcpu->arch.pat = data; 2298 vcpu->arch.pat = data;
2245 break; 2299 break;
2246 } 2300 }
2247 ret = kvm_set_msr_common(vcpu, msr_index, data); 2301 ret = kvm_set_msr_common(vcpu, msr_info);
2302 break;
2303 case MSR_IA32_TSC_ADJUST:
2304 ret = kvm_set_msr_common(vcpu, msr_info);
2248 break; 2305 break;
2249 case MSR_TSC_AUX: 2306 case MSR_TSC_AUX:
2250 if (!vmx->rdtscp_enabled) 2307 if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2267 } 2324 }
2268 break; 2325 break;
2269 } 2326 }
2270 ret = kvm_set_msr_common(vcpu, msr_index, data); 2327 ret = kvm_set_msr_common(vcpu, msr_info);
2271 } 2328 }
2272 2329
2273 return ret; 2330 return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
2341 return -EBUSY; 2398 return -EBUSY;
2342 2399
2343 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2400 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2401
2402 /*
2403 * Now we can enable the vmclear operation in kdump
2404 * since the loaded_vmcss_on_cpu list on this cpu
2405 * has been initialized.
2406 *
 2407 * Though the cpu is not in VMX operation now, it is
 2408 * safe to enable the vmclear operation here, since
 2409 * the loaded_vmcss_on_cpu list is still empty.
2410 */
2411 crash_enable_local_vmclear(cpu);
2412
2344 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2413 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2345 2414
2346 test_bits = FEATURE_CONTROL_LOCKED; 2415 test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
2697 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { 2766 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
2698 tmp.base = vmcs_readl(sf->base); 2767 tmp.base = vmcs_readl(sf->base);
2699 tmp.selector = vmcs_read16(sf->selector); 2768 tmp.selector = vmcs_read16(sf->selector);
2769 tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
2700 tmp.s = 1; 2770 tmp.s = 1;
2701 } 2771 }
2702 vmx_set_segment(vcpu, &tmp, seg); 2772 vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 3246 * unrestricted guest like Westmere to an older host that doesn't have 3316 * unrestricted guest like Westmere to an older host that doesn't have
 3247 * unrestricted guest, like Nehalem. 3317 * unrestricted guest, like Nehalem.
3248 */ 3318 */
3249 if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { 3319 if (vmx->rmode.vm86_active) {
3250 switch (seg) { 3320 switch (seg) {
3251 case VCPU_SREG_CS: 3321 case VCPU_SREG_CS:
3252 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 3322 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3897 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3967 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
3898 set_cr4_guest_host_mask(vmx); 3968 set_cr4_guest_host_mask(vmx);
3899 3969
3900 kvm_write_tsc(&vmx->vcpu, 0);
3901
3902 return 0; 3970 return 0;
3903} 3971}
3904 3972
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3908 u64 msr; 3976 u64 msr;
3909 int ret; 3977 int ret;
3910 3978
3911 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3912
3913 vmx->rmode.vm86_active = 0; 3979 vmx->rmode.vm86_active = 0;
3914 3980
3915 vmx->soft_vnmi_blocked = 0; 3981 vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3921 msr |= MSR_IA32_APICBASE_BSP; 3987 msr |= MSR_IA32_APICBASE_BSP;
3922 kvm_set_apic_base(&vmx->vcpu, msr); 3988 kvm_set_apic_base(&vmx->vcpu, msr);
3923 3989
3924 ret = fx_init(&vmx->vcpu);
3925 if (ret != 0)
3926 goto out;
3927
3928 vmx_segment_cache_clear(vmx); 3990 vmx_segment_cache_clear(vmx);
3929 3991
3930 seg_setup(VCPU_SREG_CS); 3992 seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3965 kvm_rip_write(vcpu, 0xfff0); 4027 kvm_rip_write(vcpu, 0xfff0);
3966 else 4028 else
3967 kvm_rip_write(vcpu, 0); 4029 kvm_rip_write(vcpu, 0);
3968 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
3969 4030
3970 vmcs_writel(GUEST_GDTR_BASE, 0); 4031 vmcs_writel(GUEST_GDTR_BASE, 0);
3971 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4032 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4015 /* HACK: Don't enable emulation on guest boot/reset */ 4076 /* HACK: Don't enable emulation on guest boot/reset */
4016 vmx->emulation_required = 0; 4077 vmx->emulation_required = 0;
4017 4078
4018out:
4019 return ret; 4079 return ret;
4020} 4080}
4021 4081
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4287 if (is_machine_check(intr_info)) 4347 if (is_machine_check(intr_info))
4288 return handle_machine_check(vcpu); 4348 return handle_machine_check(vcpu);
4289 4349
4290 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4291 !is_page_fault(intr_info)) {
4292 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4293 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4294 vcpu->run->internal.ndata = 2;
4295 vcpu->run->internal.data[0] = vect_info;
4296 vcpu->run->internal.data[1] = intr_info;
4297 return 0;
4298 }
4299
4300 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 4350 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
4301 return 1; /* already handled by vmx_vcpu_run() */ 4351 return 1; /* already handled by vmx_vcpu_run() */
4302 4352
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4315 error_code = 0; 4365 error_code = 0;
4316 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 4366 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4317 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4367 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4368
4369 /*
 4370 * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing
 4371 * MMIO; it is better to report an internal error.
4372 * See the comments in vmx_handle_exit.
4373 */
4374 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4375 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4376 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4377 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4378 vcpu->run->internal.ndata = 2;
4379 vcpu->run->internal.data[0] = vect_info;
4380 vcpu->run->internal.data[1] = intr_info;
4381 return 0;
4382 }
4383
4318 if (is_page_fault(intr_info)) { 4384 if (is_page_fault(intr_info)) {
4319 /* EPT won't cause page fault directly */ 4385 /* EPT won't cause page fault directly */
4320 BUG_ON(enable_ept); 4386 BUG_ON(enable_ept);
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
4626 4692
4627static int handle_wrmsr(struct kvm_vcpu *vcpu) 4693static int handle_wrmsr(struct kvm_vcpu *vcpu)
4628{ 4694{
4695 struct msr_data msr;
4629 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4696 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
4630 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 4697 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
4631 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 4698 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
4632 4699
4633 if (vmx_set_msr(vcpu, ecx, data) != 0) { 4700 msr.data = data;
4701 msr.index = ecx;
4702 msr.host_initiated = false;
4703 if (vmx_set_msr(vcpu, &msr) != 0) {
4634 trace_kvm_msr_write_ex(ecx, data); 4704 trace_kvm_msr_write_ex(ecx, data);
4635 kvm_inject_gp(vcpu, 0); 4705 kvm_inject_gp(vcpu, 0);
4636 return 1; 4706 return 1;
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4827 4897
4828 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4898 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4829 4899
4830 if (exit_qualification & (1 << 6)) {
4831 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
4832 return -EINVAL;
4833 }
4834
4835 gla_validity = (exit_qualification >> 7) & 0x3; 4900 gla_validity = (exit_qualification >> 7) & 0x3;
4836 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { 4901 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
4837 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 4902 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
5979 return 0; 6044 return 0;
5980 } 6045 }
5981 6046
6047 /*
6048 * Note:
 6049 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it is caused by
 6050 * a delivery event, since that indicates the guest is accessing MMIO.
 6051 * The vm-exit would be triggered again after returning to the guest,
 6052 * which would cause an infinite loop.
6053 */
5982 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6054 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
5983 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 6055 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
5984 exit_reason != EXIT_REASON_EPT_VIOLATION && 6056 exit_reason != EXIT_REASON_EPT_VIOLATION &&
5985 exit_reason != EXIT_REASON_TASK_SWITCH)) 6057 exit_reason != EXIT_REASON_TASK_SWITCH)) {
5986 printk(KERN_WARNING "%s: unexpected, valid vectoring info " 6058 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5987 "(0x%x) and exit reason is 0x%x\n", 6059 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
5988 __func__, vectoring_info, exit_reason); 6060 vcpu->run->internal.ndata = 2;
6061 vcpu->run->internal.data[0] = vectoring_info;
6062 vcpu->run->internal.data[1] = exit_reason;
6063 return 0;
6064 }
5989 6065
5990 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6066 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
5991 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6067 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
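
Rather than a printk that nobody acts on, the condition is now reported to userspace with the new KVM_INTERNAL_ERROR_DELIVERY_EV suberror. A rough sketch of how a VMM could surface it after KVM_RUN returns (illustrative only; assumes run points at the mmap'ed struct kvm_run):

	if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
	    run->internal.suberror == KVM_INTERNAL_ERROR_DELIVERY_EV) {
		/* data[0] = vectoring_info, data[1] = exit_reason, per the code above */
		fprintf(stderr, "event delivery hit exit reason 0x%llx (vectoring info 0x%llx)\n",
			(unsigned long long)run->internal.data[1],
			(unsigned long long)run->internal.data[0]);
	}
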
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7309 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 7385 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
7310 7386
7311 .set_tsc_khz = vmx_set_tsc_khz, 7387 .set_tsc_khz = vmx_set_tsc_khz,
7388 .read_tsc_offset = vmx_read_tsc_offset,
7312 .write_tsc_offset = vmx_write_tsc_offset, 7389 .write_tsc_offset = vmx_write_tsc_offset,
7313 .adjust_tsc_offset = vmx_adjust_tsc_offset, 7390 .adjust_tsc_offset = vmx_adjust_tsc_offset,
7314 .compute_tsc_offset = vmx_compute_tsc_offset, 7391 .compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
7367 if (r) 7444 if (r)
7368 goto out3; 7445 goto out3;
7369 7446
7447#ifdef CONFIG_KEXEC
7448 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
7449 crash_vmclear_local_loaded_vmcss);
7450#endif
7451
7370 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 7452 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
7371 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 7453 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
7372 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 7454 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
7404 free_page((unsigned long)vmx_io_bitmap_b); 7486 free_page((unsigned long)vmx_io_bitmap_b);
7405 free_page((unsigned long)vmx_io_bitmap_a); 7487 free_page((unsigned long)vmx_io_bitmap_a);
7406 7488
7489#ifdef CONFIG_KEXEC
7490 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
7491 synchronize_rcu();
7492#endif
7493
7407 kvm_exit(); 7494 kvm_exit();
7408} 7495}
7409 7496
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f7641756be2..76f54461f7cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,8 @@
46#include <linux/uaccess.h> 46#include <linux/uaccess.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/pci.h> 48#include <linux/pci.h>
49#include <linux/timekeeper_internal.h>
50#include <linux/pvclock_gtod.h>
49#include <trace/events/kvm.h> 51#include <trace/events/kvm.h>
50 52
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
158 160
159u64 __read_mostly host_xcr0; 161u64 __read_mostly host_xcr0;
160 162
161int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 163static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
164
165static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
162 166
163static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 167static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
164{ 168{
@@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
633 } 637 }
634 638
635 if (is_long_mode(vcpu)) { 639 if (is_long_mode(vcpu)) {
636 if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { 640 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
637 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 641 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
638 return 1; 642 return 1;
639 } else 643 } else
@@ -827,6 +831,7 @@ static u32 msrs_to_save[] = {
827static unsigned num_msrs_to_save; 831static unsigned num_msrs_to_save;
828 832
829static const u32 emulated_msrs[] = { 833static const u32 emulated_msrs[] = {
834 MSR_IA32_TSC_ADJUST,
830 MSR_IA32_TSCDEADLINE, 835 MSR_IA32_TSCDEADLINE,
831 MSR_IA32_MISC_ENABLE, 836 MSR_IA32_MISC_ENABLE,
832 MSR_IA32_MCG_STATUS, 837 MSR_IA32_MCG_STATUS,
@@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
886 * Returns 0 on success, non-0 otherwise. 891 * Returns 0 on success, non-0 otherwise.
887 * Assumes vcpu_load() was already called. 892 * Assumes vcpu_load() was already called.
888 */ 893 */
889int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 894int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
890{ 895{
891 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 896 return kvm_x86_ops->set_msr(vcpu, msr);
892} 897}
893 898
894/* 899/*
@@ -896,9 +901,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
896 */ 901 */
897static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 902static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
898{ 903{
899 return kvm_set_msr(vcpu, index, *data); 904 struct msr_data msr;
905
906 msr.data = *data;
907 msr.index = index;
908 msr.host_initiated = true;
909 return kvm_set_msr(vcpu, &msr);
900} 910}
901 911
912#ifdef CONFIG_X86_64
913struct pvclock_gtod_data {
914 seqcount_t seq;
915
916 struct { /* extract of a clocksource struct */
917 int vclock_mode;
918 cycle_t cycle_last;
919 cycle_t mask;
920 u32 mult;
921 u32 shift;
922 } clock;
923
924 /* open coded 'struct timespec' */
925 u64 monotonic_time_snsec;
926 time_t monotonic_time_sec;
927};
928
929static struct pvclock_gtod_data pvclock_gtod_data;
930
931static void update_pvclock_gtod(struct timekeeper *tk)
932{
933 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
934
935 write_seqcount_begin(&vdata->seq);
936
937 /* copy pvclock gtod data */
938 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
939 vdata->clock.cycle_last = tk->clock->cycle_last;
940 vdata->clock.mask = tk->clock->mask;
941 vdata->clock.mult = tk->mult;
942 vdata->clock.shift = tk->shift;
943
944 vdata->monotonic_time_sec = tk->xtime_sec
945 + tk->wall_to_monotonic.tv_sec;
946 vdata->monotonic_time_snsec = tk->xtime_nsec
947 + (tk->wall_to_monotonic.tv_nsec
948 << tk->shift);
949 while (vdata->monotonic_time_snsec >=
950 (((u64)NSEC_PER_SEC) << tk->shift)) {
951 vdata->monotonic_time_snsec -=
952 ((u64)NSEC_PER_SEC) << tk->shift;
953 vdata->monotonic_time_sec++;
954 }
955
956 write_seqcount_end(&vdata->seq);
957}
958#endif
959
960
902static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 961static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
903{ 962{
904 int version; 963 int version;
@@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void)
995 return timespec_to_ns(&ts); 1054 return timespec_to_ns(&ts);
996} 1055}
997 1056
1057#ifdef CONFIG_X86_64
1058static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1059#endif
1060
998static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1061static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
999unsigned long max_tsc_khz; 1062unsigned long max_tsc_khz;
1000 1063
@@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1046 return tsc; 1109 return tsc;
1047} 1110}
1048 1111
1049void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) 1112void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1113{
1114#ifdef CONFIG_X86_64
1115 bool vcpus_matched;
1116 bool do_request = false;
1117 struct kvm_arch *ka = &vcpu->kvm->arch;
1118 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1119
1120 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1121 atomic_read(&vcpu->kvm->online_vcpus));
1122
1123 if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
1124 if (!ka->use_master_clock)
1125 do_request = 1;
1126
1127 if (!vcpus_matched && ka->use_master_clock)
1128 do_request = 1;
1129
1130 if (do_request)
1131 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1132
1133 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1134 atomic_read(&vcpu->kvm->online_vcpus),
1135 ka->use_master_clock, gtod->clock.vclock_mode);
1136#endif
1137}
1138
1139static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1140{
1141 u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1142 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1143}
1144
1145void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1050{ 1146{
1051 struct kvm *kvm = vcpu->kvm; 1147 struct kvm *kvm = vcpu->kvm;
1052 u64 offset, ns, elapsed; 1148 u64 offset, ns, elapsed;
1053 unsigned long flags; 1149 unsigned long flags;
1054 s64 usdiff; 1150 s64 usdiff;
1151 bool matched;
1152 u64 data = msr->data;
1055 1153
1056 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1154 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1057 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1155 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1094 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1192 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1095 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1193 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1096 } 1194 }
1195 matched = true;
1097 } else { 1196 } else {
1098 /* 1197 /*
1099 * We split periods of matched TSC writes into generations. 1198 * We split periods of matched TSC writes into generations.
@@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1108 kvm->arch.cur_tsc_nsec = ns; 1207 kvm->arch.cur_tsc_nsec = ns;
1109 kvm->arch.cur_tsc_write = data; 1208 kvm->arch.cur_tsc_write = data;
1110 kvm->arch.cur_tsc_offset = offset; 1209 kvm->arch.cur_tsc_offset = offset;
1210 matched = false;
1111 pr_debug("kvm: new tsc generation %u, clock %llu\n", 1211 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1112 kvm->arch.cur_tsc_generation, data); 1212 kvm->arch.cur_tsc_generation, data);
1113 } 1213 }
@@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1129 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1229 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1130 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1230 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1131 1231
1232 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1233 update_ia32_tsc_adjust_msr(vcpu, offset);
1132 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1234 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1133 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1235 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1236
1237 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1238 if (matched)
1239 kvm->arch.nr_vcpus_matched_tsc++;
1240 else
1241 kvm->arch.nr_vcpus_matched_tsc = 0;
1242
1243 kvm_track_tsc_matching(vcpu);
1244 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1134} 1245}
1135 1246
1136EXPORT_SYMBOL_GPL(kvm_write_tsc); 1247EXPORT_SYMBOL_GPL(kvm_write_tsc);
1137 1248
1249#ifdef CONFIG_X86_64
1250
1251static cycle_t read_tsc(void)
1252{
1253 cycle_t ret;
1254 u64 last;
1255
1256 /*
 1257 * Empirically, a fence (of a type that depends on the CPU)
1258 * before rdtsc is enough to ensure that rdtsc is ordered
1259 * with respect to loads. The various CPU manuals are unclear
1260 * as to whether rdtsc can be reordered with later loads,
1261 * but no one has ever seen it happen.
1262 */
1263 rdtsc_barrier();
1264 ret = (cycle_t)vget_cycles();
1265
1266 last = pvclock_gtod_data.clock.cycle_last;
1267
1268 if (likely(ret >= last))
1269 return ret;
1270
1271 /*
1272 * GCC likes to generate cmov here, but this branch is extremely
 1273 * predictable (it's just a function of time and the likely is
1274 * very likely) and there's a data dependence, so force GCC
1275 * to generate a branch instead. I don't barrier() because
1276 * we don't actually need a barrier, and if this function
1277 * ever gets inlined it will generate worse code.
1278 */
1279 asm volatile ("");
1280 return last;
1281}
1282
1283static inline u64 vgettsc(cycle_t *cycle_now)
1284{
1285 long v;
1286 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1287
1288 *cycle_now = read_tsc();
1289
1290 v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1291 return v * gtod->clock.mult;
1292}
1293
1294static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
1295{
1296 unsigned long seq;
1297 u64 ns;
1298 int mode;
1299 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1300
1301 ts->tv_nsec = 0;
1302 do {
1303 seq = read_seqcount_begin(&gtod->seq);
1304 mode = gtod->clock.vclock_mode;
1305 ts->tv_sec = gtod->monotonic_time_sec;
1306 ns = gtod->monotonic_time_snsec;
1307 ns += vgettsc(cycle_now);
1308 ns >>= gtod->clock.shift;
1309 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1310 timespec_add_ns(ts, ns);
1311
1312 return mode;
1313}
1314
1315/* returns true if host is using tsc clocksource */
1316static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1317{
1318 struct timespec ts;
1319
1320 /* checked again under seqlock below */
1321 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1322 return false;
1323
1324 if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
1325 return false;
1326
1327 monotonic_to_bootbased(&ts);
1328 *kernel_ns = timespec_to_ns(&ts);
1329
1330 return true;
1331}
1332#endif
1333
1334/*
1335 *
1336 * Assuming a stable TSC across physical CPUS, and a stable TSC
1337 * across virtual CPUs, the following condition is possible.
1338 * Each numbered line represents an event visible to both
1339 * CPUs at the next numbered event.
1340 *
1341 * "timespecX" represents host monotonic time. "tscX" represents
1342 * RDTSC value.
1343 *
1344 * VCPU0 on CPU0 | VCPU1 on CPU1
1345 *
1346 * 1. read timespec0,tsc0
1347 * 2. | timespec1 = timespec0 + N
1348 * | tsc1 = tsc0 + M
1349 * 3. transition to guest | transition to guest
1350 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1351 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
1352 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1353 *
1354 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1355 *
1356 * - ret0 < ret1
1357 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1358 * ...
1359 * - 0 < N - M => M < N
1360 *
1361 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1362 * always the case (the difference between two distinct xtime instances
 1363 * might be smaller than the difference between corresponding TSC reads,
 1364 * when updating guest vcpus' pvclock areas).
1365 *
1366 * To avoid that problem, do not allow visibility of distinct
1367 * system_timestamp/tsc_timestamp values simultaneously: use a master
1368 * copy of host monotonic time values. Update that master copy
1369 * in lockstep.
1370 *
1371 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1372 *
1373 */
1374
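
To make the inequality above concrete, a tiny numeric example (values are arbitrary, one TSC tick taken as 1 ns):

	timespec0 = 1000, tsc0 = 0
	VCPU1 samples slightly later:  N = 3  (xtime advanced by 3 ns)
	                               M = 10 (TSC advanced by 10 ticks)
	guest reads kvmclock on both VCPUs at rdtsc = 20:
	  ret0 = 1000     + (20 -  0) = 1020
	  ret1 = 1000 + 3 + (20 - 10) = 1013 < ret0

Since M > N here, the M < N requirement is violated: a guest that reads VCPU0's clock and then VCPU1's sees time jump backwards, which is exactly what the master-copy scheme below prevents.
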
1375static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1376{
1377#ifdef CONFIG_X86_64
1378 struct kvm_arch *ka = &kvm->arch;
1379 int vclock_mode;
1380 bool host_tsc_clocksource, vcpus_matched;
1381
1382 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1383 atomic_read(&kvm->online_vcpus));
1384
1385 /*
 1386 * If the host uses the TSC clock, then pass the TSC through as stable
1387 * to the guest.
1388 */
1389 host_tsc_clocksource = kvm_get_time_and_clockread(
1390 &ka->master_kernel_ns,
1391 &ka->master_cycle_now);
1392
1393 ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
1394
1395 if (ka->use_master_clock)
1396 atomic_set(&kvm_guest_has_master_clock, 1);
1397
1398 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1399 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1400 vcpus_matched);
1401#endif
1402}
1403
1138static int kvm_guest_time_update(struct kvm_vcpu *v) 1404static int kvm_guest_time_update(struct kvm_vcpu *v)
1139{ 1405{
1140 unsigned long flags; 1406 unsigned long flags, this_tsc_khz;
1141 struct kvm_vcpu_arch *vcpu = &v->arch; 1407 struct kvm_vcpu_arch *vcpu = &v->arch;
1408 struct kvm_arch *ka = &v->kvm->arch;
1142 void *shared_kaddr; 1409 void *shared_kaddr;
1143 unsigned long this_tsc_khz;
1144 s64 kernel_ns, max_kernel_ns; 1410 s64 kernel_ns, max_kernel_ns;
1145 u64 tsc_timestamp; 1411 u64 tsc_timestamp, host_tsc;
1412 struct pvclock_vcpu_time_info *guest_hv_clock;
1146 u8 pvclock_flags; 1413 u8 pvclock_flags;
1414 bool use_master_clock;
1415
1416 kernel_ns = 0;
1417 host_tsc = 0;
1147 1418
1148 /* Keep irq disabled to prevent changes to the clock */ 1419 /* Keep irq disabled to prevent changes to the clock */
1149 local_irq_save(flags); 1420 local_irq_save(flags);
1150 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1151 kernel_ns = get_kernel_ns();
1152 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1421 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1153 if (unlikely(this_tsc_khz == 0)) { 1422 if (unlikely(this_tsc_khz == 0)) {
1154 local_irq_restore(flags); 1423 local_irq_restore(flags);
@@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1157 } 1426 }
1158 1427
1159 /* 1428 /*
 1429 * If the host uses the TSC clock, then pass the TSC through as stable
1430 * to the guest.
1431 */
1432 spin_lock(&ka->pvclock_gtod_sync_lock);
1433 use_master_clock = ka->use_master_clock;
1434 if (use_master_clock) {
1435 host_tsc = ka->master_cycle_now;
1436 kernel_ns = ka->master_kernel_ns;
1437 }
1438 spin_unlock(&ka->pvclock_gtod_sync_lock);
1439 if (!use_master_clock) {
1440 host_tsc = native_read_tsc();
1441 kernel_ns = get_kernel_ns();
1442 }
1443
1444 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1445
1446 /*
1160 * We may have to catch up the TSC to match elapsed wall clock 1447 * We may have to catch up the TSC to match elapsed wall clock
1161 * time for two reasons, even if kvmclock is used. 1448 * time for two reasons, even if kvmclock is used.
1162 * 1) CPU could have been running below the maximum TSC rate 1449 * 1) CPU could have been running below the maximum TSC rate
@@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1217 vcpu->hw_tsc_khz = this_tsc_khz; 1504 vcpu->hw_tsc_khz = this_tsc_khz;
1218 } 1505 }
1219 1506
1220 if (max_kernel_ns > kernel_ns) 1507 /* with a master <monotonic time, tsc value> tuple,
1221 kernel_ns = max_kernel_ns; 1508 * pvclock clock reads always increase at the (scaled) rate
1222 1509 * of guest TSC - no need to deal with sampling errors.
1510 */
1511 if (!use_master_clock) {
1512 if (max_kernel_ns > kernel_ns)
1513 kernel_ns = max_kernel_ns;
1514 }
1223 /* With all the info we got, fill in the values */ 1515 /* With all the info we got, fill in the values */
1224 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1516 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1225 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1517 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1226 vcpu->last_kernel_ns = kernel_ns; 1518 vcpu->last_kernel_ns = kernel_ns;
1227 vcpu->last_guest_tsc = tsc_timestamp; 1519 vcpu->last_guest_tsc = tsc_timestamp;
1228 1520
1229 pvclock_flags = 0;
1230 if (vcpu->pvclock_set_guest_stopped_request) {
1231 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1232 vcpu->pvclock_set_guest_stopped_request = false;
1233 }
1234
1235 vcpu->hv_clock.flags = pvclock_flags;
1236
1237 /* 1521 /*
1238 * The interface expects us to write an even number signaling that the 1522 * The interface expects us to write an even number signaling that the
1239 * update is finished. Since the guest won't see the intermediate 1523 * update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1243 1527
1244 shared_kaddr = kmap_atomic(vcpu->time_page); 1528 shared_kaddr = kmap_atomic(vcpu->time_page);
1245 1529
1530 guest_hv_clock = shared_kaddr + vcpu->time_offset;
1531
1532 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1533 pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
1534
1535 if (vcpu->pvclock_set_guest_stopped_request) {
1536 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1537 vcpu->pvclock_set_guest_stopped_request = false;
1538 }
1539
1540 /* If the host uses TSC clocksource, then it is stable */
1541 if (use_master_clock)
1542 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1543
1544 vcpu->hv_clock.flags = pvclock_flags;
1545
1246 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 1546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
1247 sizeof(vcpu->hv_clock)); 1547 sizeof(vcpu->hv_clock));
1248 1548
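
For context, the structure copied into the shared page here is consumed by the guest with the standard pvclock protocol: retry while the version is odd or changes across the read, then scale the TSC delta. A guest-side sketch, assuming the usual pvclock field names and the pvclock_scale_delta() helper (illustrative, not part of this patch):

	/* src points at the guest's struct pvclock_vcpu_time_info */
	u32 version;
	u64 delta, ns;
	u8 flags;

	do {
		version = src->version;
		rmb();
		delta = native_read_tsc() - src->tsc_timestamp;
		ns = src->system_time +
		     pvclock_scale_delta(delta, src->tsc_to_system_mul,
					 src->tsc_shift);
		flags = src->flags;
		rmb();
	} while ((src->version & 1) || version != src->version);

	/* If PVCLOCK_TSC_STABLE_BIT is set in flags (the new bit written
	 * above when use_master_clock is true), ns is globally monotonic and
	 * the guest can use it directly; otherwise the guest must enforce
	 * monotonicity itself, e.g. with a global last-value cmpxchg. */
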
@@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
1572 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 1872 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1573} 1873}
1574 1874
1575int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1875int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1576{ 1876{
1577 bool pr = false; 1877 bool pr = false;
1878 u32 msr = msr_info->index;
1879 u64 data = msr_info->data;
1578 1880
1579 switch (msr) { 1881 switch (msr) {
1580 case MSR_EFER: 1882 case MSR_EFER:
@@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1625 case MSR_IA32_TSCDEADLINE: 1927 case MSR_IA32_TSCDEADLINE:
1626 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1928 kvm_set_lapic_tscdeadline_msr(vcpu, data);
1627 break; 1929 break;
1930 case MSR_IA32_TSC_ADJUST:
1931 if (guest_cpuid_has_tsc_adjust(vcpu)) {
1932 if (!msr_info->host_initiated) {
1933 u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
1934 kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
1935 }
1936 vcpu->arch.ia32_tsc_adjust_msr = data;
1937 }
1938 break;
1628 case MSR_IA32_MISC_ENABLE: 1939 case MSR_IA32_MISC_ENABLE:
1629 vcpu->arch.ia32_misc_enable_msr = data; 1940 vcpu->arch.ia32_misc_enable_msr = data;
1630 break; 1941 break;
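
Together with update_ia32_tsc_adjust_msr() earlier in this patch, the new MSR_IA32_TSC_ADJUST case emulates the architectural coupling between the two MSRs for guest-initiated writes. Summarized as a sketch (a paraphrase of the intended behaviour, not code from this patch):

	/* guest wrmsr, i.e. msr_info->host_initiated == false:
	 *
	 *   wrmsr(IA32_TSC, t):         TSC        += delta
	 *                               TSC_ADJUST += delta     (delta = t - old TSC)
	 *
	 *   wrmsr(IA32_TSC_ADJUST, a):  TSC        += a - old TSC_ADJUST
	 *                               TSC_ADJUST  = a
	 *
	 * Host-initiated writes (e.g. restoring state after migration) set only
	 * the MSR being written, without applying the coupled adjustment to the
	 * other one.
	 */
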
@@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1984 case MSR_IA32_TSCDEADLINE: 2295 case MSR_IA32_TSCDEADLINE:
1985 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2296 data = kvm_get_lapic_tscdeadline_msr(vcpu);
1986 break; 2297 break;
2298 case MSR_IA32_TSC_ADJUST:
2299 data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2300 break;
1987 case MSR_IA32_MISC_ENABLE: 2301 case MSR_IA32_MISC_ENABLE:
1988 data = vcpu->arch.ia32_misc_enable_msr; 2302 data = vcpu->arch.ia32_misc_enable_msr;
1989 break; 2303 break;
@@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2342 kvm_x86_ops->write_tsc_offset(vcpu, offset); 2656 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2343 vcpu->arch.tsc_catchup = 1; 2657 vcpu->arch.tsc_catchup = 1;
2344 } 2658 }
2345 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2659 /*
2660 * On a host with synchronized TSC, there is no need to update
2661 * kvmclock on vcpu->cpu migration
2662 */
2663 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2346 if (vcpu->cpu != cpu) 2665 if (vcpu->cpu != cpu)
2347 kvm_migrate_timers(vcpu); 2666 kvm_migrate_timers(vcpu);
2348 vcpu->cpu = cpu; 2667 vcpu->cpu = cpu;
@@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2691 if (!vcpu->arch.apic) 3010 if (!vcpu->arch.apic)
2692 goto out; 3011 goto out;
2693 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 3012 u.lapic = memdup_user(argp, sizeof(*u.lapic));
2694 if (IS_ERR(u.lapic)) { 3013 if (IS_ERR(u.lapic))
2695 r = PTR_ERR(u.lapic); 3014 return PTR_ERR(u.lapic);
2696 goto out;
2697 }
2698 3015
2699 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 3016 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
2700 if (r)
2701 goto out;
2702 r = 0;
2703 break; 3017 break;
2704 } 3018 }
2705 case KVM_INTERRUPT: { 3019 case KVM_INTERRUPT: {
@@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2709 if (copy_from_user(&irq, argp, sizeof irq)) 3023 if (copy_from_user(&irq, argp, sizeof irq))
2710 goto out; 3024 goto out;
2711 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 3025 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2712 if (r)
2713 goto out;
2714 r = 0;
2715 break; 3026 break;
2716 } 3027 }
2717 case KVM_NMI: { 3028 case KVM_NMI: {
2718 r = kvm_vcpu_ioctl_nmi(vcpu); 3029 r = kvm_vcpu_ioctl_nmi(vcpu);
2719 if (r)
2720 goto out;
2721 r = 0;
2722 break; 3030 break;
2723 } 3031 }
2724 case KVM_SET_CPUID: { 3032 case KVM_SET_CPUID: {
@@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2729 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3037 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2730 goto out; 3038 goto out;
2731 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3039 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2732 if (r)
2733 goto out;
2734 break; 3040 break;
2735 } 3041 }
2736 case KVM_SET_CPUID2: { 3042 case KVM_SET_CPUID2: {
@@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2742 goto out; 3048 goto out;
2743 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 3049 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
2744 cpuid_arg->entries); 3050 cpuid_arg->entries);
2745 if (r)
2746 goto out;
2747 break; 3051 break;
2748 } 3052 }
2749 case KVM_GET_CPUID2: { 3053 case KVM_GET_CPUID2: {
@@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2875 } 3179 }
2876 case KVM_SET_XSAVE: { 3180 case KVM_SET_XSAVE: {
2877 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 3181 u.xsave = memdup_user(argp, sizeof(*u.xsave));
2878 if (IS_ERR(u.xsave)) { 3182 if (IS_ERR(u.xsave))
2879 r = PTR_ERR(u.xsave); 3183 return PTR_ERR(u.xsave);
2880 goto out;
2881 }
2882 3184
2883 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 3185 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2884 break; 3186 break;
@@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2900 } 3202 }
2901 case KVM_SET_XCRS: { 3203 case KVM_SET_XCRS: {
2902 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 3204 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
2903 if (IS_ERR(u.xcrs)) { 3205 if (IS_ERR(u.xcrs))
2904 r = PTR_ERR(u.xcrs); 3206 return PTR_ERR(u.xcrs);
2905 goto out;
2906 }
2907 3207
2908 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3208 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2909 break; 3209 break;
@@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2951 int ret; 3251 int ret;
2952 3252
2953 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 3253 if (addr > (unsigned int)(-3 * PAGE_SIZE))
2954 return -1; 3254 return -EINVAL;
2955 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 3255 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
2956 return ret; 3256 return ret;
2957} 3257}
@@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	switch (ioctl) {
 	case KVM_SET_TSS_ADDR:
 		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
-		if (r < 0)
-			goto out;
 		break;
 	case KVM_SET_IDENTITY_MAP_ADDR: {
 		u64 ident_addr;
@@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
 			goto out;
 		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
-		if (r < 0)
-			goto out;
 		break;
 	}
 	case KVM_SET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
-		if (r)
-			goto out;
 		break;
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 	get_irqchip_out:
 		kfree(chip);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_SET_IRQCHIP: {
@@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 	set_irqchip_out:
 		kfree(chip);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_GET_PIT: {
@@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (!kvm->arch.vpit)
 			goto out;
 		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_GET_PIT2: {
@@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (!kvm->arch.vpit)
 			goto out;
 		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (copy_from_user(&control, argp, sizeof(control)))
 			goto out;
 		r = kvm_vm_ioctl_reinject(kvm, &control);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_XEN_HVM_CONFIG: {
@@ -4273,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 			    u32 msr_index, u64 data)
 {
-	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+	struct msr_data msr;
+
+	msr.data = data;
+	msr.index = msr_index;
+	msr.host_initiated = false;
+	return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4495,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	 * instruction -> ...
 	 */
 	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-	if (!is_error_pfn(pfn)) {
+	if (!is_error_noslot_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return true;
 	}
@@ -4881,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask);
 }
 
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+	struct kvm *kvm;
+
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+	atomic_set(&kvm_guest_has_master_clock, 0);
+	raw_spin_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+			       void *priv)
+{
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+	struct timekeeper *tk = priv;
+
+	update_pvclock_gtod(tk);
+
+	/* disable master clock if host does not trust, or does not
+	 * use, TSC clocksource
+	 */
+	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+	    atomic_read(&kvm_guest_has_master_clock) != 0)
+		queue_work(system_long_wq, &pvclock_gtod_work);
+
+	return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+	.notifier_call = pvclock_gtod_notify,
+};
+#endif
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -4922,6 +5252,10 @@ int kvm_arch_init(void *opaque)
 	host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
 	kvm_lapic_init();
+#ifdef CONFIG_X86_64
+	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+#endif
+
 	return 0;
 
 out:
@@ -4936,6 +5270,9 @@ void kvm_arch_exit(void)
 	cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 				    CPUFREQ_TRANSITION_NOTIFIER);
 	unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+#ifdef CONFIG_X86_64
+	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+#endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
 }
@@ -5059,7 +5396,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 	char instruction[3];
@@ -5235,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_arch *ka = &kvm->arch;
+
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	kvm_make_mclock_inprogress_request(kvm);
+	/* no guest entries from this point */
+	pvclock_update_vm_gtod_copy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+	/* guest entries allowed */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5247,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
+		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+			kvm_gen_update_masterclock(vcpu->kvm);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -5362,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+							   native_read_tsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -5419,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			pr_debug("vcpu %d received sipi with vector # %x\n",
 				 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 			kvm_lapic_reset(vcpu);
-			r = kvm_arch_vcpu_reset(vcpu);
+			r = kvm_vcpu_reset(vcpu);
 			if (r)
 				return r;
 			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6047,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	r = kvm_arch_vcpu_reset(vcpu);
+	r = kvm_vcpu_reset(vcpu);
 	if (r == 0)
 		r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
@@ -6055,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	int r;
+	struct msr_data msr;
+
+	r = vcpu_load(vcpu);
+	if (r)
+		return r;
+	msr.data = 0x0;
+	msr.index = MSR_IA32_TSC;
+	msr.host_initiated = true;
+	kvm_write_tsc(vcpu, &msr);
+	vcpu_put(vcpu);
+
+	return r;
+}
+
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -6069,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6092,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_pmu_reset(vcpu);
 
+	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+	vcpu->arch.regs_avail = ~0;
+	vcpu->arch.regs_dirty = ~0;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -6168,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage)
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.tsc_offset_adjustment += delta_cyc;
 			vcpu->arch.last_host_tsc = local_tsc;
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+				&vcpu->requests);
 		}
 
 		/*
@@ -6258,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
 		goto fail_free_mce_banks;
 
+	r = fx_init(vcpu);
+	if (r)
+		goto fail_free_wbinvd_dirty_mask;
+
+	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
 	return 0;
+fail_free_wbinvd_dirty_mask:
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
@@ -6305,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }
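
Note: the x86.c hunks above replace bare (index, data) MSR arguments with a struct msr_data that also records whether the write was host-initiated (true in kvm_arch_vcpu_postcreate, false in emulator_set_msr). The following is a minimal, self-contained C sketch of that calling convention only; the struct layout and kvm_set_msr_stub() below are simplified stand-ins for illustration, not the kernel's definitions.

/* Sketch: package an MSR write the way the patched emulator_set_msr() does. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct msr_data {
	bool host_initiated;	/* true when the host (ioctl path) drives the write */
	uint32_t index;		/* MSR number, e.g. MSR_IA32_TSC */
	uint64_t data;		/* value to write */
};

/* stand-in for kvm_set_msr(vcpu, &msr); just reports what it was asked to do */
static int kvm_set_msr_stub(const struct msr_data *msr)
{
	printf("MSR 0x%x <- 0x%llx (host_initiated=%d)\n",
	       (unsigned)msr->index, (unsigned long long)msr->data,
	       (int)msr->host_initiated);
	return 0;
}

/* mirrors emulator_set_msr(): a guest-initiated write bundled into msr_data */
static int set_msr_from_guest(uint32_t msr_index, uint64_t data)
{
	struct msr_data msr = {
		.data = data,
		.index = msr_index,
		.host_initiated = false,
	};
	return kvm_set_msr_stub(&msr);
}

int main(void)
{
	return set_msr_from_guest(0x10 /* IA32_TSC number, for illustration */, 0);
}

Carrying host_initiated alongside the value is what lets kvm_write_tsc() distinguish host-driven TSC writes, such as the zeroing done in kvm_arch_vcpu_postcreate above, from writes performed by the guest.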
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2b5219c12ac8..e224f7a671b6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c373421a..205ad328aa52 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,7 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/pvclock.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void)
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
 }
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+
+static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+{
+	const struct pvclock_vsyscall_time_info *pvti_base;
+	int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
+	int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
+
+	BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
+
+	pvti_base = (struct pvclock_vsyscall_time_info *)
+		    __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
+
+	return &pvti_base[offset];
+}
+
+static notrace cycle_t vread_pvclock(int *mode)
+{
+	const struct pvclock_vsyscall_time_info *pvti;
+	cycle_t ret;
+	u64 last;
+	u32 version;
+	u32 migrate_count;
+	u8 flags;
+	unsigned cpu, cpu1;
+
+
+	/*
+	 * When looping to get a consistent (time-info, tsc) pair, we
+	 * also need to deal with the possibility we can switch vcpus,
+	 * so make sure we always re-fetch time-info for the current vcpu.
+	 */
+	do {
+		cpu = __getcpu() & VGETCPU_CPU_MASK;
+		/* TODO: We can put vcpu id into higher bits of pvti.version.
+		 * This will save a couple of cycles by getting rid of
+		 * __getcpu() calls (Gleb).
+		 */
+
+		pvti = get_pvti(cpu);
+
+		migrate_count = pvti->migrate_count;
+
+		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
+
+		/*
+		 * Test we're still on the cpu as well as the version.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
+		 */
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version ||
+			  pvti->migrate_count != migrate_count));
+
+	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
+		*mode = VCLOCK_NONE;
+
+	/* refer to tsc.c read_tsc() comment for rationale */
+	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	return last;
+}
+#endif
+
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 }
 
 
-notrace static inline u64 vgetsns(void)
+notrace static inline u64 vgetsns(int *mode)
 {
 	long v;
 	cycles_t cycles;
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void)
 		cycles = vread_tsc();
 	else if (gtod->clock.vclock_mode == VCLOCK_HPET)
 		cycles = vread_hpet();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
 	else
 		return 0;
 	v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->wall_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->monotonic_time_sec;
 		ns = gtod->monotonic_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	timespec_add_ns(ts, ns);
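
Note: vread_pvclock() in the hunk above retries until it has a (version, time-info) snapshot that was not concurrently updated and was taken on the vCPU it is still running on. Below is a toy, single-threaded C sketch of the version-retry discipline only; the struct and field names are invented for illustration, and the real loop additionally re-checks migrate_count and the vCPU id and relies on memory ordering this toy omits.

/* Sketch: accept a snapshot only if the version was even and unchanged. */
#include <stdint.h>
#include <stdio.h>

struct time_info {
	uint32_t version;	/* odd while a writer is mid-update */
	uint64_t payload;	/* stand-in for the pvclock scale/offset fields */
};

static uint64_t read_consistent(const struct time_info *ti)
{
	uint32_t version;
	uint64_t snapshot;

	do {
		version = ti->version;		/* remember the version we started with */
		snapshot = ti->payload;		/* read the data it guards */
	} while ((version & 1) || version != ti->version);

	return snapshot;
}

int main(void)
{
	struct time_info ti = { .version = 2, .payload = 42 };

	printf("payload = %llu\n", (unsigned long long)read_consistent(&ti));
	return 0;
}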
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 5463ad558573..2f94b039e55b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
 	unsigned int p;
 
-	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-	}
+	p = __getcpu();
+
 	if (cpu)
-		*cpu = p & 0xfff;
+		*cpu = p & VGETCPU_CPU_MASK;
 	if (node)
 		*node = p >> 12;
 	return 0;
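
Note: __vdso_getcpu() above decodes the value returned by __getcpu(): the CPU number sits in the low 12 bits and the node in the bits above. A small stand-alone C sketch of that decode follows, assuming VGETCPU_CPU_MASK equals the 0xfff literal it replaces in the hunk.

/* Sketch: split a packed getcpu value into cpu and node, as the vDSO does. */
#include <stdio.h>

#define VGETCPU_CPU_MASK 0xfff	/* assumed value, matching the old 0xfff literal */

static void decode_getcpu(unsigned int p, unsigned int *cpu, unsigned int *node)
{
	if (cpu)
		*cpu = p & VGETCPU_CPU_MASK;	/* low 12 bits: CPU number */
	if (node)
		*node = p >> 12;		/* remaining bits: NUMA node */
}

int main(void)
{
	unsigned int cpu, node;

	decode_getcpu((1u << 12) | 7, &cpu, &node);	/* node 1, cpu 7 */
	printf("cpu=%u node=%u\n", cpu, node);
	return 0;
}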