author     Linus Torvalds <torvalds@linux-foundation.org>   2012-03-28 17:35:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-03-28 17:35:31 -0400
commit     2e7580b0e75d771d93e24e681031a165b1d31071 (patch)
tree       d9449702609eeaab28913a43b5a4434667e09d43 /arch/x86/kvm
parent     d25413efa9536e2f425ea45c7720598035c597bc (diff)
parent     cf9eeac46350b8b43730b7dc5e999757bed089a4 (diff)
Merge branch 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Avi Kivity:
 "Changes include timekeeping improvements, support for assigning host
  PCI devices that share interrupt lines, s390 user-controlled guests, a
  large ppc update, and random fixes."

This is with the sign-off's fixed, hopefully next merge window we won't
have rebased commits.

* 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
  KVM: Convert intx_mask_lock to spin lock
  KVM: x86: fix kvm_write_tsc() TSC matching thinko
  x86: kvmclock: abstract save/restore sched_clock_state
  KVM: nVMX: Fix erroneous exception bitmap check
  KVM: Ignore the writes to MSR_K7_HWCR(3)
  KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask
  KVM: PMU: add proper support for fixed counter 2
  KVM: PMU: Fix raw event check
  KVM: PMU: warn when pin control is set in eventsel msr
  KVM: VMX: Fix delayed load of shared MSRs
  KVM: use correct tlbs dirty type in cmpxchg
  KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
  KVM: Ensure all vcpus are consistent with in-kernel irqchip settings
  KVM: x86 emulator: Allow PM/VM86 switch during task switch
  KVM: SVM: Fix CPL updates
  KVM: x86 emulator: VM86 segments must have DPL 3
  KVM: x86 emulator: Fix task switch privilege checks
  arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice
  KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation
  KVM: mmu_notifier: Flush TLBs before releasing mmu_lock
  ...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--   arch/x86/kvm/cpuid.c     |   2
-rw-r--r--   arch/x86/kvm/cpuid.h     |   8
-rw-r--r--   arch/x86/kvm/emulate.c   | 112
-rw-r--r--   arch/x86/kvm/i8259.c     |   1
-rw-r--r--   arch/x86/kvm/lapic.c     |   4
-rw-r--r--   arch/x86/kvm/mmu.c       |  85
-rw-r--r--   arch/x86/kvm/mmu_audit.c |   4
-rw-r--r--   arch/x86/kvm/pmu.c       |  10
-rw-r--r--   arch/x86/kvm/svm.c       | 119
-rw-r--r--   arch/x86/kvm/vmx.c       |  53
-rw-r--r--   arch/x86/kvm/x86.c       | 403
11 files changed, 595 insertions, 206 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 89b02bfaaca5..9fed5bedaad6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 const u32 kvm_supported_word6_x86_features =
 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
 /* cpuid 0xC0000001.edx */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 5b97e1797a6d..26d1fb437eb5 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
+{
+struct kvm_cpuid_entry2 *best;
+
+best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+return best && (best->ecx & bit(X86_FEATURE_OSVW));
+}
+
 #endif
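
The guest_cpuid_has_osvw() helper added above lets KVM expose AMD's OSVW (OS Visible Workaround) MSRs only to guests whose CPUID leaf 0x80000001 ECX advertises the feature. The actual call sites live outside this diffstat-limited view, so the following is only an illustrative sketch of such a gate, reusing the vcpu->arch.osvw fields that the svm.c hunks further down populate; the function name is hypothetical:

    /* Illustrative sketch only - not the actual x86.c MSR handler. */
    static int example_get_osvw_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
    {
        if (!guest_cpuid_has_osvw(vcpu))
            return 1;   /* feature not exposed to this guest -> fault */

        switch (msr) {
        case MSR_AMD64_OSVW_ID_LENGTH:
            *data = vcpu->arch.osvw.length;
            return 0;
        case MSR_AMD64_OSVW_STATUS:
            *data = vcpu->arch.osvw.status;
            return 0;
        default:
            return 1;
        }
    }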
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0982507b962a..83756223f8aa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -57,6 +57,7 @@
 #define OpDS 23ull /* DS */
 #define OpFS 24ull /* FS */
 #define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
 
 #define OpBits 5 /* Width of operand field */
 #define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +102,7 @@
 #define SrcAcc (OpAcc << SrcShift)
 #define SrcImmU16 (OpImmU16 << SrcShift)
 #define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
 #define SrcMask (OpMask << SrcShift)
 #define BitOp (1<<11)
 #define MemAbs (1<<12) /* Memory operand is absolute displacement */
@@ -858,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 }
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
-struct operand *op,
-int inhibit_bytereg)
+struct operand *op)
 {
 unsigned reg = ctxt->modrm_reg;
 int highbyte_regs = ctxt->rex_prefix == 0;
@@ -876,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 }
 
 op->type = OP_REG;
-if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+if (ctxt->d & ByteOp) {
 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 op->bytes = 1;
 } else {
@@ -1151,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 return 1;
 }
 
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+u16 index, struct desc_struct *desc)
+{
+struct desc_ptr dt;
+ulong addr;
+
+ctxt->ops->get_idt(ctxt, &dt);
+
+if (dt.size < index * 8 + 7)
+return emulate_gp(ctxt, index << 3 | 0x2);
+
+addr = dt.address + index * 8;
+return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+&ctxt->exception);
+}
+
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 u16 selector, struct desc_ptr *dt)
 {
@@ -1227,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 seg_desc.type = 3;
 seg_desc.p = 1;
 seg_desc.s = 1;
+if (ctxt->mode == X86EMUL_MODE_VM86)
+seg_desc.dpl = 3;
 goto load;
 }
 
@@ -1891,6 +1910,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 ss->p = 1;
 }
 
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+u32 eax, ebx, ecx, edx;
+
+eax = ecx = 0;
+return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
+&& ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 {
 struct x86_emulate_ops *ops = ctxt->ops;
@@ -2007,6 +2037,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 if (ctxt->mode == X86EMUL_MODE_REAL)
 return emulate_gp(ctxt, 0);
 
+/*
+ * Not recognized on AMD in compat mode (but is recognized in legacy
+ * mode).
+ */
+if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
+&& !vendor_intel(ctxt))
+return emulate_ud(ctxt);
+
 /* XXX sysenter/sysexit have not been tested in 64bit mode.
 * Therefore, we inject an #UD.
 */
@@ -2306,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 return emulate_gp(ctxt, 0);
 ctxt->_eip = tss->eip;
 ctxt->eflags = tss->eflags | 2;
+
+/* General purpose registers */
 ctxt->regs[VCPU_REGS_RAX] = tss->eax;
 ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
 ctxt->regs[VCPU_REGS_RDX] = tss->edx;
@@ -2328,6 +2368,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
 
 /*
+ * If we're switching between Protected Mode and VM86, we need to make
+ * sure to update the mode before loading the segment descriptors so
+ * that the selectors are interpreted correctly.
+ *
+ * Need to get rflags to the vcpu struct immediately because it
+ * influences the CPL which is checked at least when loading the segment
+ * descriptors and when pushing an error code to the new kernel stack.
+ *
+ * TODO Introduce a separate ctxt->ops->set_cpl callback
+ */
+if (ctxt->eflags & X86_EFLAGS_VM)
+ctxt->mode = X86EMUL_MODE_VM86;
+else
+ctxt->mode = X86EMUL_MODE_PROT32;
+
+ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+
+/*
 * Now load segment descriptors. If fault happenes at this stage
 * it is handled in a context of new task
 */
@@ -2401,7 +2459,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-u16 tss_selector, int reason,
+u16 tss_selector, int idt_index, int reason,
 bool has_error_code, u32 error_code)
 {
 struct x86_emulate_ops *ops = ctxt->ops;
@@ -2423,12 +2481,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 /* FIXME: check that next_tss_desc is tss */
 
-if (reason != TASK_SWITCH_IRET) {
-if ((tss_selector & 3) > next_tss_desc.dpl ||
-ops->cpl(ctxt) > next_tss_desc.dpl)
-return emulate_gp(ctxt, 0);
+/*
+ * Check privileges. The three cases are task switch caused by...
+ *
+ * 1. jmp/call/int to task gate: Check against DPL of the task gate
+ * 2. Exception/IRQ/iret: No check is performed
+ * 3. jmp/call to TSS: Check agains DPL of the TSS
+ */
+if (reason == TASK_SWITCH_GATE) {
+if (idt_index != -1) {
+/* Software interrupts */
+struct desc_struct task_gate_desc;
+int dpl;
+
+ret = read_interrupt_descriptor(ctxt, idt_index,
+&task_gate_desc);
+if (ret != X86EMUL_CONTINUE)
+return ret;
+
+dpl = task_gate_desc.dpl;
+if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+}
+} else if (reason != TASK_SWITCH_IRET) {
+int dpl = next_tss_desc.dpl;
+if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+return emulate_gp(ctxt, tss_selector);
 }
 
+
 desc_limit = desc_limit_scaled(&next_tss_desc);
 if (!next_tss_desc.p ||
 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2481,7 +2562,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-u16 tss_selector, int reason,
+u16 tss_selector, int idt_index, int reason,
 bool has_error_code, u32 error_code)
 {
 int rc;
@@ -2489,7 +2570,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 ctxt->_eip = ctxt->eip;
 ctxt->dst.type = OP_NONE;
 
-rc = emulator_do_task_switch(ctxt, tss_selector, reason,
+rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
 has_error_code, error_code);
 
 if (rc == X86EMUL_CONTINUE)
@@ -3514,13 +3595,13 @@ static struct opcode twobyte_table[256] = {
 I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
-D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 /* 0xB8 - 0xBF */
 N, N,
 G(BitOp, group8),
 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
-D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 /* 0xC0 - 0xCF */
 D2bv(DstMem | SrcReg | ModRM | Lock),
 N, D(DstMem | SrcReg | ModRM | Mov),
@@ -3602,9 +3683,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
 switch (d) {
 case OpReg:
-decode_register_operand(ctxt, op,
-op == &ctxt->dst &&
-ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+decode_register_operand(ctxt, op);
 break;
 case OpImmUByte:
 rc = decode_imm(ctxt, op, 1, false);
@@ -3656,6 +3735,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 case OpImm:
 rc = decode_imm(ctxt, op, imm_size(ctxt), true);
 break;
+case OpMem8:
+ctxt->memop.bytes = 1;
+goto mem_common;
 case OpMem16:
 ctxt->memop.bytes = 2;
 goto mem_common;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index b6a73537e1ef..81cf4fa4a2be 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 if (val & 0x10) {
 s->init4 = val & 1;
 s->last_irr = 0;
+s->irr &= s->elcr;
 s->imr = 0;
 s->priority_add = 0;
 s->special_mask = 0;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 31bfc6927bc0..858432287ab6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 break;
 
 case APIC_DM_INIT:
-if (level) {
+if (!trig_mode || level) {
 result = 1;
 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 u64 ns = 0;
 struct kvm_vcpu *vcpu = apic->vcpu;
-unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 unsigned long flags;
 
 if (unlikely(!tscdeadline || !this_tsc_khz))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 224b02c3cda9..4cb164268846 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -688,9 +688,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 {
 unsigned long idx;
 
-idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-return &slot->lpage_info[level - 2][idx];
+idx = gfn_to_index(gfn, slot->base_gfn, level);
+return &slot->arch.lpage_info[level - 2][idx];
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -946,7 +945,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 }
 }
 
-static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 struct kvm_memory_slot *slot)
 {
 struct kvm_lpage_info *linfo;
@@ -966,7 +965,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 struct kvm_memory_slot *slot;
 
 slot = gfn_to_memslot(kvm, gfn);
-return __gfn_to_rmap(kvm, gfn, level, slot);
+return __gfn_to_rmap(gfn, level, slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -988,7 +987,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
 {
 return pte_list_next(rmapp, spte);
 }
@@ -1018,8 +1017,8 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 u64 *spte;
 int i, write_protected = 0;
 
-rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
-spte = rmap_next(kvm, rmapp, NULL);
+rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 BUG_ON(!(*spte & PT_PRESENT_MASK));
 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
@@ -1027,14 +1026,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 write_protected = 1;
 }
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 
 /* check for huge page mappings */
 for (i = PT_DIRECTORY_LEVEL;
 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
-spte = rmap_next(kvm, rmapp, NULL);
+rmapp = __gfn_to_rmap(gfn, i, slot);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 BUG_ON(!(*spte & PT_PRESENT_MASK));
 BUG_ON(!is_large_pte(*spte));
@@ -1045,7 +1044,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 spte = NULL;
 write_protected = 1;
 }
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 }
 
@@ -1066,7 +1065,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 u64 *spte;
 int need_tlb_flush = 0;
 
-while ((spte = rmap_next(kvm, rmapp, NULL))) {
+while ((spte = rmap_next(rmapp, NULL))) {
 BUG_ON(!(*spte & PT_PRESENT_MASK));
 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 drop_spte(kvm, spte);
@@ -1085,14 +1084,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 WARN_ON(pte_huge(*ptep));
 new_pfn = pte_pfn(*ptep);
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 BUG_ON(!is_shadow_present_pte(*spte));
 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 need_flush = 1;
 if (pte_write(*ptep)) {
 drop_spte(kvm, spte);
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 } else {
 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1102,7 +1101,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 new_spte &= ~shadow_accessed_mask;
 mmu_spte_clear_track_bits(spte);
 mmu_spte_set(spte, new_spte);
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 }
 if (need_flush)
@@ -1176,7 +1175,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 if (!shadow_accessed_mask)
 return kvm_unmap_rmapp(kvm, rmapp, data);
 
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 int _young;
 u64 _spte = *spte;
@@ -1186,7 +1185,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 young = 1;
 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 }
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 return young;
 }
@@ -1205,7 +1204,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 if (!shadow_accessed_mask)
 goto out;
 
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 u64 _spte = *spte;
 BUG_ON(!(_spte & PT_PRESENT_MASK));
@@ -1214,7 +1213,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 young = 1;
 break;
 }
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 out:
 return young;
@@ -1391,11 +1390,6 @@ struct kvm_mmu_pages {
 unsigned int nr;
 };
 
-#define for_each_unsync_children(bitmap, idx) \
-for (idx = find_first_bit(bitmap, 512); \
-idx < 512; \
-idx = find_next_bit(bitmap, 512, idx+1))
-
 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
 int idx)
 {
@@ -1417,7 +1411,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 {
 int i, ret, nr_unsync_leaf = 0;
 
-for_each_unsync_children(sp->unsync_child_bitmap, i) {
+for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
 struct kvm_mmu_page *child;
 u64 ent = sp->spt[i];
 
@@ -1803,6 +1797,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 if (is_large_pte(*sptep)) {
 drop_spte(vcpu->kvm, sptep);
+--vcpu->kvm->stat.lpages;
 kvm_flush_remote_tlbs(vcpu->kvm);
 }
 }
@@ -3190,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 #undef PTTYPE
 
 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
-struct kvm_mmu *context,
-int level)
+struct kvm_mmu *context)
 {
 int maxphyaddr = cpuid_maxphyaddr(vcpu);
 u64 exb_bit_rsvd = 0;
 
 if (!context->nx)
 exb_bit_rsvd = rsvd_bits(63, 63);
-switch (level) {
+switch (context->root_level) {
 case PT32_ROOT_LEVEL:
 /* no rsvd bits for 2 level 4K page table entries */
 context->rsvd_bits_mask[0][1] = 0;
@@ -3256,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 int level)
 {
 context->nx = is_nx(vcpu);
+context->root_level = level;
 
-reset_rsvds_bits_mask(vcpu, context, level);
+reset_rsvds_bits_mask(vcpu, context);
 
 ASSERT(is_pae(vcpu));
 context->new_cr3 = paging_new_cr3;
@@ -3267,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 context->invlpg = paging64_invlpg;
 context->update_pte = paging64_update_pte;
 context->free = paging_free;
-context->root_level = level;
 context->shadow_root_level = level;
 context->root_hpa = INVALID_PAGE;
 context->direct_map = false;
@@ -3284,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 struct kvm_mmu *context)
 {
 context->nx = false;
+context->root_level = PT32_ROOT_LEVEL;
 
-reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+reset_rsvds_bits_mask(vcpu, context);
 
 context->new_cr3 = paging_new_cr3;
 context->page_fault = paging32_page_fault;
@@ -3294,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 context->sync_page = paging32_sync_page;
 context->invlpg = paging32_invlpg;
 context->update_pte = paging32_update_pte;
-context->root_level = PT32_ROOT_LEVEL;
 context->shadow_root_level = PT32E_ROOT_LEVEL;
 context->root_hpa = INVALID_PAGE;
 context->direct_map = false;
@@ -3325,7 +3319,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 context->get_cr3 = get_cr3;
 context->get_pdptr = kvm_pdptr_read;
 context->inject_page_fault = kvm_inject_page_fault;
-context->nx = is_nx(vcpu);
 
 if (!is_paging(vcpu)) {
 context->nx = false;
@@ -3333,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 context->root_level = 0;
 } else if (is_long_mode(vcpu)) {
 context->nx = is_nx(vcpu);
-reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
-context->gva_to_gpa = paging64_gva_to_gpa;
 context->root_level = PT64_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, context);
+context->gva_to_gpa = paging64_gva_to_gpa;
 } else if (is_pae(vcpu)) {
 context->nx = is_nx(vcpu);
-reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
-context->gva_to_gpa = paging64_gva_to_gpa;
 context->root_level = PT32E_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, context);
+context->gva_to_gpa = paging64_gva_to_gpa;
 } else {
 context->nx = false;
-reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
-context->gva_to_gpa = paging32_gva_to_gpa;
 context->root_level = PT32_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, context);
+context->gva_to_gpa = paging32_gva_to_gpa;
 }
 
 return 0;
@@ -3408,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 } else if (is_long_mode(vcpu)) {
 g_context->nx = is_nx(vcpu);
-reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
 g_context->root_level = PT64_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 } else if (is_pae(vcpu)) {
 g_context->nx = is_nx(vcpu);
-reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
 g_context->root_level = PT32E_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 } else {
 g_context->nx = false;
-reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
 g_context->root_level = PT32_ROOT_LEVEL;
+reset_rsvds_bits_mask(vcpu, g_context);
 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 }
 
@@ -3555,7 +3548,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 * If we're seeing too many writes to a page, it may no longer be a page table,
 * or we may be forking, in which case it is better to unmap the page.
 */
-static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
 {
 /*
 * Skip write-flooding detected for the sp whose level is 1, because
@@ -3664,10 +3657,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-spte = get_written_sptes(sp, gpa, &npte);
-
 if (detect_write_misaligned(sp, gpa, bytes) ||
-detect_write_flooding(sp, spte)) {
+detect_write_flooding(sp)) {
 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 &invalid_list);
 ++vcpu->kvm->stat.mmu_flooded;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ea7b4fd34676..715da5a19a5b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 slot = gfn_to_memslot(kvm, sp->gfn);
 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-spte = rmap_next(kvm, rmapp, NULL);
+spte = rmap_next(rmapp, NULL);
 while (spte) {
 if (is_writable_pte(*spte))
 audit_printk(kvm, "shadow page has writable "
 "mappings: gfn %llx role %x\n",
 sp->gfn, sp->role.word);
-spte = rmap_next(kvm, rmapp, spte);
+spte = rmap_next(rmapp, spte);
 }
 }
 
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7aad5446f393..a73f0c104813 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping {
 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+[7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
 };
 
 /* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 2};
+int fixed_pmc_events[] = {1, 0, 7};
 
 static bool pmc_is_gp(struct kvm_pmc *pmc)
 {
@@ -210,6 +211,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 unsigned config, type = PERF_TYPE_RAW;
 u8 event_select, unit_mask;
 
+if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+printk_once("kvm pmu: pin control bit is ignored\n");
+
 pmc->eventsel = eventsel;
 
 stop_counter(pmc);
@@ -220,7 +224,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
 unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
 
-if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
 ARCH_PERFMON_EVENTSEL_INV |
 ARCH_PERFMON_EVENTSEL_CMASK))) {
 config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
@@ -413,7 +417,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
 struct kvm_pmc *counters;
 u64 ctr;
 
-pmc &= (3u << 30) - 1;
+pmc &= ~(3u << 30);
 if (!fixed && pmc >= pmu->nr_arch_gp_counters)
 return 1;
 if (fixed && pmc >= pmu->nr_arch_fixed_counters)
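
A quick arithmetic note on the last pmu.c hunk: the old mask, (3u << 30) - 1, evaluates to 0xbfffffff, so it clears only bit 30 of the RDPMC index and leaves bit 31 in place, while ~(3u << 30) is 0x3fffffff and strips both of the top two flag bits before the value is compared against the counter limits. A standalone sketch of the two expressions (the example index value is arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t pmc = (1u << 31) | (1u << 30) | 2;   /* arbitrary example index */

        /* old mask keeps bit 31: result 0x80000002 */
        printf("old mask %#010x -> %#x\n", (3u << 30) - 1, pmc & ((3u << 30) - 1));
        /* new mask strips both top bits: result 0x2 */
        printf("new mask %#010x -> %#x\n", ~(3u << 30), pmc & ~(3u << 30));
        return 0;
    }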
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e385214711cb..e334389e1c75 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -111,6 +111,12 @@ struct nested_state {
 #define MSRPM_OFFSETS 16
 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
 struct vcpu_svm {
 struct kvm_vcpu vcpu;
 struct vmcb *vmcb;
@@ -177,11 +183,13 @@ static bool npt_enabled = true;
 #else
 static bool npt_enabled;
 #endif
-static int npt = 1;
 
+/* allow nested paging (virtualized MMU) for all guests */
+static int npt = true;
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
 module_param(nested, int, S_IRUGO);
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -557,6 +565,27 @@ static void svm_init_erratum_383(void)
 erratum_383_found = true;
 }
 
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+/*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+/*
+ * By increasing VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+vcpu->arch.osvw.status |= 1;
+}
+
 static int has_svm(void)
 {
 const char *msg;
@@ -623,6 +652,36 @@ static int svm_hardware_enable(void *garbage)
 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
 }
 
+
+/*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If bits are not the same
+ * on different processors then choose the worst case (i.e. if erratum
+ * is present on one processor and not on another then assume that the
+ * erratum is present everywhere).
+ */
+if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+uint64_t len, status = 0;
+int err;
+
+len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+if (!err)
+status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+&err);
+
+if (err)
+osvw_status = osvw_len = 0;
+else {
+if (len < osvw_len)
+osvw_len = len;
+osvw_status |= status;
+osvw_status &= (1ULL << osvw_len) - 1;
+}
+} else
+osvw_status = osvw_len = 0;
+
 svm_init_erratum_383();
 
 amd_pmu_enable_virt();
@@ -910,20 +969,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 return _tsc;
 }
 
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
 struct vcpu_svm *svm = to_svm(vcpu);
 u64 ratio;
 u64 khz;
 
-/* TSC scaling supported? */
-if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+/* Guest TSC same frequency as host TSC? */
+if (!scale) {
+svm->tsc_ratio = TSC_RATIO_DEFAULT;
 return;
+}
 
-/* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
-if (user_tsc_khz == 0) {
-vcpu->arch.virtual_tsc_khz = 0;
-svm->tsc_ratio = TSC_RATIO_DEFAULT;
+/* TSC scaling supported? */
+if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+if (user_tsc_khz > tsc_khz) {
+vcpu->arch.tsc_catchup = 1;
+vcpu->arch.tsc_always_catchup = 1;
+} else
+WARN(1, "user requested TSC rate below hardware speed\n");
 return;
 }
 
@@ -938,7 +1002,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 user_tsc_khz);
 return;
 }
-vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 svm->tsc_ratio = ratio;
 }
 
@@ -958,10 +1021,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 struct vcpu_svm *svm = to_svm(vcpu);
 
+WARN_ON(adjustment < 0);
+if (host)
+adjustment = svm_scale_tsc(vcpu, adjustment);
+
 svm->vmcb->control.tsc_offset += adjustment;
 if (is_guest_mode(vcpu))
 svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1191,6 +1258,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 if (kvm_vcpu_is_bsp(&svm->vcpu))
 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
+svm_init_osvw(&svm->vcpu);
+
 return &svm->vcpu;
 
 free_page4:
@@ -1268,6 +1337,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 
+static void svm_update_cpl(struct kvm_vcpu *vcpu)
+{
+struct vcpu_svm *svm = to_svm(vcpu);
+int cpl;
+
+if (!is_protmode(vcpu))
+cpl = 0;
+else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
+cpl = 3;
+else
+cpl = svm->vmcb->save.cs.selector & 0x3;
+
+svm->vmcb->save.cpl = cpl;
+}
+
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
 return to_svm(vcpu)->vmcb->save.rflags;
@@ -1275,7 +1359,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
+
 to_svm(vcpu)->vmcb->save.rflags = rflags;
+if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
+svm_update_cpl(vcpu);
 }
 
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1543,9 +1631,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 }
 if (seg == VCPU_SREG_CS)
-svm->vmcb->save.cpl
-= (svm->vmcb->save.cs.attrib
->> SVM_SELECTOR_DPL_SHIFT) & 3;
+svm_update_cpl(vcpu);
 
 mark_dirty(svm->vmcb, VMCB_SEG);
 }
@@ -2735,7 +2821,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 skip_emulated_instruction(&svm->vcpu);
 
-if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+int_vec = -1;
+
+if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
 has_error_code, error_code) == EMULATE_FAIL) {
 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
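
The svm_hardware_enable() hunk above merges the per-CPU OSVW MSRs into a single worst-case view: the advertised length shrinks to the shortest one seen and the status bits are OR-ed together, so an erratum flagged on any package is reported to every guest. A small self-contained sketch of that merge, with made-up per-CPU values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t osvw_len = 4, osvw_status = 0;    /* defaults from the patch */
        uint64_t cpu_len[]    = { 4, 3 };          /* hypothetical per-CPU MSR reads */
        uint64_t cpu_status[] = { 0x2, 0x5 };

        for (int i = 0; i < 2; i++) {
            if (cpu_len[i] < osvw_len)
                osvw_len = cpu_len[i];
            osvw_status |= cpu_status[i];
            osvw_status &= (1ULL << osvw_len) - 1; /* drop bits beyond the valid length */
        }
        /* prints: merged len=3 status=0x7 */
        printf("merged len=%llu status=%#llx\n",
               (unsigned long long)osvw_len, (unsigned long long)osvw_status);
        return 0;
    }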
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 246490f643b6..280751c84724 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
-static bool __read_mostly yield_on_hlt = 1;
-module_param(yield_on_hlt, bool, S_IRUGO);
-
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
@@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 vmx_set_interrupt_shadow(vcpu, 0);
 }
 
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
-/* Ensure that we clear the HLT state in the VMCS. We don't need to
- * explicitly skip the instruction because if the HLT state is set, then
- * the instruction is already executing and RIP has already been
- * advanced. */
-if (!yield_on_hlt &&
-vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
-vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
 /*
 * KVM wants to inject page-faults which it got to the guest. This function
 * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -1678,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-if (!(vmcs12->exception_bitmap & PF_VECTOR))
+if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
 return 0;
 
 nested_vmx_vmexit(vcpu);
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -1817,13 +1802,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
- * ioctl. In this case the call-back should update internal vmx state to make
- * the changes effective.
+ * Engage any workarounds for mis-matched TSC rates. Currently limited to
+ * software catchup for faster rates on slower CPUs.
 */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
-/* Nothing to do here */
+if (!scale)
+return;
+
+if (user_tsc_khz > tsc_khz) {
+vcpu->arch.tsc_catchup = 1;
+vcpu->arch.tsc_always_catchup = 1;
+} else
+WARN(1, "user requested TSC rate below hardware speed\n");
 }
 
 /*
@@ -1850,7 +1841,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 }
 }
 
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 u64 offset = vmcs_read64(TSC_OFFSET);
 vmcs_write64(TSC_OFFSET, offset + adjustment);
@@ -2219,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 msr = find_msr_entry(vmx, msr_index);
 if (msr) {
 msr->data = data;
+if (msr - vmx->guest_msrs < vmx->save_nmsrs)
+kvm_set_shared_msr(msr->index, msr->data,
+msr->mask);
 break;
 }
 ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -2399,7 +2393,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 &_pin_based_exec_control) < 0)
 return -EIO;
 
-min =
+min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 CPU_BASED_CR8_LOAD_EXITING |
 CPU_BASED_CR8_STORE_EXITING |
@@ -2414,9 +2408,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 CPU_BASED_INVLPG_EXITING |
 CPU_BASED_RDPMC_EXITING;
 
-if (yield_on_hlt)
-min |= CPU_BASED_HLT_EXITING;
-
 opt = CPU_BASED_TPR_SHADOW |
 CPU_BASED_USE_MSR_BITMAPS |
 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -4003,7 +3994,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 } else
 intr |= INTR_TYPE_EXT_INTR;
 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4035,7 +4025,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 }
 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -4672,9 +4661,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 bool has_error_code = false;
 u32 error_code = 0;
 u16 tss_selector;
-int reason, type, idt_v;
+int reason, type, idt_v, idt_index;
 
 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 
 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4712,8 +4702,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 type != INTR_TYPE_NMI_INTR))
 skip_emulated_instruction(vcpu);
 
-if (kvm_task_switch(vcpu, tss_selector, reason,
-has_error_code, error_code) == EMULATE_FAIL) {
+if (kvm_task_switch(vcpu, tss_selector,
+type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+has_error_code, error_code) == EMULATE_FAIL) {
 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 vcpu->run->internal.ndata = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54696b5f8443..4044ce0bf7c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
97u32 kvm_max_guest_tsc_khz; 97u32 kvm_max_guest_tsc_khz;
98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); 98EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
99 99
100/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
101static u32 tsc_tolerance_ppm = 250;
102module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
103
100#define KVM_NR_SHARED_MSRS 16 104#define KVM_NR_SHARED_MSRS 16
101 105
102struct kvm_shared_msrs_global { 106struct kvm_shared_msrs_global {
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void)
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
 
-static inline int kvm_tsc_changes_freq(void)
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
-	int cpu = get_cpu();
-	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
-		  cpufreq_quick_get(cpu) != 0;
-	put_cpu();
-	return ret;
+	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+				   vcpu->arch.virtual_tsc_shift);
 }
 
-u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
 {
-	if (vcpu->arch.virtual_tsc_khz)
-		return vcpu->arch.virtual_tsc_khz;
-	else
-		return __this_cpu_read(cpu_tsc_khz);
+	u64 v = (u64)khz * (1000000 + ppm);
+	do_div(v, 1000000);
+	return v;
 }
 
-static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 {
-	u64 ret;
-
-	WARN_ON(preemptible());
-	if (kvm_tsc_changes_freq())
-		printk_once(KERN_WARNING
-		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
-	ret = nsec * vcpu_tsc_khz(vcpu);
-	do_div(ret, USEC_PER_SEC);
-	return ret;
-}
+	u32 thresh_lo, thresh_hi;
+	int use_scaling = 0;
 
-static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
-{
 	/* Compute a scale to convert nanoseconds in TSC cycles */
 	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
-			   &vcpu->arch.tsc_catchup_shift,
-			   &vcpu->arch.tsc_catchup_mult);
+			   &vcpu->arch.virtual_tsc_shift,
+			   &vcpu->arch.virtual_tsc_mult);
+	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+
+	/*
+	 * Compute the variation in TSC rate which is acceptable
+	 * within the range of tolerance and decide if the
+	 * rate being applied is within that bounds of the hardware
+	 * rate.  If so, no scaling or compensation need be done.
+	 */
+	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
+		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+		use_scaling = 1;
+	}
+	kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
 }
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 {
-	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
-				      vcpu->arch.tsc_catchup_mult,
-				      vcpu->arch.tsc_catchup_shift);
-	tsc += vcpu->arch.last_tsc_write;
+	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+				      vcpu->arch.virtual_tsc_mult,
+				      vcpu->arch.virtual_tsc_shift);
+	tsc += vcpu->arch.this_tsc_write;
 	return tsc;
 }
 
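To make the tolerance arithmetic concrete, here is a small user-space sketch of the same calculation, separate from the patch; the host frequency is invented and the helper simply mirrors adjust_tsc_khz() above:

#include <stdint.h>
#include <stdio.h>

/* Scale a kHz value by +/- ppm parts per million, as kvm_set_tsc_khz() does. */
static uint32_t adjust_tsc_khz(uint32_t khz, int32_t ppm)
{
	return (uint64_t)khz * (1000000 + ppm) / 1000000;
}

int main(void)
{
	uint32_t host_tsc_khz = 2800000;   /* assumed 2.8 GHz host TSC         */
	uint32_t tolerance_ppm = 250;      /* the module parameter's default   */

	uint32_t lo = adjust_tsc_khz(host_tsc_khz, -(int32_t)tolerance_ppm);
	uint32_t hi = adjust_tsc_khz(host_tsc_khz, tolerance_ppm);

	/* A requested guest rate outside [lo, hi] would turn on scaling. */
	printf("acceptable range: [%u, %u] kHz\n", lo, hi);
	return 0;
}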
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
-	s64 sdiff;
+	s64 usdiff;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 	ns = get_kernel_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
-	sdiff = data - kvm->arch.last_tsc_write;
-	if (sdiff < 0)
-		sdiff = -sdiff;
+
+	/* n.b - signed multiplication and division required */
+	usdiff = data - kvm->arch.last_tsc_write;
+#ifdef CONFIG_X86_64
+	usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+#else
+	/* do_div() only does unsigned */
+	asm("idivl %2; xor %%edx, %%edx"
+	    : "=A"(usdiff)
+	    : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+#endif
+	do_div(elapsed, 1000);
+	usdiff -= elapsed;
+	if (usdiff < 0)
+		usdiff = -usdiff;
 
 	/*
-	 * Special case: close write to TSC within 5 seconds of
-	 * another CPU is interpreted as an attempt to synchronize
-	 * The 5 seconds is to accommodate host load / swapping as
-	 * well as any reset of TSC during the boot process.
-	 *
-	 * In that case, for a reliable TSC, we can match TSC offsets,
-	 * or make a best guest using elapsed value.
-	 */
-	if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
-	    elapsed < 5ULL * NSEC_PER_SEC) {
+	 * Special case: TSC write with a small delta (1 second) of virtual
+	 * cycle time against real time is interpreted as an attempt to
+	 * synchronize the CPU.
+	 *
+	 * For a reliable TSC, we can match TSC offsets, and for an unstable
+	 * TSC, we add elapsed time in this computation.  We could let the
+	 * compensation code attempt to catch up if we fall behind, but
+	 * it's better to try to match offsets from the beginning.
+	 */
+	if (usdiff < USEC_PER_SEC &&
+	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
-			offset = kvm->arch.last_tsc_offset;
+			offset = kvm->arch.cur_tsc_offset;
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
 		} else {
 			u64 delta = nsec_to_cycles(vcpu, elapsed);
-			offset += delta;
+			data += delta;
+			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
-		ns = kvm->arch.last_tsc_nsec;
+	} else {
+		/*
+		 * We split periods of matched TSC writes into generations.
+		 * For each generation, we track the original measured
+		 * nanosecond time, offset, and write, so if TSCs are in
+		 * sync, we can match exact offset, and if not, we can match
+		 * exact software computation in compute_guest_tsc()
+		 *
+		 * These values are tracked in kvm->arch.cur_xxx variables.
+		 */
+		kvm->arch.cur_tsc_generation++;
+		kvm->arch.cur_tsc_nsec = ns;
+		kvm->arch.cur_tsc_write = data;
+		kvm->arch.cur_tsc_offset = offset;
+		pr_debug("kvm: new tsc generation %u, clock %llu\n",
+			 kvm->arch.cur_tsc_generation, data);
 	}
+
+	/*
+	 * We also track the most recent recorded KHZ, write and time to
+	 * allow the matching interval to be extended at each write.
+	 */
 	kvm->arch.last_tsc_nsec = ns;
 	kvm->arch.last_tsc_write = data;
-	kvm->arch.last_tsc_offset = offset;
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
-	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
 
 	/* Reset of TSC must disable overshoot protection below */
 	vcpu->arch.hv_clock.tsc_timestamp = 0;
-	vcpu->arch.last_tsc_write = data;
-	vcpu->arch.last_tsc_nsec = ns;
+	vcpu->arch.last_guest_tsc = data;
+
+	/* Keep track of which generation this VCPU has synchronized to */
+	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 }
+
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
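A rough standalone model of the matching test above, separate from the patch: convert the cycle delta to microseconds at the virtual TSC rate, subtract the elapsed host time, and treat anything under one second as an attempt to synchronize. All numbers below are invented.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t virtual_tsc_khz = 2000000;   /* 2 GHz guest TSC              */
	int64_t cycle_delta = 150000000;      /* new write - last write       */
	int64_t elapsed_ns = 60000000;        /* host ns since the last write */

	int64_t usdiff = cycle_delta * 1000 / (int64_t)virtual_tsc_khz;
	usdiff -= elapsed_ns / 1000;          /* both sides now in us         */
	if (usdiff < 0)
		usdiff = -usdiff;

	if (usdiff < 1000000)                 /* USEC_PER_SEC                 */
		printf("within 1s: reuse the current TSC generation\n");
	else
		printf("too far apart: start a new TSC generation\n");
	return 0;
}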
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	local_irq_save(flags);
 	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
 	kernel_ns = get_kernel_ns();
-	this_tsc_khz = vcpu_tsc_khz(v);
+	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	if (vcpu->tsc_catchup) {
 		u64 tsc = compute_guest_tsc(v, kernel_ns);
 		if (tsc > tsc_timestamp) {
-			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
 			tsc_timestamp = tsc;
 		}
 	}
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 * observed by the guest and ensure the new system time is greater.
 	 */
 	max_kernel_ns = 0;
-	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+	if (vcpu->hv_clock.tsc_timestamp) {
 		max_kernel_ns = vcpu->last_guest_tsc -
 			vcpu->hv_clock.tsc_timestamp;
 		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
+		data &= ~(u64)0x8;	/* ignore TLB cache disable */
 		if (data != 0) {
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 				data);
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		 */
 		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
 		break;
+	case MSR_AMD64_OSVW_ID_LENGTH:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		vcpu->arch.osvw.length = data;
+		break;
+	case MSR_AMD64_OSVW_STATUS:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		vcpu->arch.osvw.status = data;
+		break;
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		 */
 		data = 0xbe702111;
 		break;
+	case MSR_AMD64_OSVW_ID_LENGTH:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		data = vcpu->arch.osvw.length;
+		break;
+	case MSR_AMD64_OSVW_STATUS:
+		if (!guest_cpuid_has_osvw(vcpu))
+			return 1;
+		data = vcpu->arch.osvw.status;
+		break;
 	default:
 		if (kvm_pmu_msr(vcpu, msr))
 			return kvm_pmu_get_msr(vcpu, msr, pdata);
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
+	case KVM_CAP_PCI_2_3:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
-		/* Make sure TSC doesn't go backwards */
-		s64 tsc_delta;
-		u64 tsc;
 
-		tsc = kvm_x86_ops->read_l1_tsc(vcpu);
-		tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
-			     tsc - vcpu->arch.last_guest_tsc;
+	/* Apply any externally detected TSC adjustments (due to suspend) */
+	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+		vcpu->arch.tsc_offset_adjustment = 0;
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+	}
 
+	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+				native_read_tsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
-			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+			u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+						vcpu->arch.last_guest_tsc);
+			kvm_x86_ops->write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 		}
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
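The adjustment bookkeeping can be pictured with a toy structure, separate from the patch and with field names assumed rather than taken from the kernel: any delta accumulated while the host was suspended is folded into the offset on the next load and then cleared, so it is never applied twice.

#include <stdint.h>
#include <stdio.h>

/* Toy per-vcpu state mirroring the fields used above (names assumed). */
struct toy_vcpu {
	uint64_t tsc_offset;             /* guest-visible TSC offset          */
	uint64_t tsc_offset_adjustment;  /* accumulated across host suspends  */
};

/* Apply any pending adjustment exactly once, as the load path above does. */
static void toy_vcpu_load(struct toy_vcpu *v)
{
	if (v->tsc_offset_adjustment) {
		v->tsc_offset += v->tsc_offset_adjustment;
		v->tsc_offset_adjustment = 0;  /* don't re-apply on next load */
	}
}

int main(void)
{
	struct toy_vcpu v = { .tsc_offset = 1000, .tsc_offset_adjustment = 500 };

	toy_vcpu_load(&v);
	toy_vcpu_load(&v);  /* second load is a no-op */
	printf("offset=%llu pending=%llu\n",
	       (unsigned long long)v.tsc_offset,
	       (unsigned long long)v.tsc_offset_adjustment);
	return 0;
}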
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		u32 user_tsc_khz;
 
 		r = -EINVAL;
-		if (!kvm_has_tsc_control)
-			break;
-
 		user_tsc_khz = (u32)arg;
 
 		if (user_tsc_khz >= kvm_max_guest_tsc_khz)
 			goto out;
 
-		kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+		if (user_tsc_khz == 0)
+			user_tsc_khz = tsc_khz;
+
+		kvm_set_tsc_khz(vcpu, user_tsc_khz);
 
 		r = 0;
 		goto out;
 	}
 	case KVM_GET_TSC_KHZ: {
-		r = -EIO;
-		if (check_tsc_unstable())
-			goto out;
-
-		r = vcpu_tsc_khz(vcpu);
-
+		r = vcpu->arch.virtual_tsc_khz;
 		goto out;
 	}
 	default:
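From user space these paths are reached through the existing per-vCPU ioctls. A rough usage sketch follows, separate from the patch, with error handling on the file descriptors omitted for brevity:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);

	/* Request a 1.5 GHz virtual TSC; 0 would mean "use the host rate". */
	if (ioctl(vcpu, KVM_SET_TSC_KHZ, 1500000) < 0)
		perror("KVM_SET_TSC_KHZ");

	/* The effective rate now comes straight from virtual_tsc_khz. */
	printf("virtual TSC rate: %d kHz\n", ioctl(vcpu, KVM_GET_TSC_KHZ, 0));
	return 0;
}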
@@ -2815,6 +2881,11 @@ out:
 	return r;
 }
 
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 {
 	int ret;
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm,
 			       unsigned long *dirty_bitmap,
 			       unsigned long nr_dirty_pages)
 {
+	spin_lock(&kvm->mmu_lock);
+
 	/* Not many dirty pages compared to # of shadow pages. */
 	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
 		unsigned long gfn_offset;
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm,
 		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
 			unsigned long gfn = memslot->base_gfn + gfn_offset;
 
-			spin_lock(&kvm->mmu_lock);
 			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-			spin_unlock(&kvm->mmu_lock);
 		}
 		kvm_flush_remote_tlbs(kvm);
-	} else {
-		spin_lock(&kvm->mmu_lock);
+	} else
 		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-		spin_unlock(&kvm->mmu_lock);
-	}
+
+	spin_unlock(&kvm->mmu_lock);
 }
 
 /*
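The locking change is the classic coarsening pattern: one acquisition around the whole scan instead of one per dirty page. A toy illustration with a pthread mutex follows; the bitmap and "protect" step are stand-ins, not KVM code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long dirty_bitmap = 0xA5;  /* invented dirty-page bits */

static void write_protect_page(int idx)
{
	printf("write-protecting page %d\n", idx);
}

static void write_protect_all_dirty(void)
{
	pthread_mutex_lock(&lock);        /* one acquisition for the slot   */
	for (int i = 0; i < 8; i++)
		if (dirty_bitmap & (1ul << i))
			write_protect_page(i);
	pthread_mutex_unlock(&lock);      /* released after the full pass   */
}

int main(void)
{
	write_protect_all_dirty();
	return 0;
}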
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EEXIST;
 		if (kvm->arch.vpic)
 			goto create_irqchip_unlock;
+		r = -EINVAL;
+		if (atomic_read(&kvm->online_vcpus))
+			goto create_irqchip_unlock;
 		r = -ENOMEM;
 		vpic = kvm_create_pic(kvm);
 		if (vpic) {
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 	return res;
 }
 
+static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+{
+	kvm_set_rflags(emul_to_vcpu(ctxt), val);
+}
+
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 {
 	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_idt             = emulator_set_idt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
+	.set_rflags          = emulator_set_rflags,
 	.cpl                 = emulator_get_cpl,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		profile_hit(KVM_PROFILING, (void *)rip);
 	}
 
+	if (unlikely(vcpu->arch.tsc_always_catchup))
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 	kvm_lapic_sync_from_vapic(vcpu);
 
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-		    bool has_error_code, u32 error_code)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+		    int reason, bool has_error_code, u32 error_code)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
 
-	ret = emulator_task_switch(ctxt, tss_selector, reason,
+	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
 				   has_error_code, error_code);
 
 	if (ret)
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage)
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	int i;
+	int ret;
+	u64 local_tsc;
+	u64 max_tsc = 0;
+	bool stable, backwards_tsc = false;
 
 	kvm_shared_msr_cpu_online();
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			if (vcpu->cpu == smp_processor_id())
-				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-	return kvm_x86_ops->hardware_enable(garbage);
+	ret = kvm_x86_ops->hardware_enable(garbage);
+	if (ret != 0)
+		return ret;
+
+	local_tsc = native_read_tsc();
+	stable = !check_tsc_unstable();
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (!stable && vcpu->cpu == smp_processor_id())
+				set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+				backwards_tsc = true;
+				if (vcpu->arch.last_host_tsc > max_tsc)
+					max_tsc = vcpu->arch.last_host_tsc;
+			}
+		}
+	}
+
+	/*
+	 * Sometimes, even reliable TSCs go backwards.  This happens on
+	 * platforms that reset TSC during suspend or hibernate actions, but
+	 * maintain synchronization.  We must compensate.  Fortunately, we can
+	 * detect that condition here, which happens early in CPU bringup,
+	 * before any KVM threads can be running.  Unfortunately, we can't
+	 * bring the TSCs fully up to date with real time, as we aren't yet far
+	 * enough into CPU bringup that we know how much real time has actually
+	 * elapsed; our helper function, get_kernel_ns() will be using boot
+	 * variables that haven't been updated yet.
+	 *
+	 * So we simply find the maximum observed TSC above, then record the
+	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
+	 * the adjustment will be applied.  Note that we accumulate
+	 * adjustments, in case multiple suspend cycles happen before some VCPU
+	 * gets a chance to run again.  In the event that no KVM threads get a
+	 * chance to run, we will miss the entire elapsed period, as we'll have
+	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+	 * lose cycle time.  This isn't too big a deal, since the loss will be
+	 * uniform across all VCPUs (not to mention the scenario is extremely
+	 * unlikely). It is possible that a second hibernate recovery happens
+	 * much faster than a first, causing the observed TSC here to be
+	 * smaller; this would require additional padding adjustment, which is
+	 * why we set last_host_tsc to the local tsc observed here.
+	 *
+	 * N.B. - this code below runs only on platforms with reliable TSC,
+	 * as that is the only way backwards_tsc is set above.  Also note
+	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+	 * have the same delta_cyc adjustment applied if backwards_tsc
+	 * is detected.  Note further, this adjustment is only done once,
+	 * as we reset last_host_tsc on all VCPUs to stop this from being
+	 * called multiple times (one for each physical CPU bringup).
+	 *
+	 * Platforms with unreliable TSCs don't have to deal with this, they
+	 * will be compensated by the logic in vcpu_load, which sets the TSC to
+	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
+	 * guarantee that they stay in perfect synchronization.
+	 */
+	if (backwards_tsc) {
+		u64 delta_cyc = max_tsc - local_tsc;
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			kvm_for_each_vcpu(i, vcpu, kvm) {
+				vcpu->arch.tsc_offset_adjustment += delta_cyc;
+				vcpu->arch.last_host_tsc = local_tsc;
+			}
+
+			/*
+			 * We have to disable TSC offset matching.. if you were
+			 * booting a VM while issuing an S4 host suspend....
+			 * you may have some problem.  Solving this issue is
+			 * left as an exercise to the reader.
+			 */
+			kvm->arch.last_tsc_nsec = 0;
+			kvm->arch.last_tsc_write = 0;
+		}
+
+	}
+	return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
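The detection pass above can be modelled in a few lines of plain C, separate from the patch; the values are invented, and pending_adjustment stands in for the per-vCPU tsc_offset_adjustment field:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t local_tsc = 1000;                    /* TSC read after resume */
	uint64_t last_host_tsc[3] = { 5000, 7000, 6400 };
	uint64_t pending_adjustment[3] = { 0, 0, 0 };

	/* Find the largest TSC any vCPU observed before the suspend. */
	uint64_t max_tsc = 0;
	int backwards = 0;
	for (int i = 0; i < 3; i++)
		if (last_host_tsc[i] > local_tsc) {
			backwards = 1;
			if (last_host_tsc[i] > max_tsc)
				max_tsc = last_host_tsc[i];
		}

	/* If the TSC went backwards, give every vCPU the same compensation. */
	if (backwards) {
		uint64_t delta = max_tsc - local_tsc;
		for (int i = 0; i < 3; i++) {
			pending_adjustment[i] += delta;   /* applied at next vcpu_load */
			last_host_tsc[i] = local_tsc;     /* don't compensate twice    */
		}
		printf("compensating %llu cycles on all vCPUs\n",
		       (unsigned long long)delta);
	}
	return 0;
}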
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn)
 	kvm_x86_ops->check_processor_compatibility(rtn);
 }
 
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.pio_data = page_address(page);
 
-	kvm_init_tsc_catchup(vcpu, max_tsc_khz);
+	kvm_set_tsc_khz(vcpu, max_tsc_khz);
 
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+	if (type)
+		return -EINVAL;
+
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	put_page(kvm->arch.ept_identity_pagetable);
 }
 
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+	int i;
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
+			vfree(free->arch.lpage_info[i]);
+			free->arch.lpage_info[i] = NULL;
+		}
+	}
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	int i;
+
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		unsigned long ugfn;
+		int lpages;
+		int level = i + 2;
+
+		lpages = gfn_to_index(slot->base_gfn + npages - 1,
+				      slot->base_gfn, level) + 1;
+
+		slot->arch.lpage_info[i] =
+			vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+		if (!slot->arch.lpage_info[i])
+			goto out_free;
+
+		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+			slot->arch.lpage_info[i][0].write_count = 1;
+		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+			slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+		ugfn = slot->userspace_addr >> PAGE_SHIFT;
+		/*
+		 * If the gfn and userspace address are not aligned wrt each
+		 * other, or if explicitly asked to, disable large page
+		 * support for this slot
+		 */
+		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+		    !kvm_largepages_enabled()) {
+			unsigned long j;
+
+			for (j = 0; j < lpages; ++j)
+				slot->arch.lpage_info[i][j].write_count = 1;
+		}
+	}
+
+	return 0;
+
+out_free:
+	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+		vfree(slot->arch.lpage_info[i]);
+		slot->arch.lpage_info[i] = NULL;
+	}
+	return -ENOMEM;
+}
+
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,