author		Linus Torvalds <torvalds@linux-foundation.org>	2012-03-28 17:35:31 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-28 17:35:31 -0400
commit		2e7580b0e75d771d93e24e681031a165b1d31071 (patch)
tree		d9449702609eeaab28913a43b5a4434667e09d43 /arch/x86/kvm
parent		d25413efa9536e2f425ea45c7720598035c597bc (diff)
parent		cf9eeac46350b8b43730b7dc5e999757bed089a4 (diff)
Merge branch 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Avi Kivity:
"Changes include timekeeping improvements, support for assigning host
PCI devices that share interrupt lines, s390 user-controlled guests, a
large ppc update, and random fixes."
This is with the sign-offs fixed; hopefully next merge window we won't
have rebased commits.
* 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
KVM: Convert intx_mask_lock to spin lock
KVM: x86: fix kvm_write_tsc() TSC matching thinko
x86: kvmclock: abstract save/restore sched_clock_state
KVM: nVMX: Fix erroneous exception bitmap check
KVM: Ignore the writes to MSR_K7_HWCR(3)
KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask
KVM: PMU: add proper support for fixed counter 2
KVM: PMU: Fix raw event check
KVM: PMU: warn when pin control is set in eventsel msr
KVM: VMX: Fix delayed load of shared MSRs
KVM: use correct tlbs dirty type in cmpxchg
KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
KVM: Ensure all vcpus are consistent with in-kernel irqchip settings
KVM: x86 emulator: Allow PM/VM86 switch during task switch
KVM: SVM: Fix CPL updates
KVM: x86 emulator: VM86 segments must have DPL 3
KVM: x86 emulator: Fix task switch privilege checks
arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice
KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation
KVM: mmu_notifier: Flush TLBs before releasing mmu_lock
...
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--	arch/x86/kvm/cpuid.c	2
-rw-r--r--	arch/x86/kvm/cpuid.h	8
-rw-r--r--	arch/x86/kvm/emulate.c	112
-rw-r--r--	arch/x86/kvm/i8259.c	1
-rw-r--r--	arch/x86/kvm/lapic.c	4
-rw-r--r--	arch/x86/kvm/mmu.c	85
-rw-r--r--	arch/x86/kvm/mmu_audit.c	4
-rw-r--r--	arch/x86/kvm/pmu.c	10
-rw-r--r--	arch/x86/kvm/svm.c	119
-rw-r--r--	arch/x86/kvm/vmx.c	53
-rw-r--r--	arch/x86/kvm/x86.c	403
11 files changed, 595 insertions, 206 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 89b02bfaaca5..9fed5bedaad6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
 	/* cpuid 0xC0000001.edx */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 5b97e1797a6d..26d1fb437eb5 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+	return best && (best->ecx & bit(X86_FEATURE_OSVW));
+}
+
 #endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0982507b962a..83756223f8aa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -57,6 +57,7 @@
 #define OpDS 23ull /* DS */
 #define OpFS 24ull /* FS */
 #define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
 
 #define OpBits 5 /* Width of operand field */
 #define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +102,7 @@
 #define SrcAcc (OpAcc << SrcShift)
 #define SrcImmU16 (OpImmU16 << SrcShift)
 #define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
 #define SrcMask (OpMask << SrcShift)
 #define BitOp (1<<11)
 #define MemAbs (1<<12) /* Memory operand is absolute displacement */
@@ -858,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 }
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
-				    struct operand *op,
-				    int inhibit_bytereg)
+				    struct operand *op)
 {
 	unsigned reg = ctxt->modrm_reg;
 	int highbyte_regs = ctxt->rex_prefix == 0;
@@ -876,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	}
 
 	op->type = OP_REG;
-	if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+	if (ctxt->d & ByteOp) {
 		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 		op->bytes = 1;
 	} else {
@@ -1151,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 	return 1;
 }
 
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+				     u16 index, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	ulong addr;
+
+	ctxt->ops->get_idt(ctxt, &dt);
+
+	if (dt.size < index * 8 + 7)
+		return emulate_gp(ctxt, index << 3 | 0x2);
+
+	addr = dt.address + index * 8;
+	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+				   &ctxt->exception);
+}
+
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, struct desc_ptr *dt)
 {
@@ -1227,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 		seg_desc.type = 3;
 		seg_desc.p = 1;
 		seg_desc.s = 1;
+		if (ctxt->mode == X86EMUL_MODE_VM86)
+			seg_desc.dpl = 3;
 		goto load;
 	}
 
@@ -1891,6 +1910,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->p = 1;
 }
 
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = ecx = 0;
+	return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
+		&& ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+		&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+		&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -2007,6 +2037,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_REAL)
 		return emulate_gp(ctxt, 0);
 
+	/*
+	 * Not recognized on AMD in compat mode (but is recognized in legacy
+	 * mode).
+	 */
+	if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
+	    && !vendor_intel(ctxt))
+		return emulate_ud(ctxt);
+
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	 * Therefore, we inject an #UD.
 	 */
@@ -2306,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 		return emulate_gp(ctxt, 0);
 	ctxt->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
+
+	/* General purpose registers */
 	ctxt->regs[VCPU_REGS_RAX] = tss->eax;
 	ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
 	ctxt->regs[VCPU_REGS_RDX] = tss->edx;
@@ -2328,6 +2368,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
 
 	/*
+	 * If we're switching between Protected Mode and VM86, we need to make
+	 * sure to update the mode before loading the segment descriptors so
+	 * that the selectors are interpreted correctly.
+	 *
+	 * Need to get rflags to the vcpu struct immediately because it
+	 * influences the CPL which is checked at least when loading the segment
+	 * descriptors and when pushing an error code to the new kernel stack.
+	 *
+	 * TODO Introduce a separate ctxt->ops->set_cpl callback
+	 */
+	if (ctxt->eflags & X86_EFLAGS_VM)
+		ctxt->mode = X86EMUL_MODE_VM86;
+	else
+		ctxt->mode = X86EMUL_MODE_PROT32;
+
+	ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+
+	/*
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
 	 */
@@ -2401,7 +2459,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-				   u16 tss_selector, int reason,
+				   u16 tss_selector, int idt_index, int reason,
 				   bool has_error_code, u32 error_code)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
@@ -2423,12 +2481,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	/* FIXME: check that next_tss_desc is tss */
 
-	if (reason != TASK_SWITCH_IRET) {
-		if ((tss_selector & 3) > next_tss_desc.dpl ||
-		    ops->cpl(ctxt) > next_tss_desc.dpl)
-			return emulate_gp(ctxt, 0);
+	/*
+	 * Check privileges. The three cases are task switch caused by...
+	 *
+	 * 1. jmp/call/int to task gate: Check against DPL of the task gate
+	 * 2. Exception/IRQ/iret: No check is performed
+	 * 3. jmp/call to TSS: Check agains DPL of the TSS
+	 */
+	if (reason == TASK_SWITCH_GATE) {
+		if (idt_index != -1) {
+			/* Software interrupts */
+			struct desc_struct task_gate_desc;
+			int dpl;
+
+			ret = read_interrupt_descriptor(ctxt, idt_index,
+							&task_gate_desc);
+			if (ret != X86EMUL_CONTINUE)
+				return ret;
+
+			dpl = task_gate_desc.dpl;
+			if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+				return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+		}
+	} else if (reason != TASK_SWITCH_IRET) {
+		int dpl = next_tss_desc.dpl;
+		if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+			return emulate_gp(ctxt, tss_selector);
 	}
 
+
 	desc_limit = desc_limit_scaled(&next_tss_desc);
 	if (!next_tss_desc.p ||
 	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2481,7 +2562,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
-			 u16 tss_selector, int reason,
+			 u16 tss_selector, int idt_index, int reason,
 			 bool has_error_code, u32 error_code)
 {
 	int rc;
@@ -2489,7 +2570,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	ctxt->_eip = ctxt->eip;
 	ctxt->dst.type = OP_NONE;
 
-	rc = emulator_do_task_switch(ctxt, tss_selector, reason,
+	rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
 				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE)
@@ -3514,13 +3595,13 @@ static struct opcode twobyte_table[256] = {
 	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
 	G(BitOp, group8),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
-	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	D2bv(DstMem | SrcReg | ModRM | Lock),
 	N, D(DstMem | SrcReg | ModRM | Mov),
@@ -3602,9 +3683,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 
 	switch (d) {
 	case OpReg:
-		decode_register_operand(ctxt, op,
-			   op == &ctxt->dst &&
-			   ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+		decode_register_operand(ctxt, op);
 		break;
 	case OpImmUByte:
 		rc = decode_imm(ctxt, op, 1, false);
@@ -3656,6 +3735,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 	case OpImm:
 		rc = decode_imm(ctxt, op, imm_size(ctxt), true);
 		break;
+	case OpMem8:
+		ctxt->memop.bytes = 1;
+		goto mem_common;
 	case OpMem16:
 		ctxt->memop.bytes = 2;
 		goto mem_common;
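[Editor's note] A minimal standalone sketch (not kernel code; the names below are invented for illustration) of the privilege rules the reworked emulator_do_task_switch() path above enforces: a software-interrupt task switch through a task gate is checked against the gate's DPL, IRET and hardware exceptions/IRQs skip the check, and a jmp/call to a TSS is checked against the DPL of the TSS descriptor.

/* Sketch only: simplified model of the three cases described in the patch. */
enum switch_reason { SW_GATE_SOFT_INT, SW_IRET_OR_HW_EVENT, SW_JMP_OR_CALL };

static int task_switch_allowed(enum switch_reason reason, int selector_rpl,
                               int cpl, int gate_dpl, int tss_dpl)
{
        if (reason == SW_IRET_OR_HW_EVENT)
                return 1;                               /* case 2: no check */
        if (reason == SW_GATE_SOFT_INT)                 /* case 1: task gate DPL */
                return selector_rpl <= gate_dpl && cpl <= gate_dpl;
        return selector_rpl <= tss_dpl && cpl <= tss_dpl;       /* case 3: TSS DPL */
}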
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index b6a73537e1ef..81cf4fa4a2be 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 		if (val & 0x10) {
 			s->init4 = val & 1;
 			s->last_irr = 0;
+			s->irr &= s->elcr;
 			s->imr = 0;
 			s->priority_add = 0;
 			s->special_mask = 0;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 31bfc6927bc0..858432287ab6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 
 	case APIC_DM_INIT:
-		if (level) {
+		if (!trig_mode || level) {
 			result = 1;
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 		u64 ns = 0;
 		struct kvm_vcpu *vcpu = apic->vcpu;
-		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+		unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 		unsigned long flags;
 
 		if (unlikely(!tscdeadline || !this_tsc_khz))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 224b02c3cda9..4cb164268846 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -688,9 +688,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 {
 	unsigned long idx;
 
-	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-	return &slot->lpage_info[level - 2][idx];
+	idx = gfn_to_index(gfn, slot->base_gfn, level);
+	return &slot->arch.lpage_info[level - 2][idx];
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -946,7 +945,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	}
 }
 
-static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 				    struct kvm_memory_slot *slot)
 {
 	struct kvm_lpage_info *linfo;
@@ -966,7 +965,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	struct kvm_memory_slot *slot;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return __gfn_to_rmap(kvm, gfn, level, slot);
+	return __gfn_to_rmap(gfn, level, slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -988,7 +987,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
 {
 	return pte_list_next(rmapp, spte);
 }
@@ -1018,8 +1017,8 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 	u64 *spte;
 	int i, write_protected = 0;
 
-	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
-	spte = rmap_next(kvm, rmapp, NULL);
+	rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
@@ -1027,14 +1026,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
-		spte = rmap_next(kvm, rmapp, NULL);
+		rmapp = __gfn_to_rmap(gfn, i, slot);
+		spte = rmap_next(rmapp, NULL);
 		while (spte) {
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
 			BUG_ON(!is_large_pte(*spte));
@@ -1045,7 +1044,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 				spte = NULL;
 				write_protected = 1;
 			}
-			spte = rmap_next(kvm, rmapp, spte);
+			spte = rmap_next(rmapp, spte);
 		}
 	}
 
@@ -1066,7 +1065,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	u64 *spte;
 	int need_tlb_flush = 0;
 
-	while ((spte = rmap_next(kvm, rmapp, NULL))) {
+	while ((spte = rmap_next(rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 		drop_spte(kvm, spte);
@@ -1085,14 +1084,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		BUG_ON(!is_shadow_present_pte(*spte));
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
 			drop_spte(kvm, spte);
-			spte = rmap_next(kvm, rmapp, NULL);
+			spte = rmap_next(rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1102,7 +1101,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte &= ~shadow_accessed_mask;
 			mmu_spte_clear_track_bits(spte);
 			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(kvm, rmapp, spte);
+			spte = rmap_next(rmapp, spte);
 		}
 	}
 	if (need_flush)
@@ -1176,7 +1175,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		int _young;
 		u64 _spte = *spte;
@@ -1186,7 +1185,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			young = 1;
 			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 	return young;
 }
@@ -1205,7 +1204,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		goto out;
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		u64 _spte = *spte;
 		BUG_ON(!(_spte & PT_PRESENT_MASK));
@@ -1214,7 +1213,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			young = 1;
 			break;
 		}
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 out:
 	return young;
@@ -1391,11 +1390,6 @@ struct kvm_mmu_pages {
 	unsigned int nr;
 };
 
-#define for_each_unsync_children(bitmap, idx)		\
-	for (idx = find_first_bit(bitmap, 512);		\
-	     idx < 512;					\
-	     idx = find_next_bit(bitmap, 512, idx+1))
-
 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
 			 int idx)
 {
@@ -1417,7 +1411,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 {
 	int i, ret, nr_unsync_leaf = 0;
 
-	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
 		struct kvm_mmu_page *child;
 		u64 ent = sp->spt[i];
 
@@ -1803,6 +1797,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 	if (is_large_pte(*sptep)) {
 		drop_spte(vcpu->kvm, sptep);
+		--vcpu->kvm->stat.lpages;
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	}
 }
@@ -3190,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 #undef PTTYPE
 
 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu *context,
-				  int level)
+				  struct kvm_mmu *context)
 {
 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 	u64 exb_bit_rsvd = 0;
 
 	if (!context->nx)
 		exb_bit_rsvd = rsvd_bits(63, 63);
-	switch (level) {
+	switch (context->root_level) {
 	case PT32_ROOT_LEVEL:
 		/* no rsvd bits for 2 level 4K page table entries */
 		context->rsvd_bits_mask[0][1] = 0;
@@ -3256,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 					int level)
 {
 	context->nx = is_nx(vcpu);
+	context->root_level = level;
 
-	reset_rsvds_bits_mask(vcpu, context, level);
+	reset_rsvds_bits_mask(vcpu, context);
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -3267,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->invlpg = paging64_invlpg;
 	context->update_pte = paging64_update_pte;
 	context->free = paging_free;
-	context->root_level = level;
 	context->shadow_root_level = level;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
@@ -3284,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu *context)
 {
 	context->nx = false;
+	context->root_level = PT32_ROOT_LEVEL;
 
-	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
+	reset_rsvds_bits_mask(vcpu, context);
 
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
@@ -3294,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
 	context->update_pte = paging32_update_pte;
-	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
@@ -3325,7 +3319,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->get_cr3 = get_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
-	context->nx = is_nx(vcpu);
 
 	if (!is_paging(vcpu)) {
 		context->nx = false;
@@ -3333,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
 		context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
-		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging64_gva_to_gpa;
 	} else if (is_pae(vcpu)) {
 		context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
-		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging64_gva_to_gpa;
 	} else {
 		context->nx = false;
-		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
-		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, context);
+		context->gva_to_gpa = paging32_gva_to_gpa;
 	}
 
 	return 0;
@@ -3408,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
 	} else if (is_long_mode(vcpu)) {
 		g_context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
 		g_context->root_level = PT64_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else if (is_pae(vcpu)) {
 		g_context->nx = is_nx(vcpu);
-		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
 		g_context->root_level = PT32E_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
 	} else {
 		g_context->nx = false;
-		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
 		g_context->root_level = PT32_ROOT_LEVEL;
+		reset_rsvds_bits_mask(vcpu, g_context);
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 	}
 
@@ -3555,7 +3548,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
  * If we're seeing too many writes to a page, it may no longer be a page table,
  * or we may be forking, in which case it is better to unmap the page.
  */
-static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
 {
 	/*
 	 * Skip write-flooding detected for the sp whose level is 1, because
@@ -3664,10 +3657,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-		spte = get_written_sptes(sp, gpa, &npte);
-
 		if (detect_write_misaligned(sp, gpa, bytes) ||
-		    detect_write_flooding(sp, spte)) {
+		    detect_write_flooding(sp)) {
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
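[Editor's note] The __mmu_unsync_walk() change above drops the local for_each_unsync_children() macro in favour of the generic for_each_set_bit() iterator from <linux/bitops.h>. A hedged sketch of why the two are equivalent (the walk_unsync_children() wrapper below is illustrative only, not kernel code): both forms visit every set bit of the 512-entry child bitmap.

#include <linux/bitops.h>

/* Sketch only: the open-coded walk and the generic iterator do the same thing. */
static void walk_unsync_children(unsigned long *bitmap)
{
        unsigned int i;

        /* form used by the removed macro: explicit find_first_bit/find_next_bit loop */
        for (i = find_first_bit(bitmap, 512); i < 512;
             i = find_next_bit(bitmap, 512, i + 1))
                ; /* visit child i */

        /* generic helper: expands to essentially the same loop */
        for_each_set_bit(i, bitmap, 512)
                ; /* visit child i */
}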
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ea7b4fd34676..715da5a19a5b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	slot = gfn_to_memslot(kvm, sp->gfn);
 	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-	spte = rmap_next(kvm, rmapp, NULL);
+	spte = rmap_next(rmapp, NULL);
 	while (spte) {
 		if (is_writable_pte(*spte))
 			audit_printk(kvm, "shadow page has writable "
 				     "mappings: gfn %llx role %x\n",
 				     sp->gfn, sp->role.word);
-		spte = rmap_next(kvm, rmapp, spte);
+		spte = rmap_next(rmapp, spte);
 	}
 }
 
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7aad5446f393..a73f0c104813 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping {
 	[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
 	[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
 	[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+	[7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
 };
 
 /* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 2};
+int fixed_pmc_events[] = {1, 0, 7};
 
 static bool pmc_is_gp(struct kvm_pmc *pmc)
 {
@@ -210,6 +211,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	unsigned config, type = PERF_TYPE_RAW;
 	u8 event_select, unit_mask;
 
+	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+		printk_once("kvm pmu: pin control bit is ignored\n");
+
 	pmc->eventsel = eventsel;
 
 	stop_counter(pmc);
@@ -220,7 +224,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
 	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
 
-	if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
 			  ARCH_PERFMON_EVENTSEL_INV |
 			  ARCH_PERFMON_EVENTSEL_CMASK))) {
 		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
@@ -413,7 +417,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
 	struct kvm_pmc *counters;
 	u64 ctr;
 
-	pmc &= (3u << 30) - 1;
+	pmc &= ~(3u << 30);
 	if (!fixed && pmc >= pmu->nr_arch_gp_counters)
 		return 1;
 	if (fixed && pmc >= pmu->nr_arch_fixed_counters)
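[Editor's note] A quick worked example of the RDPMC index fix in kvm_pmu_read_pmc() above. ECX bits 30-31 select the counter type (bit 30 means "fixed counter"), so only bits 29:0 form the index. The old mask (3u << 30) - 1 == 0xbfffffff clears bit 30 but leaves bit 31 set, while the new mask ~(3u << 30) == 0x3fffffff clears both. A small hedged userspace sketch, not kernel code:

#include <stdio.h>

int main(void)
{
        unsigned int pmc = 0x80000001u;  /* hypothetical guest ECX value with bit 31 set */

        printf("old mask: %#x\n", pmc & ((3u << 30) - 1));  /* prints 0x80000001 */
        printf("new mask: %#x\n", pmc & ~(3u << 30));        /* prints 0x1 */
        return 0;
}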
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e385214711cb..e334389e1c75 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -111,6 +111,12 @@ struct nested_state {
 #define MSRPM_OFFSETS 16
 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
 struct vcpu_svm {
 	struct kvm_vcpu vcpu;
 	struct vmcb *vmcb;
@@ -177,11 +183,13 @@ static bool npt_enabled = true;
 #else
 static bool npt_enabled;
 #endif
-static int npt = 1;
 
+/* allow nested paging (virtualized MMU) for all guests */
+static int npt = true;
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
 module_param(nested, int, S_IRUGO);
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
@@ -557,6 +565,27 @@ static void svm_init_erratum_383(void)
 	erratum_383_found = true;
 }
 
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Guests should see errata 400 and 415 as fixed (assuming that
+	 * HLT and IO instructions are intercepted).
+	 */
+	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+	/*
+	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
+	 * all osvw.status bits inside that length, including bit 0 (which is
+	 * reserved for erratum 298), are valid. However, if host processor's
+	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
+	 * be conservative here and therefore we tell the guest that erratum 298
+	 * is present (because we really don't know).
+	 */
+	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+		vcpu->arch.osvw.status |= 1;
+}
+
 static int has_svm(void)
 {
 	const char *msg;
@@ -623,6 +652,36 @@ static int svm_hardware_enable(void *garbage)
 		__get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
 	}
 
+
+	/*
+	 * Get OSVW bits.
+	 *
+	 * Note that it is possible to have a system with mixed processor
+	 * revisions and therefore different OSVW bits. If bits are not the same
+	 * on different processors then choose the worst case (i.e. if erratum
+	 * is present on one processor and not on another then assume that the
+	 * erratum is present everywhere).
+	 */
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+		uint64_t len, status = 0;
+		int err;
+
+		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+		if (!err)
+			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+						      &err);
+
+		if (err)
+			osvw_status = osvw_len = 0;
+		else {
+			if (len < osvw_len)
+				osvw_len = len;
+			osvw_status |= status;
+			osvw_status &= (1ULL << osvw_len) - 1;
+		}
+	} else
+		osvw_status = osvw_len = 0;
+
 	svm_init_erratum_383();
 
 	amd_pmu_enable_virt();
@@ -910,20 +969,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 	return _tsc;
 }
 
-static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 ratio;
 	u64 khz;
 
-	/* TSC scaling supported? */
-	if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
+	/* Guest TSC same frequency as host TSC? */
+	if (!scale) {
+		svm->tsc_ratio = TSC_RATIO_DEFAULT;
 		return;
+	}
 
-	/* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
-	if (user_tsc_khz == 0) {
-		vcpu->arch.virtual_tsc_khz = 0;
-		svm->tsc_ratio = TSC_RATIO_DEFAULT;
+	/* TSC scaling supported? */
+	if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+		if (user_tsc_khz > tsc_khz) {
+			vcpu->arch.tsc_catchup = 1;
+			vcpu->arch.tsc_always_catchup = 1;
+		} else
+			WARN(1, "user requested TSC rate below hardware speed\n");
 		return;
 	}
 
@@ -938,7 +1002,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 				user_tsc_khz);
 		return;
 	}
-	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 	svm->tsc_ratio = ratio;
 }
 
@@ -958,10 +1021,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
-static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	WARN_ON(adjustment < 0);
+	if (host)
+		adjustment = svm_scale_tsc(vcpu, adjustment);
+
 	svm->vmcb->control.tsc_offset += adjustment;
 	if (is_guest_mode(vcpu))
 		svm->nested.hsave->control.tsc_offset += adjustment;
@@ -1191,6 +1258,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (kvm_vcpu_is_bsp(&svm->vcpu))
 		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
+	svm_init_osvw(&svm->vcpu);
+
 	return &svm->vcpu;
 
 free_page4:
@@ -1268,6 +1337,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 
+static void svm_update_cpl(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int cpl;
+
+	if (!is_protmode(vcpu))
+		cpl = 0;
+	else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
+		cpl = 3;
+	else
+		cpl = svm->vmcb->save.cs.selector & 0x3;
+
+	svm->vmcb->save.cpl = cpl;
+}
+
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	return to_svm(vcpu)->vmcb->save.rflags;
@@ -1275,7 +1359,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+	unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
+
 	to_svm(vcpu)->vmcb->save.rflags = rflags;
+	if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
+		svm_update_cpl(vcpu);
 }
 
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1543,9 +1631,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
 	}
 	if (seg == VCPU_SREG_CS)
-		svm->vmcb->save.cpl
-			= (svm->vmcb->save.cs.attrib
-			   >> SVM_SELECTOR_DPL_SHIFT) & 3;
+		svm_update_cpl(vcpu);
 
 	mark_dirty(svm->vmcb, VMCB_SEG);
 }
@@ -2735,7 +2821,10 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 		skip_emulated_instruction(&svm->vcpu);
 
-	if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+		int_vec = -1;
+
+	if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
 			    has_error_code, error_code) == EMULATE_FAIL) {
 		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
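[Editor's note] The OSVW plumbing added to svm.c above merges the OS Visible Workaround state across host CPUs before exposing it to guests: the shortest advertised length wins and the status bits are OR-ed, so an erratum flagged on any one processor is reported everywhere. A minimal sketch of that merge step (the osvw_merge_cpu() helper is illustrative only, not kernel code):

#include <stdint.h>

static uint64_t osvw_len = 4, osvw_status;

/* Sketch only: worst-case merge of one CPU's OSVW length/status. */
static void osvw_merge_cpu(uint64_t len, uint64_t status)
{
        if (len < osvw_len)
                osvw_len = len;         /* shortest valid length wins */
        osvw_status |= status;          /* erratum present anywhere => present */
        osvw_status &= (1ULL << osvw_len) - 1;  /* keep only bits the length covers */
}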
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 246490f643b6..280751c84724 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
-static bool __read_mostly yield_on_hlt = 1;
-module_param(yield_on_hlt, bool, S_IRUGO);
-
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
@@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
-	/* Ensure that we clear the HLT state in the VMCS. We don't need to
-	 * explicitly skip the instruction because if the HLT state is set, then
-	 * the instruction is already executing and RIP has already been
-	 * advanced. */
-	if (!yield_on_hlt &&
-	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
-		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -1678,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu)
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
 	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-	if (!(vmcs12->exception_bitmap & PF_VECTOR))
+	if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
 		return 0;
 
 	nested_vmx_vmexit(vcpu);
@@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-	vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -1817,13 +1802,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
- * ioctl. In this case the call-back should update internal vmx state to make
- * the changes effective.
+ * Engage any workarounds for mis-matched TSC rates. Currently limited to
+ * software catchup for faster rates on slower CPUs.
  */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 {
-	/* Nothing to do here */
+	if (!scale)
+		return;
+
+	if (user_tsc_khz > tsc_khz) {
+		vcpu->arch.tsc_catchup = 1;
+		vcpu->arch.tsc_always_catchup = 1;
+	} else
+		WARN(1, "user requested TSC rate below hardware speed\n");
 }
 
 /*
@@ -1850,7 +1841,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	}
 }
 
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 	u64 offset = vmcs_read64(TSC_OFFSET);
 	vmcs_write64(TSC_OFFSET, offset + adjustment);
@@ -2219,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			msr->data = data;
+			if (msr - vmx->guest_msrs < vmx->save_nmsrs)
+				kvm_set_shared_msr(msr->index, msr->data,
+						   msr->mask);
 			break;
 		}
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -2399,7 +2393,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_pin_based_exec_control) < 0)
 		return -EIO;
 
-	min =
+	min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
 	      CPU_BASED_CR8_STORE_EXITING |
@@ -2414,9 +2408,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_INVLPG_EXITING |
 	      CPU_BASED_RDPMC_EXITING;
 
-	if (yield_on_hlt)
-		min |= CPU_BASED_HLT_EXITING;
-
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -4003,7 +3994,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	} else
 		intr |= INTR_TYPE_EXT_INTR;
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-	vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4035,7 +4025,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-	vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -4672,9 +4661,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) | |||
4672 | bool has_error_code = false; | 4661 | bool has_error_code = false; |
4673 | u32 error_code = 0; | 4662 | u32 error_code = 0; |
4674 | u16 tss_selector; | 4663 | u16 tss_selector; |
4675 | int reason, type, idt_v; | 4664 | int reason, type, idt_v, idt_index; |
4676 | 4665 | ||
4677 | idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | 4666 | idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); |
4667 | idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); | ||
4678 | type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); | 4668 | type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); |
4679 | 4669 | ||
4680 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 4670 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
@@ -4712,8 +4702,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) | |||
4712 | type != INTR_TYPE_NMI_INTR)) | 4702 | type != INTR_TYPE_NMI_INTR)) |
4713 | skip_emulated_instruction(vcpu); | 4703 | skip_emulated_instruction(vcpu); |
4714 | 4704 | ||
4715 | if (kvm_task_switch(vcpu, tss_selector, reason, | 4705 | if (kvm_task_switch(vcpu, tss_selector, |
4716 | has_error_code, error_code) == EMULATE_FAIL) { | 4706 | type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, |
4707 | has_error_code, error_code) == EMULATE_FAIL) { | ||
4717 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | 4708 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
4718 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | 4709 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; |
4719 | vcpu->run->internal.ndata = 0; | 4710 | vcpu->run->internal.ndata = 0; |
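handle_task_switch() now also extracts the vector number from the IDT-vectoring information so task switches triggered by software interrupts can have the IDT entry's privilege checked by the emulator. A stand-alone sketch of the mask-and-compare step; the field layout follows the VMX definition (vector in bits 7:0, type in bits 10:8, valid in bit 31), and the sample value is made up:

```c
#include <stdio.h>
#include <stdint.h>

/* Mask names mirror the kernel's; values are the architectural layout. */
#define VECTORING_INFO_VECTOR_MASK  0x000000ffu
#define VECTORING_INFO_TYPE_MASK    0x00000700u
#define VECTORING_INFO_VALID_MASK   0x80000000u
#define INTR_TYPE_SOFT_INTR         (4u << 8)   /* software interrupt */

int main(void)
{
	uint32_t idt_vectoring_info = 0x80000480u; /* hypothetical: valid soft INT 0x80 */

	unsigned int idt_v     = !!(idt_vectoring_info & VECTORING_INFO_VALID_MASK);
	unsigned int idt_index = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	uint32_t type          = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

	/* Only pass the IDT index along for software interrupts, as the hunk above does. */
	int index_arg = (type == INTR_TYPE_SOFT_INTR) ? (int)idt_index : -1;

	printf("valid=%u vector=%u idt_index arg=%d\n", idt_v, idt_index, index_arg);
	return 0;
}
```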
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54696b5f8443..4044ce0bf7c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control); | |||
97 | u32 kvm_max_guest_tsc_khz; | 97 | u32 kvm_max_guest_tsc_khz; |
98 | EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); | 98 | EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); |
99 | 99 | ||
100 | /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ | ||
101 | static u32 tsc_tolerance_ppm = 250; | ||
102 | module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); | ||
103 | |||
100 | #define KVM_NR_SHARED_MSRS 16 | 104 | #define KVM_NR_SHARED_MSRS 16 |
101 | 105 | ||
102 | struct kvm_shared_msrs_global { | 106 | struct kvm_shared_msrs_global { |
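The new tsc_tolerance_ppm parameter is expressed in parts per million of the host TSC rate; the default of 250 ppm allows roughly 250 us of drift per second, about 21.6 seconds per day, before KVM resorts to scaling or catchup. A back-of-the-envelope sketch:

```c
#include <stdio.h>

int main(void)
{
	unsigned int tsc_tolerance_ppm = 250;           /* default from the hunk above */

	/* 1 ppm of rate error accumulates 1 us of drift per second of real time. */
	double us_per_sec  = tsc_tolerance_ppm;         /* 250 us/s   */
	double sec_per_day = us_per_sec * 86400 / 1e6;  /* ~21.6 s/day */

	printf("%u ppm -> %.0f us/s, %.1f s/day worst-case drift\n",
	       tsc_tolerance_ppm, us_per_sec, sec_per_day);
	return 0;
}
```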
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void) | |||
969 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); | 973 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
970 | unsigned long max_tsc_khz; | 974 | unsigned long max_tsc_khz; |
971 | 975 | ||
972 | static inline int kvm_tsc_changes_freq(void) | 976 | static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) |
973 | { | 977 | { |
974 | int cpu = get_cpu(); | 978 | return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, |
975 | int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && | 979 | vcpu->arch.virtual_tsc_shift); |
976 | cpufreq_quick_get(cpu) != 0; | ||
977 | put_cpu(); | ||
978 | return ret; | ||
979 | } | 980 | } |
980 | 981 | ||
981 | u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) | 982 | static u32 adjust_tsc_khz(u32 khz, s32 ppm) |
982 | { | 983 | { |
983 | if (vcpu->arch.virtual_tsc_khz) | 984 | u64 v = (u64)khz * (1000000 + ppm); |
984 | return vcpu->arch.virtual_tsc_khz; | 985 | do_div(v, 1000000); |
985 | else | 986 | return v; |
986 | return __this_cpu_read(cpu_tsc_khz); | ||
987 | } | 987 | } |
988 | 988 | ||
989 | static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) | 989 | static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) |
990 | { | 990 | { |
991 | u64 ret; | 991 | u32 thresh_lo, thresh_hi; |
992 | 992 | int use_scaling = 0; | |
993 | WARN_ON(preemptible()); | ||
994 | if (kvm_tsc_changes_freq()) | ||
995 | printk_once(KERN_WARNING | ||
996 | "kvm: unreliable cycle conversion on adjustable rate TSC\n"); | ||
997 | ret = nsec * vcpu_tsc_khz(vcpu); | ||
998 | do_div(ret, USEC_PER_SEC); | ||
999 | return ret; | ||
1000 | } | ||
1001 | 993 | ||
1002 | static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) | ||
1003 | { | ||
1004 | /* Compute a scale to convert nanoseconds in TSC cycles */ | 994 | /* Compute a scale to convert nanoseconds in TSC cycles */ |
1005 | kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, | 995 | kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, |
1006 | &vcpu->arch.tsc_catchup_shift, | 996 | &vcpu->arch.virtual_tsc_shift, |
1007 | &vcpu->arch.tsc_catchup_mult); | 997 | &vcpu->arch.virtual_tsc_mult); |
998 | vcpu->arch.virtual_tsc_khz = this_tsc_khz; | ||
999 | |||
1000 | /* | ||
1001 | * Compute the range of TSC rates that falls within the | ||
1002 | * tolerance and decide whether the rate being applied is | ||
1003 | * within those bounds of the hardware rate. If so, no | ||
1004 | * scaling or compensation need be done. | ||
1005 | */ | ||
1006 | thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); | ||
1007 | thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); | ||
1008 | if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { | ||
1009 | pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); | ||
1010 | use_scaling = 1; | ||
1011 | } | ||
1012 | kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); | ||
1008 | } | 1013 | } |
1009 | 1014 | ||
1010 | static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) | 1015 | static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) |
1011 | { | 1016 | { |
1012 | u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, | 1017 | u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, |
1013 | vcpu->arch.tsc_catchup_mult, | 1018 | vcpu->arch.virtual_tsc_mult, |
1014 | vcpu->arch.tsc_catchup_shift); | 1019 | vcpu->arch.virtual_tsc_shift); |
1015 | tsc += vcpu->arch.last_tsc_write; | 1020 | tsc += vcpu->arch.this_tsc_write; |
1016 | return tsc; | 1021 | return tsc; |
1017 | } | 1022 | } |
1018 | 1023 | ||
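kvm_set_tsc_khz() builds a [thresh_lo, thresh_hi] band around the host rate with adjust_tsc_khz() and only asks the backend for scaling or catchup when the requested virtual rate falls outside it. The same arithmetic, redone in user space with a hypothetical 2.5 GHz host:

```c
#include <stdio.h>
#include <stdint.h>

/* Same math as adjust_tsc_khz(): khz * (1e6 + ppm) / 1e6, in 64 bits. */
static uint32_t adjust_tsc_khz(uint32_t khz, int32_t ppm)
{
	uint64_t v = (uint64_t)khz * (1000000 + ppm);
	return (uint32_t)(v / 1000000);
}

int main(void)
{
	uint32_t tsc_khz = 2500000;         /* hypothetical 2.5 GHz host TSC  */
	uint32_t tsc_tolerance_ppm = 250;   /* module parameter default       */
	uint32_t requested = 2600000;       /* guest asks for 2.6 GHz         */

	uint32_t thresh_lo = adjust_tsc_khz(tsc_khz, -(int32_t)tsc_tolerance_ppm);
	uint32_t thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);

	printf("tolerance band: [%u, %u] kHz\n", thresh_lo, thresh_hi);
	printf("requested %u kHz -> %s\n", requested,
	       (requested < thresh_lo || requested > thresh_hi)
		       ? "use scaling/catchup" : "run at hardware rate");
	return 0;
}
```

With these numbers the band is [2499375, 2500625] kHz, so the 2.6 GHz request enables the scaling path.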
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | |||
1021 | struct kvm *kvm = vcpu->kvm; | 1026 | struct kvm *kvm = vcpu->kvm; |
1022 | u64 offset, ns, elapsed; | 1027 | u64 offset, ns, elapsed; |
1023 | unsigned long flags; | 1028 | unsigned long flags; |
1024 | s64 sdiff; | 1029 | s64 usdiff; |
1025 | 1030 | ||
1026 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); | 1031 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); |
1027 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); | 1032 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); |
1028 | ns = get_kernel_ns(); | 1033 | ns = get_kernel_ns(); |
1029 | elapsed = ns - kvm->arch.last_tsc_nsec; | 1034 | elapsed = ns - kvm->arch.last_tsc_nsec; |
1030 | sdiff = data - kvm->arch.last_tsc_write; | 1035 | |
1031 | if (sdiff < 0) | 1036 | /* n.b - signed multiplication and division required */ |
1032 | sdiff = -sdiff; | 1037 | usdiff = data - kvm->arch.last_tsc_write; |
1038 | #ifdef CONFIG_X86_64 | ||
1039 | usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; | ||
1040 | #else | ||
1041 | /* do_div() only does unsigned */ | ||
1042 | asm("idivl %2; xor %%edx, %%edx" | ||
1043 | : "=A"(usdiff) | ||
1044 | : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); | ||
1045 | #endif | ||
1046 | do_div(elapsed, 1000); | ||
1047 | usdiff -= elapsed; | ||
1048 | if (usdiff < 0) | ||
1049 | usdiff = -usdiff; | ||
1033 | 1050 | ||
1034 | /* | 1051 | /* |
1035 | * Special case: close write to TSC within 5 seconds of | 1052 | * Special case: a TSC write within a small delta (1 second) of virtual |
1036 | * another CPU is interpreted as an attempt to synchronize | 1053 | * cycle time against real time is interpreted as an attempt to |
1037 | * The 5 seconds is to accommodate host load / swapping as | 1054 | * synchronize the CPU. |
1038 | * well as any reset of TSC during the boot process. | 1055 | * |
1039 | * | 1056 | * For a reliable TSC, we can match TSC offsets, and for an unstable |
1040 | * In that case, for a reliable TSC, we can match TSC offsets, | 1057 | * TSC, we add elapsed time in this computation. We could let the |
1041 | * or make a best guest using elapsed value. | 1058 | * compensation code attempt to catch up if we fall behind, but |
1042 | */ | 1059 | * it's better to try to match offsets from the beginning. |
1043 | if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && | 1060 | */ |
1044 | elapsed < 5ULL * NSEC_PER_SEC) { | 1061 | if (usdiff < USEC_PER_SEC && |
1062 | vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { | ||
1045 | if (!check_tsc_unstable()) { | 1063 | if (!check_tsc_unstable()) { |
1046 | offset = kvm->arch.last_tsc_offset; | 1064 | offset = kvm->arch.cur_tsc_offset; |
1047 | pr_debug("kvm: matched tsc offset for %llu\n", data); | 1065 | pr_debug("kvm: matched tsc offset for %llu\n", data); |
1048 | } else { | 1066 | } else { |
1049 | u64 delta = nsec_to_cycles(vcpu, elapsed); | 1067 | u64 delta = nsec_to_cycles(vcpu, elapsed); |
1050 | offset += delta; | 1068 | data += delta; |
1069 | offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); | ||
1051 | pr_debug("kvm: adjusted tsc offset by %llu\n", delta); | 1070 | pr_debug("kvm: adjusted tsc offset by %llu\n", delta); |
1052 | } | 1071 | } |
1053 | ns = kvm->arch.last_tsc_nsec; | 1072 | } else { |
1073 | /* | ||
1074 | * We split periods of matched TSC writes into generations. | ||
1075 | * For each generation, we track the original measured | ||
1076 | * nanosecond time, offset, and write, so if TSCs are in | ||
1077 | * sync, we can match exact offset, and if not, we can match | ||
1078 | * exact software computation in compute_guest_tsc() | ||
1079 | * | ||
1080 | * These values are tracked in kvm->arch.cur_xxx variables. | ||
1081 | */ | ||
1082 | kvm->arch.cur_tsc_generation++; | ||
1083 | kvm->arch.cur_tsc_nsec = ns; | ||
1084 | kvm->arch.cur_tsc_write = data; | ||
1085 | kvm->arch.cur_tsc_offset = offset; | ||
1086 | pr_debug("kvm: new tsc generation %u, clock %llu\n", | ||
1087 | kvm->arch.cur_tsc_generation, data); | ||
1054 | } | 1088 | } |
1089 | |||
1090 | /* | ||
1091 | * We also track the most recent recorded KHZ, write and time to | ||
1092 | * allow the matching interval to be extended at each write. | ||
1093 | */ | ||
1055 | kvm->arch.last_tsc_nsec = ns; | 1094 | kvm->arch.last_tsc_nsec = ns; |
1056 | kvm->arch.last_tsc_write = data; | 1095 | kvm->arch.last_tsc_write = data; |
1057 | kvm->arch.last_tsc_offset = offset; | 1096 | kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; |
1058 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | ||
1059 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); | ||
1060 | 1097 | ||
1061 | /* Reset of TSC must disable overshoot protection below */ | 1098 | /* Reset of TSC must disable overshoot protection below */ |
1062 | vcpu->arch.hv_clock.tsc_timestamp = 0; | 1099 | vcpu->arch.hv_clock.tsc_timestamp = 0; |
1063 | vcpu->arch.last_tsc_write = data; | 1100 | vcpu->arch.last_guest_tsc = data; |
1064 | vcpu->arch.last_tsc_nsec = ns; | 1101 | |
1102 | /* Keep track of which generation this VCPU has synchronized to */ | ||
1103 | vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; | ||
1104 | vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; | ||
1105 | vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; | ||
1106 | |||
1107 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | ||
1108 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); | ||
1065 | } | 1109 | } |
1110 | |||
1066 | EXPORT_SYMBOL_GPL(kvm_write_tsc); | 1111 | EXPORT_SYMBOL_GPL(kvm_write_tsc); |
1067 | 1112 | ||
1068 | static int kvm_guest_time_update(struct kvm_vcpu *v) | 1113 | static int kvm_guest_time_update(struct kvm_vcpu *v) |
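The rewritten kvm_write_tsc() no longer compares raw cycle counts against a 5 second window; it converts the written-value delta into microseconds of virtual cycle time, subtracts the elapsed real time, and treats anything under one second as a synchronization attempt. A sketch of that conversion with assumed numbers (no idivl workaround is needed in 64-bit user space):

```c
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define USEC_PER_SEC 1000000LL

int main(void)
{
	uint32_t virtual_tsc_khz = 2500000;  /* hypothetical 2.5 GHz guest rate */

	/* Hypothetical second write: 1,000,000 cycles above the previous one,
	 * arriving 300 us (300,000 ns) of real time later. */
	int64_t cycle_diff = 1000000;
	int64_t elapsed_ns = 300000;

	/* Cycles -> microseconds of virtual time: cycles * 1000 / kHz. */
	int64_t usdiff = cycle_diff * 1000 / virtual_tsc_khz;   /* 400 us */

	/* Subtract elapsed real time (ns -> us) and take the magnitude. */
	usdiff -= elapsed_ns / 1000;                            /* 100 us */
	if (usdiff < 0)
		usdiff = -usdiff;

	printf("usdiff = %" PRId64 " us -> %s\n", usdiff,
	       usdiff < USEC_PER_SEC ? "treat as TSC sync attempt"
				     : "start a new TSC generation");
	return 0;
}
```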
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1078 | local_irq_save(flags); | 1123 | local_irq_save(flags); |
1079 | tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); | 1124 | tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); |
1080 | kernel_ns = get_kernel_ns(); | 1125 | kernel_ns = get_kernel_ns(); |
1081 | this_tsc_khz = vcpu_tsc_khz(v); | 1126 | this_tsc_khz = __get_cpu_var(cpu_tsc_khz); |
1082 | if (unlikely(this_tsc_khz == 0)) { | 1127 | if (unlikely(this_tsc_khz == 0)) { |
1083 | local_irq_restore(flags); | 1128 | local_irq_restore(flags); |
1084 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); | 1129 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); |
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1098 | if (vcpu->tsc_catchup) { | 1143 | if (vcpu->tsc_catchup) { |
1099 | u64 tsc = compute_guest_tsc(v, kernel_ns); | 1144 | u64 tsc = compute_guest_tsc(v, kernel_ns); |
1100 | if (tsc > tsc_timestamp) { | 1145 | if (tsc > tsc_timestamp) { |
1101 | kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); | 1146 | adjust_tsc_offset_guest(v, tsc - tsc_timestamp); |
1102 | tsc_timestamp = tsc; | 1147 | tsc_timestamp = tsc; |
1103 | } | 1148 | } |
1104 | } | 1149 | } |
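In catchup mode, kvm_guest_time_update() computes where the guest TSC should be from the elapsed nanoseconds at the virtual rate and, if the value derived from hardware lags behind, advances the TSC offset by the difference. A simplified sketch; the kernel's virtual_tsc_mult/shift pair is replaced here by a plain 128-bit multiply (compiler extension), which is what that pair approximates:

```c
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Simplified stand-in for pvclock_scale_delta() with virtual_tsc_mult/shift:
 * convert elapsed nanoseconds to cycles at a given kHz rate. */
static uint64_t ns_to_cycles(uint64_t ns, uint32_t khz)
{
	return (uint64_t)((__uint128_t)ns * khz / 1000000);
}

int main(void)
{
	uint32_t virtual_tsc_khz = 2600000; /* hypothetical guest rate: 2.6 GHz   */
	uint64_t base_tsc_write  = 0;       /* this_tsc_write at the last sync    */
	uint64_t elapsed_ns      = 1000000; /* 1 ms since that sync               */
	uint64_t observed_tsc    = 2500000; /* what the guest would read now      */

	/* Where the guest TSC should be if it really ran at the virtual rate. */
	uint64_t target = base_tsc_write + ns_to_cycles(elapsed_ns, virtual_tsc_khz);

	if (target > observed_tsc)
		printf("catchup: advance TSC offset by %" PRIu64 " cycles\n",
		       target - observed_tsc);
	else
		printf("guest TSC already at or ahead of target\n");
	return 0;
}
```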
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1130 | * observed by the guest and ensure the new system time is greater. | 1175 | * observed by the guest and ensure the new system time is greater. |
1131 | */ | 1176 | */ |
1132 | max_kernel_ns = 0; | 1177 | max_kernel_ns = 0; |
1133 | if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { | 1178 | if (vcpu->hv_clock.tsc_timestamp) { |
1134 | max_kernel_ns = vcpu->last_guest_tsc - | 1179 | max_kernel_ns = vcpu->last_guest_tsc - |
1135 | vcpu->hv_clock.tsc_timestamp; | 1180 | vcpu->hv_clock.tsc_timestamp; |
1136 | max_kernel_ns = pvclock_scale_delta(max_kernel_ns, | 1181 | max_kernel_ns = pvclock_scale_delta(max_kernel_ns, |
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1504 | case MSR_K7_HWCR: | 1549 | case MSR_K7_HWCR: |
1505 | data &= ~(u64)0x40; /* ignore flush filter disable */ | 1550 | data &= ~(u64)0x40; /* ignore flush filter disable */ |
1506 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ | 1551 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ |
1552 | data &= ~(u64)0x8; /* ignore TLB cache disable */ | ||
1507 | if (data != 0) { | 1553 | if (data != 0) { |
1508 | pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", | 1554 | pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", |
1509 | data); | 1555 | data); |
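The HWCR handler silently drops bits KVM chooses to ignore, now including bit 3 (TLB cache disable), and only complains about whatever is left. A compact sketch of the same masking:

```c
#include <stdio.h>
#include <stdint.h>

/* Handle a guest write to MSR_K7_HWCR the way the hunk above does:
 * silently ignore known-harmless bits, reject anything else. */
static int handle_hwcr_write(uint64_t data)
{
	data &= ~(uint64_t)0x40;   /* ignore flush filter disable   */
	data &= ~(uint64_t)0x100;  /* ignore IGNNE emulation enable */
	data &= ~(uint64_t)0x8;    /* ignore TLB cache disable      */

	if (data != 0) {
		printf("unimplemented HWCR wrmsr: 0x%llx\n", (unsigned long long)data);
		return 1;   /* the real handler would inject #GP */
	}
	return 0;
}

int main(void)
{
	printf("write 0x148 -> %s\n", handle_hwcr_write(0x148) ? "rejected" : "accepted");
	printf("write 0x2   -> %s\n", handle_hwcr_write(0x2)   ? "rejected" : "accepted");
	return 0;
}
```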
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1676 | */ | 1722 | */ |
1677 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); | 1723 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); |
1678 | break; | 1724 | break; |
1725 | case MSR_AMD64_OSVW_ID_LENGTH: | ||
1726 | if (!guest_cpuid_has_osvw(vcpu)) | ||
1727 | return 1; | ||
1728 | vcpu->arch.osvw.length = data; | ||
1729 | break; | ||
1730 | case MSR_AMD64_OSVW_STATUS: | ||
1731 | if (!guest_cpuid_has_osvw(vcpu)) | ||
1732 | return 1; | ||
1733 | vcpu->arch.osvw.status = data; | ||
1734 | break; | ||
1679 | default: | 1735 | default: |
1680 | if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) | 1736 | if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) |
1681 | return xen_hvm_config(vcpu, data); | 1737 | return xen_hvm_config(vcpu, data); |
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1960 | */ | 2016 | */ |
1961 | data = 0xbe702111; | 2017 | data = 0xbe702111; |
1962 | break; | 2018 | break; |
2019 | case MSR_AMD64_OSVW_ID_LENGTH: | ||
2020 | if (!guest_cpuid_has_osvw(vcpu)) | ||
2021 | return 1; | ||
2022 | data = vcpu->arch.osvw.length; | ||
2023 | break; | ||
2024 | case MSR_AMD64_OSVW_STATUS: | ||
2025 | if (!guest_cpuid_has_osvw(vcpu)) | ||
2026 | return 1; | ||
2027 | data = vcpu->arch.osvw.status; | ||
2028 | break; | ||
1963 | default: | 2029 | default: |
1964 | if (kvm_pmu_msr(vcpu, msr)) | 2030 | if (kvm_pmu_msr(vcpu, msr)) |
1965 | return kvm_pmu_get_msr(vcpu, msr, pdata); | 2031 | return kvm_pmu_get_msr(vcpu, msr, pdata); |
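Both directions of the new OSVW MSR handling are gated on the guest's CPUID, so a guest that does not advertise OSVW takes a #GP instead of silently reading or writing the registers. A toy sketch of the pattern; the structure and helpers below are stand-ins, not the kernel's types:

```c
#include <stdio.h>
#include <stdint.h>

/* Toy per-vcpu state mirroring vcpu->arch.osvw in the hunks above. */
struct osvw_state {
	int      cpuid_has_osvw;   /* would come from guest CPUID */
	uint64_t length;
};

/* Return 0 on success, 1 when the access should raise #GP. */
static int osvw_set_length(struct osvw_state *s, uint64_t data)
{
	if (!s->cpuid_has_osvw)
		return 1;
	s->length = data;
	return 0;
}

static int osvw_get_length(struct osvw_state *s, uint64_t *pdata)
{
	if (!s->cpuid_has_osvw)
		return 1;
	*pdata = s->length;
	return 0;
}

int main(void)
{
	struct osvw_state s = { .cpuid_has_osvw = 0 };
	uint64_t v;

	printf("write, OSVW hidden:  %s\n", osvw_set_length(&s, 4) ? "#GP" : "ok");
	s.cpuid_has_osvw = 1;
	printf("write, OSVW exposed: %s\n", osvw_set_length(&s, 4) ? "#GP" : "ok");
	printf("read back: %s, length=%llu\n",
	       osvw_get_length(&s, &v) ? "#GP" : "ok", (unsigned long long)v);
	return 0;
}
```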
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
2080 | case KVM_CAP_XSAVE: | 2146 | case KVM_CAP_XSAVE: |
2081 | case KVM_CAP_ASYNC_PF: | 2147 | case KVM_CAP_ASYNC_PF: |
2082 | case KVM_CAP_GET_TSC_KHZ: | 2148 | case KVM_CAP_GET_TSC_KHZ: |
2149 | case KVM_CAP_PCI_2_3: | ||
2083 | r = 1; | 2150 | r = 1; |
2084 | break; | 2151 | break; |
2085 | case KVM_CAP_COALESCED_MMIO: | 2152 | case KVM_CAP_COALESCED_MMIO: |
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
2214 | } | 2281 | } |
2215 | 2282 | ||
2216 | kvm_x86_ops->vcpu_load(vcpu, cpu); | 2283 | kvm_x86_ops->vcpu_load(vcpu, cpu); |
2217 | if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { | ||
2218 | /* Make sure TSC doesn't go backwards */ | ||
2219 | s64 tsc_delta; | ||
2220 | u64 tsc; | ||
2221 | 2284 | ||
2222 | tsc = kvm_x86_ops->read_l1_tsc(vcpu); | 2285 | /* Apply any externally detected TSC adjustments (due to suspend) */ |
2223 | tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : | 2286 | if (unlikely(vcpu->arch.tsc_offset_adjustment)) { |
2224 | tsc - vcpu->arch.last_guest_tsc; | 2287 | adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); |
2288 | vcpu->arch.tsc_offset_adjustment = 0; | ||
2289 | set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); | ||
2290 | } | ||
2225 | 2291 | ||
2292 | if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { | ||
2293 | s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : | ||
2294 | native_read_tsc() - vcpu->arch.last_host_tsc; | ||
2226 | if (tsc_delta < 0) | 2295 | if (tsc_delta < 0) |
2227 | mark_tsc_unstable("KVM discovered backwards TSC"); | 2296 | mark_tsc_unstable("KVM discovered backwards TSC"); |
2228 | if (check_tsc_unstable()) { | 2297 | if (check_tsc_unstable()) { |
2229 | kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); | 2298 | u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, |
2299 | vcpu->arch.last_guest_tsc); | ||
2300 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | ||
2230 | vcpu->arch.tsc_catchup = 1; | 2301 | vcpu->arch.tsc_catchup = 1; |
2231 | } | 2302 | } |
2232 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 2303 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | |||
2243 | { | 2314 | { |
2244 | kvm_x86_ops->vcpu_put(vcpu); | 2315 | kvm_x86_ops->vcpu_put(vcpu); |
2245 | kvm_put_guest_fpu(vcpu); | 2316 | kvm_put_guest_fpu(vcpu); |
2246 | vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); | 2317 | vcpu->arch.last_host_tsc = native_read_tsc(); |
2247 | } | 2318 | } |
2248 | 2319 | ||
2249 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | 2320 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, |
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
2785 | u32 user_tsc_khz; | 2856 | u32 user_tsc_khz; |
2786 | 2857 | ||
2787 | r = -EINVAL; | 2858 | r = -EINVAL; |
2788 | if (!kvm_has_tsc_control) | ||
2789 | break; | ||
2790 | |||
2791 | user_tsc_khz = (u32)arg; | 2859 | user_tsc_khz = (u32)arg; |
2792 | 2860 | ||
2793 | if (user_tsc_khz >= kvm_max_guest_tsc_khz) | 2861 | if (user_tsc_khz >= kvm_max_guest_tsc_khz) |
2794 | goto out; | 2862 | goto out; |
2795 | 2863 | ||
2796 | kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); | 2864 | if (user_tsc_khz == 0) |
2865 | user_tsc_khz = tsc_khz; | ||
2866 | |||
2867 | kvm_set_tsc_khz(vcpu, user_tsc_khz); | ||
2797 | 2868 | ||
2798 | r = 0; | 2869 | r = 0; |
2799 | goto out; | 2870 | goto out; |
2800 | } | 2871 | } |
2801 | case KVM_GET_TSC_KHZ: { | 2872 | case KVM_GET_TSC_KHZ: { |
2802 | r = -EIO; | 2873 | r = vcpu->arch.virtual_tsc_khz; |
2803 | if (check_tsc_unstable()) | ||
2804 | goto out; | ||
2805 | |||
2806 | r = vcpu_tsc_khz(vcpu); | ||
2807 | |||
2808 | goto out; | 2874 | goto out; |
2809 | } | 2875 | } |
2810 | default: | 2876 | default: |
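With the software catchup fallback, KVM_SET_TSC_KHZ no longer requires hardware TSC scaling and KVM_GET_TSC_KHZ simply reports the per-VCPU virtual rate instead of failing on an unstable TSC. A minimal user-space usage sketch; capability checks via KVM_CHECK_EXTENSION and most error handling are omitted for brevity:

```c
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) { perror("/dev/kvm"); return 1; }

	int vm = ioctl(kvm, KVM_CREATE_VM, 0);      /* type 0: default VM */
	if (vm < 0) { perror("KVM_CREATE_VM"); return 1; }

	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);   /* vcpu id 0 */
	if (vcpu < 0) { perror("KVM_CREATE_VCPU"); return 1; }

	/* Request a 2.2 GHz guest TSC; the kernel either scales in hardware
	 * or, with these patches, falls back to software catchup. */
	if (ioctl(vcpu, KVM_SET_TSC_KHZ, 2200000UL) < 0)
		perror("KVM_SET_TSC_KHZ");

	int khz = ioctl(vcpu, KVM_GET_TSC_KHZ, 0);
	printf("guest TSC rate: %d kHz\n", khz);
	return 0;
}
```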
@@ -2815,6 +2881,11 @@ out: | |||
2815 | return r; | 2881 | return r; |
2816 | } | 2882 | } |
2817 | 2883 | ||
2884 | int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) | ||
2885 | { | ||
2886 | return VM_FAULT_SIGBUS; | ||
2887 | } | ||
2888 | |||
2818 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | 2889 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) |
2819 | { | 2890 | { |
2820 | int ret; | 2891 | int ret; |
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm, | |||
2998 | unsigned long *dirty_bitmap, | 3069 | unsigned long *dirty_bitmap, |
2999 | unsigned long nr_dirty_pages) | 3070 | unsigned long nr_dirty_pages) |
3000 | { | 3071 | { |
3072 | spin_lock(&kvm->mmu_lock); | ||
3073 | |||
3001 | /* Not many dirty pages compared to # of shadow pages. */ | 3074 | /* Not many dirty pages compared to # of shadow pages. */ |
3002 | if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { | 3075 | if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { |
3003 | unsigned long gfn_offset; | 3076 | unsigned long gfn_offset; |
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm, | |||
3005 | for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { | 3078 | for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { |
3006 | unsigned long gfn = memslot->base_gfn + gfn_offset; | 3079 | unsigned long gfn = memslot->base_gfn + gfn_offset; |
3007 | 3080 | ||
3008 | spin_lock(&kvm->mmu_lock); | ||
3009 | kvm_mmu_rmap_write_protect(kvm, gfn, memslot); | 3081 | kvm_mmu_rmap_write_protect(kvm, gfn, memslot); |
3010 | spin_unlock(&kvm->mmu_lock); | ||
3011 | } | 3082 | } |
3012 | kvm_flush_remote_tlbs(kvm); | 3083 | kvm_flush_remote_tlbs(kvm); |
3013 | } else { | 3084 | } else |
3014 | spin_lock(&kvm->mmu_lock); | ||
3015 | kvm_mmu_slot_remove_write_access(kvm, memslot->id); | 3085 | kvm_mmu_slot_remove_write_access(kvm, memslot->id); |
3016 | spin_unlock(&kvm->mmu_lock); | 3086 | |
3017 | } | 3087 | spin_unlock(&kvm->mmu_lock); |
3018 | } | 3088 | } |
3019 | 3089 | ||
3020 | /* | 3090 | /* |
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3133 | r = -EEXIST; | 3203 | r = -EEXIST; |
3134 | if (kvm->arch.vpic) | 3204 | if (kvm->arch.vpic) |
3135 | goto create_irqchip_unlock; | 3205 | goto create_irqchip_unlock; |
3206 | r = -EINVAL; | ||
3207 | if (atomic_read(&kvm->online_vcpus)) | ||
3208 | goto create_irqchip_unlock; | ||
3136 | r = -ENOMEM; | 3209 | r = -ENOMEM; |
3137 | vpic = kvm_create_pic(kvm); | 3210 | vpic = kvm_create_pic(kvm); |
3138 | if (vpic) { | 3211 | if (vpic) { |
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) | |||
4063 | return res; | 4136 | return res; |
4064 | } | 4137 | } |
4065 | 4138 | ||
4139 | static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) | ||
4140 | { | ||
4141 | kvm_set_rflags(emul_to_vcpu(ctxt), val); | ||
4142 | } | ||
4143 | |||
4066 | static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) | 4144 | static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) |
4067 | { | 4145 | { |
4068 | return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); | 4146 | return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); |
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = { | |||
4244 | .set_idt = emulator_set_idt, | 4322 | .set_idt = emulator_set_idt, |
4245 | .get_cr = emulator_get_cr, | 4323 | .get_cr = emulator_get_cr, |
4246 | .set_cr = emulator_set_cr, | 4324 | .set_cr = emulator_set_cr, |
4325 | .set_rflags = emulator_set_rflags, | ||
4247 | .cpl = emulator_get_cpl, | 4326 | .cpl = emulator_get_cpl, |
4248 | .get_dr = emulator_get_dr, | 4327 | .get_dr = emulator_get_dr, |
4249 | .set_dr = emulator_set_dr, | 4328 | .set_dr = emulator_set_dr, |
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5288 | profile_hit(KVM_PROFILING, (void *)rip); | 5367 | profile_hit(KVM_PROFILING, (void *)rip); |
5289 | } | 5368 | } |
5290 | 5369 | ||
5370 | if (unlikely(vcpu->arch.tsc_always_catchup)) | ||
5371 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
5291 | 5372 | ||
5292 | kvm_lapic_sync_from_vapic(vcpu); | 5373 | kvm_lapic_sync_from_vapic(vcpu); |
5293 | 5374 | ||
@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, | |||
5587 | return 0; | 5668 | return 0; |
5588 | } | 5669 | } |
5589 | 5670 | ||
5590 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | 5671 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, |
5591 | bool has_error_code, u32 error_code) | 5672 | int reason, bool has_error_code, u32 error_code) |
5592 | { | 5673 | { |
5593 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 5674 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
5594 | int ret; | 5675 | int ret; |
5595 | 5676 | ||
5596 | init_emulate_ctxt(vcpu); | 5677 | init_emulate_ctxt(vcpu); |
5597 | 5678 | ||
5598 | ret = emulator_task_switch(ctxt, tss_selector, reason, | 5679 | ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, |
5599 | has_error_code, error_code); | 5680 | has_error_code, error_code); |
5600 | 5681 | ||
5601 | if (ret) | 5682 | if (ret) |
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage) | |||
5928 | struct kvm *kvm; | 6009 | struct kvm *kvm; |
5929 | struct kvm_vcpu *vcpu; | 6010 | struct kvm_vcpu *vcpu; |
5930 | int i; | 6011 | int i; |
6012 | int ret; | ||
6013 | u64 local_tsc; | ||
6014 | u64 max_tsc = 0; | ||
6015 | bool stable, backwards_tsc = false; | ||
5931 | 6016 | ||
5932 | kvm_shared_msr_cpu_online(); | 6017 | kvm_shared_msr_cpu_online(); |
5933 | list_for_each_entry(kvm, &vm_list, vm_list) | 6018 | ret = kvm_x86_ops->hardware_enable(garbage); |
5934 | kvm_for_each_vcpu(i, vcpu, kvm) | 6019 | if (ret != 0) |
5935 | if (vcpu->cpu == smp_processor_id()) | 6020 | return ret; |
5936 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | 6021 | |
5937 | return kvm_x86_ops->hardware_enable(garbage); | 6022 | local_tsc = native_read_tsc(); |
6023 | stable = !check_tsc_unstable(); | ||
6024 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
6025 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
6026 | if (!stable && vcpu->cpu == smp_processor_id()) | ||
6027 | set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); | ||
6028 | if (stable && vcpu->arch.last_host_tsc > local_tsc) { | ||
6029 | backwards_tsc = true; | ||
6030 | if (vcpu->arch.last_host_tsc > max_tsc) | ||
6031 | max_tsc = vcpu->arch.last_host_tsc; | ||
6032 | } | ||
6033 | } | ||
6034 | } | ||
6035 | |||
6036 | /* | ||
6037 | * Sometimes, even reliable TSCs go backwards. This happens on | ||
6038 | * platforms that reset TSC during suspend or hibernate actions, but | ||
6039 | * maintain synchronization. We must compensate. Fortunately, we can | ||
6040 | * detect that condition here, which happens early in CPU bringup, | ||
6041 | * before any KVM threads can be running. Unfortunately, we can't | ||
6042 | * bring the TSCs fully up to date with real time, as we aren't yet far | ||
6043 | * enough into CPU bringup that we know how much real time has actually | ||
6044 | * elapsed; our helper function, get_kernel_ns() will be using boot | ||
6045 | * variables that haven't been updated yet. | ||
6046 | * | ||
6047 | * So we simply find the maximum observed TSC above, then record the | ||
6048 | * adjustment to TSC in each VCPU. When the VCPU later gets loaded, | ||
6049 | * the adjustment will be applied. Note that we accumulate | ||
6050 | * adjustments, in case multiple suspend cycles happen before some VCPU | ||
6051 | * gets a chance to run again. In the event that no KVM threads get a | ||
6052 | * chance to run, we will miss the entire elapsed period, as we'll have | ||
6053 | * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may | ||
6054 | * lose cycle time. This isn't too big a deal, since the loss will be | ||
6055 | * uniform across all VCPUs (not to mention the scenario is extremely | ||
6056 | * unlikely). It is possible that a second hibernate recovery happens | ||
6057 | * much faster than a first, causing the observed TSC here to be | ||
6058 | * smaller; this would require additional padding adjustment, which is | ||
6059 | * why we set last_host_tsc to the local tsc observed here. | ||
6060 | * | ||
6061 | * N.B. - this code below runs only on platforms with reliable TSC, | ||
6062 | * as that is the only way backwards_tsc is set above. Also note | ||
6063 | * that this runs for ALL vcpus, which is not a bug; all VCPUs should | ||
6064 | * have the same delta_cyc adjustment applied if backwards_tsc | ||
6065 | * is detected. Note further, this adjustment is only done once, | ||
6066 | * as we reset last_host_tsc on all VCPUs to stop this from being | ||
6067 | * called multiple times (one for each physical CPU bringup). | ||
6068 | * | ||
6069 | * Platforms with unreliable TSCs don't have to deal with this, they | ||
6070 | * will be compensated by the logic in vcpu_load, which sets the TSC to | ||
6071 | * catchup mode. This will catchup all VCPUs to real time, but cannot | ||
6072 | * guarantee that they stay in perfect synchronization. | ||
6073 | */ | ||
6074 | if (backwards_tsc) { | ||
6075 | u64 delta_cyc = max_tsc - local_tsc; | ||
6076 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
6077 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
6078 | vcpu->arch.tsc_offset_adjustment += delta_cyc; | ||
6079 | vcpu->arch.last_host_tsc = local_tsc; | ||
6080 | } | ||
6081 | |||
6082 | /* | ||
6083 | * We have to disable TSC offset matching; if you were | ||
6084 | * booting a VM while issuing an S4 host suspend, | ||
6085 | * you may have a problem. Solving this issue is | ||
6086 | * left as an exercise to the reader. | ||
6087 | */ | ||
6088 | kvm->arch.last_tsc_nsec = 0; | ||
6089 | kvm->arch.last_tsc_write = 0; | ||
6090 | } | ||
6091 | |||
6092 | } | ||
6093 | return 0; | ||
5938 | } | 6094 | } |
5939 | 6095 | ||
5940 | void kvm_arch_hardware_disable(void *garbage) | 6096 | void kvm_arch_hardware_disable(void *garbage) |
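kvm_arch_hardware_enable() now notices when an otherwise stable host TSC went backwards across suspend, takes the largest TSC last observed by any VCPU as the reference, and queues a per-VCPU offset adjustment that vcpu_load applies later. A toy sketch of that bookkeeping, with a made-up structure standing in for struct kvm_vcpu:

```c
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Toy stand-in for the per-VCPU fields used by the hunk above. */
struct toy_vcpu {
	uint64_t last_host_tsc;          /* host TSC recorded at vcpu_put()      */
	int64_t  tsc_offset_adjustment;  /* applied (and cleared) at vcpu_load() */
};

int main(void)
{
	struct toy_vcpu vcpus[3] = {
		{ .last_host_tsc = 9000000 },
		{ .last_host_tsc = 9500000 },
		{ .last_host_tsc = 8800000 },
	};
	uint64_t local_tsc = 100000;     /* hypothetical TSC reset by suspend */
	uint64_t max_tsc = 0;
	int backwards_tsc = 0;

	for (int i = 0; i < 3; i++) {
		if (vcpus[i].last_host_tsc > local_tsc) {
			backwards_tsc = 1;
			if (vcpus[i].last_host_tsc > max_tsc)
				max_tsc = vcpus[i].last_host_tsc;
		}
	}

	if (backwards_tsc) {
		uint64_t delta_cyc = max_tsc - local_tsc;
		for (int i = 0; i < 3; i++) {
			/* Accumulate, in case several suspends happen before the VCPU runs. */
			vcpus[i].tsc_offset_adjustment += delta_cyc;
			vcpus[i].last_host_tsc = local_tsc;
		}
		printf("backwards TSC: queued +%" PRIu64 " cycles per VCPU\n", delta_cyc);
	}
	return 0;
}
```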
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn) | |||
5958 | kvm_x86_ops->check_processor_compatibility(rtn); | 6114 | kvm_x86_ops->check_processor_compatibility(rtn); |
5959 | } | 6115 | } |
5960 | 6116 | ||
6117 | bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) | ||
6118 | { | ||
6119 | return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); | ||
6120 | } | ||
6121 | |||
5961 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | 6122 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) |
5962 | { | 6123 | { |
5963 | struct page *page; | 6124 | struct page *page; |
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5980 | } | 6141 | } |
5981 | vcpu->arch.pio_data = page_address(page); | 6142 | vcpu->arch.pio_data = page_address(page); |
5982 | 6143 | ||
5983 | kvm_init_tsc_catchup(vcpu, max_tsc_khz); | 6144 | kvm_set_tsc_khz(vcpu, max_tsc_khz); |
5984 | 6145 | ||
5985 | r = kvm_mmu_create(vcpu); | 6146 | r = kvm_mmu_create(vcpu); |
5986 | if (r < 0) | 6147 | if (r < 0) |
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
6032 | free_page((unsigned long)vcpu->arch.pio_data); | 6193 | free_page((unsigned long)vcpu->arch.pio_data); |
6033 | } | 6194 | } |
6034 | 6195 | ||
6035 | int kvm_arch_init_vm(struct kvm *kvm) | 6196 | int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) |
6036 | { | 6197 | { |
6198 | if (type) | ||
6199 | return -EINVAL; | ||
6200 | |||
6037 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 6201 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
6038 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 6202 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
6039 | 6203 | ||
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
6093 | put_page(kvm->arch.ept_identity_pagetable); | 6257 | put_page(kvm->arch.ept_identity_pagetable); |
6094 | } | 6258 | } |
6095 | 6259 | ||
6260 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, | ||
6261 | struct kvm_memory_slot *dont) | ||
6262 | { | ||
6263 | int i; | ||
6264 | |||
6265 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | ||
6266 | if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { | ||
6267 | vfree(free->arch.lpage_info[i]); | ||
6268 | free->arch.lpage_info[i] = NULL; | ||
6269 | } | ||
6270 | } | ||
6271 | } | ||
6272 | |||
6273 | int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | ||
6274 | { | ||
6275 | int i; | ||
6276 | |||
6277 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | ||
6278 | unsigned long ugfn; | ||
6279 | int lpages; | ||
6280 | int level = i + 2; | ||
6281 | |||
6282 | lpages = gfn_to_index(slot->base_gfn + npages - 1, | ||
6283 | slot->base_gfn, level) + 1; | ||
6284 | |||
6285 | slot->arch.lpage_info[i] = | ||
6286 | vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); | ||
6287 | if (!slot->arch.lpage_info[i]) | ||
6288 | goto out_free; | ||
6289 | |||
6290 | if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) | ||
6291 | slot->arch.lpage_info[i][0].write_count = 1; | ||
6292 | if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) | ||
6293 | slot->arch.lpage_info[i][lpages - 1].write_count = 1; | ||
6294 | ugfn = slot->userspace_addr >> PAGE_SHIFT; | ||
6295 | /* | ||
6296 | * If the gfn and userspace address are not aligned wrt each | ||
6297 | * other, or if explicitly asked to, disable large page | ||
6298 | * support for this slot | ||
6299 | */ | ||
6300 | if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || | ||
6301 | !kvm_largepages_enabled()) { | ||
6302 | unsigned long j; | ||
6303 | |||
6304 | for (j = 0; j < lpages; ++j) | ||
6305 | slot->arch.lpage_info[i][j].write_count = 1; | ||
6306 | } | ||
6307 | } | ||
6308 | |||
6309 | return 0; | ||
6310 | |||
6311 | out_free: | ||
6312 | for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { | ||
6313 | vfree(slot->arch.lpage_info[i]); | ||
6314 | slot->arch.lpage_info[i] = NULL; | ||
6315 | } | ||
6316 | return -ENOMEM; | ||
6317 | } | ||
6318 | |||
6096 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 6319 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
6097 | struct kvm_memory_slot *memslot, | 6320 | struct kvm_memory_slot *memslot, |
6098 | struct kvm_memory_slot old, | 6321 | struct kvm_memory_slot old, |
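kvm_arch_create_memslot() sizes one lpage_info array per additional page size: the number of large-page chunks overlapping the slot, computed from the first and last gfn at that level, with partially covered boundary chunks marked as never usable as a large page. A user-space sketch of the index math, assuming the x86 layout of 512 base pages per step between levels:

```c
#include <stdio.h>

/* x86 assumption: each level is 512x the previous (4K -> 2M -> 1G),
 * so a level-N chunk covers 512^(N-1) guest frames. */
#define KVM_HPAGE_GFN_SHIFT(level) (((level) - 1) * 9)

/* Index of gfn's large-page chunk relative to base_gfn's chunk. */
static unsigned long gfn_to_index(unsigned long gfn, unsigned long base_gfn,
				  int level)
{
	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	unsigned long base_gfn = 0x100;   /* hypothetical slot: not 2 MiB aligned */
	unsigned long npages   = 0x800;   /* 8 MiB of guest memory                */

	for (int level = 2; level <= 3; level++) {
		unsigned long lpages =
			gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1;
		unsigned long pages_per = 1UL << KVM_HPAGE_GFN_SHIFT(level);

		printf("level %d: %lu lpage_info entries", level, lpages);
		/* Chunks that start or end off-alignment can never be a large page. */
		if (base_gfn & (pages_per - 1))
			printf(", first chunk disallowed as a large page");
		if ((base_gfn + npages) & (pages_per - 1))
			printf(", last chunk disallowed as a large page");
		printf("\n");
	}
	return 0;
}
```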