author     Linus Torvalds <torvalds@linux-foundation.org>   2014-06-04 11:47:12 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-06-04 11:47:12 -0400
commit     b05d59dfceaea72565b1648af929b037b0f96d7f
tree       bbe92714be468ed8783bce6ac2c305c0aedf8eb5 /arch/x86
parent     daf342af2f7856fd2f5c66b9fb39a8f24986ca53
parent     820b3fcdeb80d30410f4427d2cbf9161c35fdeef
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm into next
Pull KVM updates from Paolo Bonzini:
"At over 200 commits, covering almost all supported architectures, this
was a pretty active cycle for KVM. Changes include:
- a lot of s390 changes: optimizations, support for migration, GDB
support and more
- ARM changes are pretty small: support for the PSCI 0.2 hypercall
interface on both the guest and the host (the latter acked by
Catalin)
- initial POWER8 and little-endian host support
- support for running u-boot on embedded POWER targets
- pretty large changes to MIPS too, completing the userspace
interface and improving the handling of virtualized timer hardware
- for x86, a larger set of changes is scheduled for 3.17. Still, we
have a few emulator bugfixes and support for running nested
fully-virtualized Xen guests (para-virtualized Xen guests have
always worked). And some optimizations too.
The only missing architecture here is ia64. It's not a coincidence
that support for KVM on ia64 is scheduled for removal in 3.17"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (203 commits)
KVM: add missing cleanup_srcu_struct
KVM: PPC: Book3S PR: Rework SLB switching code
KVM: PPC: Book3S PR: Use SLB entry 0
KVM: PPC: Book3S HV: Fix machine check delivery to guest
KVM: PPC: Book3S HV: Work around POWER8 performance monitor bugs
KVM: PPC: Book3S HV: Make sure we don't miss dirty pages
KVM: PPC: Book3S HV: Fix dirty map for hugepages
KVM: PPC: Book3S HV: Put huge-page HPTEs in rmap chain for base address
KVM: PPC: Book3S HV: Fix check for running inside guest in global_invalidates()
KVM: PPC: Book3S: Move KVM_REG_PPC_WORT to an unused register number
KVM: PPC: Book3S: Add ONE_REG register names that were missed
KVM: PPC: Add CAP to indicate hcall fixes
KVM: PPC: MPIC: Reset IRQ source private members
KVM: PPC: Graciously fail broken LE hypercalls
PPC: ePAPR: Fix hypercall on LE guest
KVM: PPC: BOOK3S: Remove open coded make_dsisr in alignment handler
KVM: PPC: BOOK3S: Always use the saved DAR value
PPC: KVM: Make NX bit available with magic page
KVM: PPC: Disable NX for old magic page using guests
KVM: PPC: BOOK3S: HV: Add mixed page-size support for guest
...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h |   1
-rw-r--r--  arch/x86/include/asm/kvm_host.h    |   7
-rw-r--r--  arch/x86/include/asm/traps.h       |   5
-rw-r--r--  arch/x86/kernel/kvm.c              |   2
-rw-r--r--  arch/x86/kvm/cpuid.c               |  11
-rw-r--r--  arch/x86/kvm/cpuid.h               |   7
-rw-r--r--  arch/x86/kvm/emulate.c             |  93
-rw-r--r--  arch/x86/kvm/irq.c                 |   1
-rw-r--r--  arch/x86/kvm/lapic.c               |  62
-rw-r--r--  arch/x86/kvm/mmu.c                 |  84
-rw-r--r--  arch/x86/kvm/mmu.h                 |  33
-rw-r--r--  arch/x86/kvm/paging_tmpl.h         |   7
-rw-r--r--  arch/x86/kvm/pmu.c                 |   7
-rw-r--r--  arch/x86/kvm/svm.c                 |  63
-rw-r--r--  arch/x86/kvm/trace.h               |  20
-rw-r--r--  arch/x86/kvm/vmx.c                 | 333
-rw-r--r--  arch/x86/kvm/x86.c                 |  61
17 files changed, 505 insertions, 292 deletions
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 24ec1216596e..a04fe4eb237d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -189,7 +189,6 @@ struct x86_emulate_ops {
     void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
     ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
     int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
-    void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val);
     int (*cpl)(struct x86_emulate_ctxt *ctxt);
     int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
     int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7de069afb382..49314155b66c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -50,11 +50,7 @@
               | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
               | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 
-#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
-#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
-#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
-#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
-                  0xFFFFFF0000000000ULL)
+#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
 #define CR4_RESERVED_BITS \
     (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
               | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -134,7 +130,6 @@ enum kvm_reg_ex {
     VCPU_EXREG_PDPTR = NR_VCPU_REGS,
     VCPU_EXREG_CR3,
     VCPU_EXREG_RFLAGS,
-    VCPU_EXREG_CPL,
     VCPU_EXREG_SEGMENTS,
 };
 
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 58d66fe06b61..8ba18842c48e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,6 +74,11 @@ dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
 #ifdef CONFIG_TRACING
 dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
+#else
+static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
+{
+    do_page_fault(regs, error);
+}
 #endif
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
 dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 0331cb389d68..7e97371387fd 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -259,7 +259,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 
     switch (kvm_read_and_reset_pf_reason()) {
     default:
-        do_page_fault(regs, error_code);
+        trace_do_page_fault(regs, error_code);
         break;
     case KVM_PV_REASON_PAGE_NOT_PRESENT:
         /* page is swapped out by the host. */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f47a104a749c..38a0afe83c6b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -283,6 +283,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
         0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
     /* cpuid 1.ecx */
     const u32 kvm_supported_word4_x86_features =
+        /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
+         * but *not* advertised to guests via CPUID ! */
         F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
         0 /* DS-CPL, VMX, SMX, EST */ |
         0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
@@ -495,6 +497,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
         entry->ecx &= kvm_supported_word6_x86_features;
         cpuid_mask(&entry->ecx, 6);
         break;
+    case 0x80000007: /* Advanced power management */
+        /* invariant TSC is CPUID.80000007H:EDX[8] */
+        entry->edx &= (1 << 8);
+        /* mask against host */
+        entry->edx &= boot_cpu_data.x86_power;
+        entry->eax = entry->ebx = entry->ecx = 0;
+        break;
     case 0x80000008: {
         unsigned g_phys_as = (entry->eax >> 16) & 0xff;
         unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
@@ -525,7 +534,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
     case 3: /* Processor serial number */
     case 5: /* MONITOR/MWAIT */
     case 6: /* Thermal management */
-    case 0x80000007: /* Advanced power management */
     case 0xC0000002:
     case 0xC0000003:
     case 0xC0000004:
@@ -726,6 +734,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 not_found:
     return 36;
 }
+EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
 
 /*
  * If no match is found, check whether we exceed the vCPU's limit
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index eeecbed26ac7..f9087315e0cd 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -88,4 +88,11 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
     return best && (best->ecx & bit(X86_FEATURE_X2APIC));
 }
 
+static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
+{
+    struct kvm_cpuid_entry2 *best;
+
+    best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+    return best && (best->edx & bit(X86_FEATURE_GBPAGES));
+}
 #endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 205b17eed93c..e4e833d3d7d7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -161,6 +161,7 @@
 #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
 #define NoWrite ((u64)1 << 45) /* No writeback */
 #define SrcWrite ((u64)1 << 46) /* Write back src operand */
+#define NoMod ((u64)1 << 47) /* Mod field is ignored */
 
 #define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
 
@@ -1077,7 +1078,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
     ctxt->modrm_rm |= (ctxt->modrm & 0x07);
     ctxt->modrm_seg = VCPU_SREG_DS;
 
-    if (ctxt->modrm_mod == 3) {
+    if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
         op->type = OP_REG;
         op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
         op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
@@ -1324,7 +1325,8 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
         rc->end = n * size;
     }
 
-    if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
+    if (ctxt->rep_prefix && (ctxt->d & String) &&
+        !(ctxt->eflags & EFLG_DF)) {
         ctxt->dst.data = rc->data + rc->pos;
         ctxt->dst.type = OP_MEM_STR;
         ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1409,11 +1411,11 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 }
 
 /* Does not support long mode */
-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-                   u16 selector, int seg)
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+                     u16 selector, int seg, u8 cpl, bool in_task_switch)
 {
     struct desc_struct seg_desc, old_desc;
-    u8 dpl, rpl, cpl;
+    u8 dpl, rpl;
     unsigned err_vec = GP_VECTOR;
     u32 err_code = 0;
     bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
@@ -1441,7 +1443,6 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
     }
 
     rpl = selector & 3;
-    cpl = ctxt->ops->cpl(ctxt);
 
     /* NULL selector is not valid for TR, CS and SS (except for long mode) */
     if ((seg == VCPU_SREG_CS
@@ -1486,6 +1487,9 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
             goto exception;
         break;
     case VCPU_SREG_CS:
+        if (in_task_switch && rpl != dpl)
+            goto exception;
+
         if (!(seg_desc.type & 8))
             goto exception;
 
@@ -1543,6 +1547,13 @@ exception:
     return X86EMUL_PROPAGATE_FAULT;
 }
 
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+                   u16 selector, int seg)
+{
+    u8 cpl = ctxt->ops->cpl(ctxt);
+    return __load_segment_descriptor(ctxt, selector, seg, cpl, false);
+}
+
 static void write_register_operand(struct operand *op)
 {
     /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
@@ -2404,6 +2415,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
               struct tss_segment_16 *tss)
 {
     int ret;
+    u8 cpl;
 
     ctxt->_eip = tss->ip;
     ctxt->eflags = tss->flag | 2;
@@ -2426,23 +2438,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
     set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
     set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
 
+    cpl = tss->cs & 3;
+
     /*
      * Now load segment descriptors. If fault happens at this stage
      * it is handled in a context of new task
      */
-    ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
+    ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
+    ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
+    ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
+    ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
+    ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
 
@@ -2496,7 +2510,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
                 struct tss_segment_32 *tss)
 {
-    tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
+    /* CR3 and ldt selector are not saved intentionally */
     tss->eip = ctxt->_eip;
     tss->eflags = ctxt->eflags;
     tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
@@ -2514,13 +2528,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
     tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
     tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
     tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
-    tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
 }
 
 static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
                  struct tss_segment_32 *tss)
 {
     int ret;
+    u8 cpl;
 
     if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
         return emulate_gp(ctxt, 0);
@@ -2539,7 +2553,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 
     /*
      * SDM says that segment selectors are loaded before segment
-     * descriptors
+     * descriptors.  This is important because CPL checks will
+     * use CS.RPL.
      */
     set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
     set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
@@ -2553,43 +2568,38 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
      * If we're switching between Protected Mode and VM86, we need to make
      * sure to update the mode before loading the segment descriptors so
      * that the selectors are interpreted correctly.
-     *
-     * Need to get rflags to the vcpu struct immediately because it
-     * influences the CPL which is checked at least when loading the segment
-     * descriptors and when pushing an error code to the new kernel stack.
-     *
-     * TODO Introduce a separate ctxt->ops->set_cpl callback
      */
-    if (ctxt->eflags & X86_EFLAGS_VM)
+    if (ctxt->eflags & X86_EFLAGS_VM) {
         ctxt->mode = X86EMUL_MODE_VM86;
-    else
+        cpl = 3;
+    } else {
         ctxt->mode = X86EMUL_MODE_PROT32;
-
-    ctxt->ops->set_rflags(ctxt, ctxt->eflags);
+        cpl = tss->cs & 3;
+    }
 
     /*
      * Now load segment descriptors. If fault happenes at this stage
      * it is handled in a context of new task
      */
-    ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+    ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true);
    if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
+    ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
+    ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
+    ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
+    ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
+    ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
-    ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
+    ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true);
     if (ret != X86EMUL_CONTINUE)
         return ret;
 
@@ -2604,6 +2614,8 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
     struct tss_segment_32 tss_seg;
     int ret;
     u32 new_tss_base = get_desc_base(new_desc);
+    u32 eip_offset = offsetof(struct tss_segment_32, eip);
+    u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
 
     ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
                 &ctxt->exception);
@@ -2613,8 +2625,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 
     save_state_to_tss32(ctxt, &tss_seg);
 
-    ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
-                 &ctxt->exception);
+    /* Only GP registers and segment selectors are saved */
+    ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+                 ldt_sel_offset - eip_offset, &ctxt->exception);
     if (ret != X86EMUL_CONTINUE)
         /* FIXME: need to provide precise fault address */
         return ret;
@@ -3386,10 +3399,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
         ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
         if (efer & EFER_LMA)
             rsvd = CR3_L_MODE_RESERVED_BITS;
-        else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
-            rsvd = CR3_PAE_RESERVED_BITS;
-        else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
-            rsvd = CR3_NONPAE_RESERVED_BITS;
 
         if (new_val & rsvd)
             return emulate_gp(ctxt, 0);
@@ -3869,10 +3878,12 @@ static const struct opcode twobyte_table[256] = {
     N, N, N, N, N, N, N, N,
     D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM),
     /* 0x20 - 0x2F */
-    DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
-    DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
-    IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
-    IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
+    DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+    DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+    IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+        check_cr_write),
+    IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+        check_dr_write),
     N, N, N, N,
     GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
     GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 484bc874688b..bd0da433e6d7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -113,6 +113,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 
     return kvm_get_apic_interrupt(v); /* APIC */
 }
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9736529ade08..006911858174 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
 
 static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
 {
+    /* Note that we never get here with APIC virtualization enabled. */
+
     if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
         ++apic->isr_count;
     BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
     apic->highest_isr_cache = vec;
 }
 
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+    int result;
+
+    /*
+     * Note that isr_count is always 1, and highest_isr_cache
+     * is always -1, with APIC virtualization enabled.
+     */
+    if (!apic->isr_count)
+        return -1;
+    if (likely(apic->highest_isr_cache != -1))
+        return apic->highest_isr_cache;
+
+    result = find_highest_vector(apic->regs + APIC_ISR);
+    ASSERT(result == -1 || result >= 16);
+
+    return result;
+}
+
 static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 {
-    if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+    struct kvm_vcpu *vcpu;
+    if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+        return;
+
+    vcpu = apic->vcpu;
+
+    /*
+     * We do get here for APIC virtualization enabled if the guest
+     * uses the Hyper-V APIC enlightenment.  In this case we may need
+     * to trigger a new interrupt delivery by writing the SVI field;
+     * on the other hand isr_count and highest_isr_cache are unused
+     * and must be left alone.
+     */
+    if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+        kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+                           apic_find_highest_isr(apic));
+    else {
         --apic->isr_count;
         BUG_ON(apic->isr_count < 0);
         apic->highest_isr_cache = -1;
+    }
 }
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
@@ -456,22 +494,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
     __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
 }
 
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
-{
-    int result;
-
-    /* Note that isr_count is always 1 with vid enabled */
-    if (!apic->isr_count)
-        return -1;
-    if (likely(apic->highest_isr_cache != -1))
-        return apic->highest_isr_cache;
-
-    result = find_highest_vector(apic->regs + APIC_ISR);
-    ASSERT(result == -1 || result >= 16);
-
-    return result;
-}
-
 void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
 {
     struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1605,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
     int vector = kvm_apic_has_interrupt(vcpu);
     struct kvm_lapic *apic = vcpu->arch.apic;
 
+    /* Note that we never get here with APIC virtualization enabled. */
+
     if (vector == -1)
         return -1;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 813d31038b93..931467881da7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,6 +22,7 @@
 #include "mmu.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
+#include "cpuid.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -595,7 +596,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
      * we always atomicly update it, see the comments in
      * spte_has_volatile_bits().
      */
-    if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+    if (spte_is_locklessly_modifiable(old_spte) &&
+          !is_writable_pte(new_spte))
         ret = true;
 
     if (!shadow_accessed_mask)
@@ -1176,8 +1178,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 
 /*
  * Write-protect on the specified @sptep, @pt_protect indicates whether
- * spte writ-protection is caused by protecting shadow page table.
- * @flush indicates whether tlb need be flushed.
+ * spte write-protection is caused by protecting shadow page table.
  *
 * Note: write protection is difference between drity logging and spte
 * protection:
@@ -1186,10 +1187,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 * - for spte protection, the spte can be writable only after unsync-ing
 *   shadow page.
 *
- * Return true if the spte is dropped.
+ * Return true if tlb need be flushed.
 */
-static bool
-spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
 {
     u64 spte = *sptep;
 
@@ -1199,17 +1199,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 
     rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
 
-    if (__drop_large_spte(kvm, sptep)) {
-        *flush |= true;
-        return true;
-    }
-
     if (pt_protect)
         spte &= ~SPTE_MMU_WRITEABLE;
     spte = spte & ~PT_WRITABLE_MASK;
 
-    *flush |= mmu_spte_update(sptep, spte);
-    return false;
+    return mmu_spte_update(sptep, spte);
 }
 
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1221,11 +1215,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 
     for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
         BUG_ON(!(*sptep & PT_PRESENT_MASK));
-        if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
-            sptep = rmap_get_first(*rmapp, &iter);
-            continue;
-        }
 
+        flush |= spte_write_protect(kvm, sptep, pt_protect);
         sptep = rmap_get_next(&iter);
     }
 
@@ -2802,9 +2793,9 @@ static bool page_fault_can_be_fast(u32 error_code)
 }
 
 static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+            u64 *sptep, u64 spte)
 {
-    struct kvm_mmu_page *sp = page_header(__pa(sptep));
     gfn_t gfn;
 
     WARN_ON(!sp->role.direct);
@@ -2830,6 +2821,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
                 u32 error_code)
 {
     struct kvm_shadow_walk_iterator iterator;
+    struct kvm_mmu_page *sp;
     bool ret = false;
     u64 spte = 0ull;
 
@@ -2853,7 +2845,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
         goto exit;
     }
 
-    if (!is_last_spte(spte, level))
+    sp = page_header(__pa(iterator.sptep));
+    if (!is_last_spte(spte, sp->role.level))
         goto exit;
 
     /*
@@ -2875,11 +2868,24 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
         goto exit;
 
     /*
+     * Do not fix write-permission on the large spte since we only dirty
+     * the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
+     * that means other pages are missed if its slot is dirty-logged.
+     *
+     * Instead, we let the slow page fault path create a normal spte to
+     * fix the access.
+     *
+     * See the comments in kvm_arch_commit_memory_region().
+     */
+    if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+        goto exit;
+
+    /*
      * Currently, fast page fault only works for direct mapping since
     * the gfn is not stable for indirect shadow page.
     * See Documentation/virtual/kvm/locking.txt to get more detail.
     */
-    ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+    ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
 exit:
     trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
                   spte, ret);
@@ -3511,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 {
     int maxphyaddr = cpuid_maxphyaddr(vcpu);
     u64 exb_bit_rsvd = 0;
+    u64 gbpages_bit_rsvd = 0;
 
     context->bad_mt_xwr = 0;
 
     if (!context->nx)
         exb_bit_rsvd = rsvd_bits(63, 63);
+    if (!guest_cpuid_has_gbpages(vcpu))
+        gbpages_bit_rsvd = rsvd_bits(7, 7);
     switch (context->root_level) {
     case PT32_ROOT_LEVEL:
         /* no rsvd bits for 2 level 4K page table entries */
@@ -3538,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
     case PT32E_ROOT_LEVEL:
         context->rsvd_bits_mask[0][2] =
             rsvd_bits(maxphyaddr, 63) |
-            rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
+            rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
         context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
             rsvd_bits(maxphyaddr, 62); /* PDE */
         context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
@@ -3550,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
         break;
     case PT64_ROOT_LEVEL:
         context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
-            rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+            rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
         context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
-            rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+            gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
         context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
             rsvd_bits(maxphyaddr, 51);
         context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
             rsvd_bits(maxphyaddr, 51);
         context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
         context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
-            rsvd_bits(maxphyaddr, 51) |
+            gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
             rsvd_bits(13, 29);
         context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
             rsvd_bits(maxphyaddr, 51) |
@@ -4304,15 +4313,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
             if (*rmapp)
                 __rmap_write_protect(kvm, rmapp, false);
 
-            if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-                kvm_flush_remote_tlbs(kvm);
+            if (need_resched() || spin_needbreak(&kvm->mmu_lock))
                 cond_resched_lock(&kvm->mmu_lock);
-            }
         }
     }
 
-    kvm_flush_remote_tlbs(kvm);
     spin_unlock(&kvm->mmu_lock);
+
+    /*
+     * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+     * which do tlb flush out of mmu-lock should be serialized by
+     * kvm->slots_lock otherwise tlb flush would be missed.
+     */
+    lockdep_assert_held(&kvm->slots_lock);
+
+    /*
+     * We can flush all the TLBs out of the mmu lock without TLB
+     * corruption since we just change the spte from writable to
+     * readonly so that we only need to care the case of changing
+     * spte from present to present (changing the spte from present
+     * to nonpresent will flush all the TLBs immediately), in other
+     * words, the only case we care is mmu_spte_update() where we
+     * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
+     * instead of PT_WRITABLE_MASK, that means it does not depend
+     * on PT_WRITABLE_MASK anymore.
+     */
+    kvm_flush_remote_tlbs(kvm);
 }
 
 #define BATCH_ZAP_PAGES 10
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3842e70bdb7c..b982112d2ca5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -104,6 +104,39 @@ static inline int is_present_gpte(unsigned long pte)
     return pte & PT_PRESENT_MASK;
 }
 
+/*
+ * Currently, we have two sorts of write-protection, a) the first one
+ * write-protects guest page to sync the guest modification, b) another one is
+ * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
+ * between these two sorts are:
+ * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 2) the first case requires flushing tlb immediately avoiding corrupting
+ *    shadow page table between all vcpus so it should be in the protection of
+ *    mmu-lock. And the another case does not need to flush tlb until returning
+ *    the dirty bitmap to userspace since it only write-protects the page
+ *    logged in the bitmap, that means the page in the dirty bitmap is not
+ *    missed, so it can flush tlb out of mmu-lock.
+ *
+ * So, there is the problem: the first case can meet the corrupted tlb caused
+ * by another case which write-protects pages but without flush tlb
+ * immediately. In order to making the first case be aware this problem we let
+ * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
+ * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ *
+ * Anyway, whenever a spte is updated (only permission and status bits are
+ * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * readonly, if that happens, we need to flush tlb. Fortunately,
+ * mmu_spte_update() has already handled it perfectly.
+ *
+ * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * - if we want to see if it has writable tlb entry or if the spte can be
+ *   writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ *   case, otherwise
+ * - if we fix page fault on the spte or do write-protection by dirty logging,
+ *   check PT_WRITABLE_MASK.
+ *
+ * TODO: introduce APIs to split these two cases.
+ */
 static inline int is_writable_pte(unsigned long pte)
 {
     return pte & PT_WRITABLE_MASK;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 123efd3ec29f..410776528265 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -913,8 +913,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
 * used by guest then tlbs are not flushed, so guest is allowed to access the
 * freed pages.
- * We set tlbs_dirty to let the notifier know this change and delay the flush
- * until such a case actually happens.
+ * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
 */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
@@ -943,7 +942,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
             return -EINVAL;
 
         if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
-            vcpu->kvm->tlbs_dirty = true;
+            vcpu->kvm->tlbs_dirty++;
             continue;
         }
 
@@ -958,7 +957,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
         if (gfn != sp->gfns[i]) {
             drop_spte(vcpu->kvm, &sp->spt[i]);
-            vcpu->kvm->tlbs_dirty = true;
+            vcpu->kvm->tlbs_dirty++;
             continue;
         }
 
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 5c4f63151b4d..cbecaa90399c 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -108,7 +108,10 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
 {
     struct kvm_pmc *pmc = perf_event->overflow_handler_context;
     struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-    __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+    if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+        __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+        kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+    }
 }
 
 static void kvm_perf_overflow_intr(struct perf_event *perf_event,
@@ -117,7 +120,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
     struct kvm_pmc *pmc = perf_event->overflow_handler_context;
     struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
     if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
-        kvm_perf_overflow(perf_event, data, regs);
+        __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
         kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
         /*
          * Inject PMI. If vcpu was in a guest mode during NMI PMI
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7f4f9c2badae..ec8366c5cfea 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1338,21 +1338,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
         wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 
-static void svm_update_cpl(struct kvm_vcpu *vcpu)
-{
-    struct vcpu_svm *svm = to_svm(vcpu);
-    int cpl;
-
-    if (!is_protmode(vcpu))
-        cpl = 0;
-    else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
-        cpl = 3;
-    else
-        cpl = svm->vmcb->save.cs.selector & 0x3;
-
-    svm->vmcb->save.cpl = cpl;
-}
-
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
     return to_svm(vcpu)->vmcb->save.rflags;
@@ -1360,11 +1345,12 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 
 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-    unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
-
+    /*
+     * Any change of EFLAGS.VM is accompained by a reload of SS
+     * (caused by either a task switch or an inter-privilege IRET),
+     * so we do not need to update the CPL here.
+     */
     to_svm(vcpu)->vmcb->save.rflags = rflags;
-    if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
-        svm_update_cpl(vcpu);
 }
 
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
@@ -1631,8 +1617,15 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
         s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
         s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
     }
-    if (seg == VCPU_SREG_CS)
-        svm_update_cpl(vcpu);
+
+    /*
+     * This is always accurate, except if SYSRET returned to a segment
+     * with SS.DPL != 3.  Intel does not have this quirk, and always
+     * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+     * would entail passing the CPL to userspace and back.
+     */
+    if (seg == VCPU_SREG_SS)
+        svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
 
     mark_dirty(svm->vmcb, VMCB_SEG);
 }
@@ -2770,12 +2763,6 @@ static int xsetbv_interception(struct vcpu_svm *svm)
     return 1;
 }
 
-static int invalid_op_interception(struct vcpu_svm *svm)
-{
-    kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-    return 1;
-}
-
 static int task_switch_interception(struct vcpu_svm *svm)
 {
     u16 tss_selector;
@@ -3287,6 +3274,24 @@ static int pause_interception(struct vcpu_svm *svm)
     return 1;
 }
 
+static int nop_interception(struct vcpu_svm *svm)
+{
+    skip_emulated_instruction(&(svm->vcpu));
+    return 1;
+}
+
+static int monitor_interception(struct vcpu_svm *svm)
+{
+    printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+    return nop_interception(svm);
+}
+
+static int mwait_interception(struct vcpu_svm *svm)
+{
+    printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+    return nop_interception(svm);
+}
+
 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
     [SVM_EXIT_READ_CR0] = cr_interception,
     [SVM_EXIT_READ_CR3] = cr_interception,
@@ -3344,8 +3349,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
     [SVM_EXIT_CLGI] = clgi_interception,
     [SVM_EXIT_SKINIT] = skinit_interception,
     [SVM_EXIT_WBINVD] = emulate_on_interception,
-    [SVM_EXIT_MONITOR] = invalid_op_interception,
-    [SVM_EXIT_MWAIT] = invalid_op_interception,
+    [SVM_EXIT_MONITOR] = monitor_interception,
+    [SVM_EXIT_MWAIT] = mwait_interception,
     [SVM_EXIT_XSETBV] = xsetbv_interception,
     [SVM_EXIT_NPF] = pf_interception,
 };
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 545245d7cc63..33574c95220d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -91,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall, | |||
91 | /* | 91 | /* |
92 | * Tracepoint for PIO. | 92 | * Tracepoint for PIO. |
93 | */ | 93 | */ |
94 | |||
95 | #define KVM_PIO_IN 0 | ||
96 | #define KVM_PIO_OUT 1 | ||
97 | |||
94 | TRACE_EVENT(kvm_pio, | 98 | TRACE_EVENT(kvm_pio, |
95 | TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, | 99 | TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, |
96 | unsigned int count), | 100 | unsigned int count, void *data), |
97 | TP_ARGS(rw, port, size, count), | 101 | TP_ARGS(rw, port, size, count, data), |
98 | 102 | ||
99 | TP_STRUCT__entry( | 103 | TP_STRUCT__entry( |
100 | __field( unsigned int, rw ) | 104 | __field( unsigned int, rw ) |
101 | __field( unsigned int, port ) | 105 | __field( unsigned int, port ) |
102 | __field( unsigned int, size ) | 106 | __field( unsigned int, size ) |
103 | __field( unsigned int, count ) | 107 | __field( unsigned int, count ) |
108 | __field( unsigned int, val ) | ||
104 | ), | 109 | ), |
105 | 110 | ||
106 | TP_fast_assign( | 111 | TP_fast_assign( |
@@ -108,11 +113,18 @@ TRACE_EVENT(kvm_pio, | |||
108 | __entry->port = port; | 113 | __entry->port = port; |
109 | __entry->size = size; | 114 | __entry->size = size; |
110 | __entry->count = count; | 115 | __entry->count = count; |
116 | if (size == 1) | ||
117 | __entry->val = *(unsigned char *)data; | ||
118 | else if (size == 2) | ||
119 | __entry->val = *(unsigned short *)data; | ||
120 | else | ||
121 | __entry->val = *(unsigned int *)data; | ||
111 | ), | 122 | ), |
112 | 123 | ||
113 | TP_printk("pio_%s at 0x%x size %d count %d", | 124 | TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s", |
114 | __entry->rw ? "write" : "read", | 125 | __entry->rw ? "write" : "read", |
115 | __entry->port, __entry->size, __entry->count) | 126 | __entry->port, __entry->size, __entry->count, __entry->val, |
127 | __entry->count > 1 ? "(...)" : "") | ||
116 | ); | 128 | ); |
117 | 129 | ||
118 | /* | 130 | /* |
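The extended tracepoint records only the first element of the PIO data buffer, picked by access size; for count > 1 the "(...)" suffix in TP_printk hints that further items were not captured. A small userspace sketch of the same size dispatch (buffer contents are made up, byte-order comments assume little-endian):

    #include <stdio.h>

    static unsigned int first_pio_val(const void *data, unsigned int size)
    {
            if (size == 1)
                    return *(const unsigned char *)data;
            else if (size == 2)
                    return *(const unsigned short *)data;
            else
                    return *(const unsigned int *)data;
    }

    int main(void)
    {
            unsigned char buf[4] = { 0x78, 0x56, 0x34, 0x12 };
            printf("size 1 -> 0x%x\n", first_pio_val(buf, 1));  /* 0x78 */
            printf("size 2 -> 0x%x\n", first_pio_val(buf, 2));  /* 0x5678 on little-endian */
            printf("size 4 -> 0x%x\n", first_pio_val(buf, 4));  /* 0x12345678 on little-endian */
            return 0;
    }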
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 138ceffc6377..801332edefc3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -354,6 +354,7 @@ struct vmcs02_list { | |||
354 | struct nested_vmx { | 354 | struct nested_vmx { |
355 | /* Has the level1 guest done vmxon? */ | 355 | /* Has the level1 guest done vmxon? */ |
356 | bool vmxon; | 356 | bool vmxon; |
357 | gpa_t vmxon_ptr; | ||
357 | 358 | ||
358 | /* The guest-physical address of the current VMCS L1 keeps for L2 */ | 359 | /* The guest-physical address of the current VMCS L1 keeps for L2 */ |
359 | gpa_t current_vmptr; | 360 | gpa_t current_vmptr; |
@@ -413,7 +414,6 @@ struct vcpu_vmx { | |||
413 | struct kvm_vcpu vcpu; | 414 | struct kvm_vcpu vcpu; |
414 | unsigned long host_rsp; | 415 | unsigned long host_rsp; |
415 | u8 fail; | 416 | u8 fail; |
416 | u8 cpl; | ||
417 | bool nmi_known_unmasked; | 417 | bool nmi_known_unmasked; |
418 | u32 exit_intr_info; | 418 | u32 exit_intr_info; |
419 | u32 idt_vectoring_info; | 419 | u32 idt_vectoring_info; |
@@ -2283,7 +2283,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2283 | rdmsr(MSR_IA32_VMX_EXIT_CTLS, | 2283 | rdmsr(MSR_IA32_VMX_EXIT_CTLS, |
2284 | nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); | 2284 | nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); |
2285 | nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; | 2285 | nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; |
2286 | /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ | 2286 | |
2287 | nested_vmx_exit_ctls_high &= | 2287 | nested_vmx_exit_ctls_high &= |
2288 | #ifdef CONFIG_X86_64 | 2288 | #ifdef CONFIG_X86_64 |
2289 | VM_EXIT_HOST_ADDR_SPACE_SIZE | | 2289 | VM_EXIT_HOST_ADDR_SPACE_SIZE | |
@@ -2291,7 +2291,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2291 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; | 2291 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; |
2292 | nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | 2292 | nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | |
2293 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | | 2293 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | |
2294 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; | 2294 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; |
2295 | |||
2295 | if (vmx_mpx_supported()) | 2296 | if (vmx_mpx_supported()) |
2296 | nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; | 2297 | nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; |
2297 | 2298 | ||
@@ -2353,12 +2354,11 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2353 | VMX_EPT_INVEPT_BIT; | 2354 | VMX_EPT_INVEPT_BIT; |
2354 | nested_vmx_ept_caps &= vmx_capability.ept; | 2355 | nested_vmx_ept_caps &= vmx_capability.ept; |
2355 | /* | 2356 | /* |
2356 | * Since invept is completely emulated we support both global | 2357 | * For nested guests, we don't do anything specific |
2357 | * and context invalidation independent of what host cpu | 2358 | * for single context invalidation. Hence, only advertise |
2358 | * supports | 2359 | * support for global context invalidation. |
2359 | */ | 2360 | */ |
2360 | nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | | 2361 | nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; |
2361 | VMX_EPT_EXTENT_CONTEXT_BIT; | ||
2362 | } else | 2362 | } else |
2363 | nested_vmx_ept_caps = 0; | 2363 | nested_vmx_ept_caps = 0; |
2364 | 2364 | ||
@@ -3186,10 +3186,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
3186 | fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); | 3186 | fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); |
3187 | fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); | 3187 | fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); |
3188 | fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); | 3188 | fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); |
3189 | |||
3190 | /* CPL is always 0 when CPU enters protected mode */ | ||
3191 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
3192 | vmx->cpl = 0; | ||
3193 | } | 3189 | } |
3194 | 3190 | ||
3195 | static void fix_rmode_seg(int seg, struct kvm_segment *save) | 3191 | static void fix_rmode_seg(int seg, struct kvm_segment *save) |
@@ -3591,22 +3587,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) | |||
3591 | { | 3587 | { |
3592 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3588 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3593 | 3589 | ||
3594 | if (!is_protmode(vcpu)) | 3590 | if (unlikely(vmx->rmode.vm86_active)) |
3595 | return 0; | 3591 | return 0; |
3596 | 3592 | else { | |
3597 | if (!is_long_mode(vcpu) | 3593 | int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); |
3598 | && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ | 3594 | return AR_DPL(ar); |
3599 | return 3; | ||
3600 | |||
3601 | if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { | ||
3602 | __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
3603 | vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; | ||
3604 | } | 3595 | } |
3605 | |||
3606 | return vmx->cpl; | ||
3607 | } | 3596 | } |
3608 | 3597 | ||
3609 | |||
3610 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | 3598 | static u32 vmx_segment_access_rights(struct kvm_segment *var) |
3611 | { | 3599 | { |
3612 | u32 ar; | 3600 | u32 ar; |
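With the cached-CPL machinery removed, vmx_get_cpl reads the DPL directly from the SS access-rights field, except in vm86-emulated real mode where the architectural CPL is 0. A rough sketch under the assumption that the access-rights encoding keeps DPL in bits 6:5 (AR_DPL_SHIFT == 5); the sample value is hypothetical:

    #include <stdbool.h>
    #include <stdio.h>

    #define AR_DPL_SHIFT 5
    #define AR_DPL(ar)   (((ar) >> AR_DPL_SHIFT) & 3)

    static int get_cpl(bool vm86_active, unsigned int ss_ar)
    {
            if (vm86_active)
                    return 0;  /* real mode emulated via vm86: architectural CPL is 0 */
            return AR_DPL(ss_ar);
    }

    int main(void)
    {
            printf("%d\n", get_cpl(false, 0xc0f3));  /* hypothetical user-mode SS -> 3 */
            printf("%d\n", get_cpl(true, 0xc0f3));   /* vm86-emulated real mode -> 0 */
            return 0;
    }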
@@ -3634,8 +3622,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
3634 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 3622 | const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
3635 | 3623 | ||
3636 | vmx_segment_cache_clear(vmx); | 3624 | vmx_segment_cache_clear(vmx); |
3637 | if (seg == VCPU_SREG_CS) | ||
3638 | __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); | ||
3639 | 3625 | ||
3640 | if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { | 3626 | if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { |
3641 | vmx->rmode.segs[seg] = *var; | 3627 | vmx->rmode.segs[seg] = *var; |
@@ -4564,6 +4550,16 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) | |||
4564 | PIN_BASED_EXT_INTR_MASK; | 4550 | PIN_BASED_EXT_INTR_MASK; |
4565 | } | 4551 | } |
4566 | 4552 | ||
4553 | /* | ||
4554 | * In nested virtualization, check if L1 has set | ||
4555 | * VM_EXIT_ACK_INTR_ON_EXIT | ||
4556 | */ | ||
4557 | static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) | ||
4558 | { | ||
4559 | return get_vmcs12(vcpu)->vm_exit_controls & | ||
4560 | VM_EXIT_ACK_INTR_ON_EXIT; | ||
4561 | } | ||
4562 | |||
4567 | static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) | 4563 | static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) |
4568 | { | 4564 | { |
4569 | return get_vmcs12(vcpu)->pin_based_vm_exec_control & | 4565 | return get_vmcs12(vcpu)->pin_based_vm_exec_control & |
@@ -4878,6 +4874,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
4878 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { | 4874 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { |
4879 | vcpu->arch.dr6 &= ~15; | 4875 | vcpu->arch.dr6 &= ~15; |
4880 | vcpu->arch.dr6 |= dr6; | 4876 | vcpu->arch.dr6 |= dr6; |
4877 | if (!(dr6 & ~DR6_RESERVED)) /* icebp */ | ||
4878 | skip_emulated_instruction(vcpu); | ||
4879 | |||
4881 | kvm_queue_exception(vcpu, DB_VECTOR); | 4880 | kvm_queue_exception(vcpu, DB_VECTOR); |
4882 | return 1; | 4881 | return 1; |
4883 | } | 4882 | } |
@@ -5166,7 +5165,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) | |||
5166 | return 1; | 5165 | return 1; |
5167 | kvm_register_write(vcpu, reg, val); | 5166 | kvm_register_write(vcpu, reg, val); |
5168 | } else | 5167 | } else |
5169 | if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg])) | 5168 | if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) |
5170 | return 1; | 5169 | return 1; |
5171 | 5170 | ||
5172 | skip_emulated_instruction(vcpu); | 5171 | skip_emulated_instruction(vcpu); |
@@ -5439,7 +5438,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) | |||
5439 | } | 5438 | } |
5440 | 5439 | ||
5441 | /* clear all local breakpoint enable flags */ | 5440 | /* clear all local breakpoint enable flags */ |
5442 | vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); | 5441 | vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); |
5443 | 5442 | ||
5444 | /* | 5443 | /* |
5445 | * TODO: What about debug traps on tss switch? | 5444 | * TODO: What about debug traps on tss switch? |
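The one-character fix above is not cosmetic: the DR7 local breakpoint enable bits L0..L3 sit at bit positions 0, 2, 4 and 6, i.e. mask 0x55, while decimal 55 is 0x37 and clears the wrong bits. A quick arithmetic check:

    #include <stdio.h>

    int main(void)
    {
            unsigned long dr7 = 0xffffffffUL;
            printf("dr7 & ~0x55 = 0x%lx\n", dr7 & ~0x55UL);  /* clears bits 0,2,4,6 */
            printf("dr7 & ~55   = 0x%lx\n", dr7 & ~55UL);    /* 55 == 0x37: bits 0,1,2,4,5 */
            return 0;
    }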
@@ -5565,6 +5564,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | |||
5565 | gpa_t gpa; | 5564 | gpa_t gpa; |
5566 | 5565 | ||
5567 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 5566 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
5567 | if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { | ||
5568 | skip_emulated_instruction(vcpu); | ||
5569 | return 1; | ||
5570 | } | ||
5568 | 5571 | ||
5569 | ret = handle_mmio_page_fault_common(vcpu, gpa, true); | 5572 | ret = handle_mmio_page_fault_common(vcpu, gpa, true); |
5570 | if (likely(ret == RET_MMIO_PF_EMULATE)) | 5573 | if (likely(ret == RET_MMIO_PF_EMULATE)) |
@@ -5669,12 +5672,24 @@ static int handle_pause(struct kvm_vcpu *vcpu) | |||
5669 | return 1; | 5672 | return 1; |
5670 | } | 5673 | } |
5671 | 5674 | ||
5672 | static int handle_invalid_op(struct kvm_vcpu *vcpu) | 5675 | static int handle_nop(struct kvm_vcpu *vcpu) |
5673 | { | 5676 | { |
5674 | kvm_queue_exception(vcpu, UD_VECTOR); | 5677 | skip_emulated_instruction(vcpu); |
5675 | return 1; | 5678 | return 1; |
5676 | } | 5679 | } |
5677 | 5680 | ||
5681 | static int handle_mwait(struct kvm_vcpu *vcpu) | ||
5682 | { | ||
5683 | printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); | ||
5684 | return handle_nop(vcpu); | ||
5685 | } | ||
5686 | |||
5687 | static int handle_monitor(struct kvm_vcpu *vcpu) | ||
5688 | { | ||
5689 | printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); | ||
5690 | return handle_nop(vcpu); | ||
5691 | } | ||
5692 | |||
5678 | /* | 5693 | /* |
5679 | * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. | 5694 | * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. |
5680 | * We could reuse a single VMCS for all the L2 guests, but we also want the | 5695 | * We could reuse a single VMCS for all the L2 guests, but we also want the |
@@ -5812,6 +5827,154 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) | |||
5812 | } | 5827 | } |
5813 | 5828 | ||
5814 | /* | 5829 | /* |
5830 | * Decode the memory-address operand of a vmx instruction, as recorded on an | ||
5831 | * exit caused by such an instruction (run by a guest hypervisor). | ||
5832 | * On success, returns 0. When the operand is invalid, returns 1 and throws | ||
5833 | * #UD or #GP. | ||
5834 | */ | ||
5835 | static int get_vmx_mem_address(struct kvm_vcpu *vcpu, | ||
5836 | unsigned long exit_qualification, | ||
5837 | u32 vmx_instruction_info, gva_t *ret) | ||
5838 | { | ||
5839 | /* | ||
5840 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | ||
5841 | * Execution", on an exit, vmx_instruction_info holds most of the | ||
5842 | * addressing components of the operand. Only the displacement part | ||
5843 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | ||
5844 | * For how an actual address is calculated from all these components, | ||
5845 | * refer to Vol. 1, "Operand Addressing". | ||
5846 | */ | ||
5847 | int scaling = vmx_instruction_info & 3; | ||
5848 | int addr_size = (vmx_instruction_info >> 7) & 7; | ||
5849 | bool is_reg = vmx_instruction_info & (1u << 10); | ||
5850 | int seg_reg = (vmx_instruction_info >> 15) & 7; | ||
5851 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | ||
5852 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | ||
5853 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | ||
5854 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | ||
5855 | |||
5856 | if (is_reg) { | ||
5857 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
5858 | return 1; | ||
5859 | } | ||
5860 | |||
5861 | /* Addr = segment_base + offset */ | ||
5862 | /* offset = base + [index * scale] + displacement */ | ||
5863 | *ret = vmx_get_segment_base(vcpu, seg_reg); | ||
5864 | if (base_is_valid) | ||
5865 | *ret += kvm_register_read(vcpu, base_reg); | ||
5866 | if (index_is_valid) | ||
5867 | *ret += kvm_register_read(vcpu, index_reg)<<scaling; | ||
5868 | *ret += exit_qualification; /* holds the displacement */ | ||
5869 | |||
5870 | if (addr_size == 1) /* 32 bit */ | ||
5871 | *ret &= 0xffffffff; | ||
5872 | |||
5873 | /* | ||
5874 | * TODO: throw #GP (and return 1) in various cases that the VM* | ||
5875 | * instructions require it - e.g., offset beyond segment limit, | ||
5876 | * unusable or unreadable/unwritable segment, non-canonical 64-bit | ||
5877 | * address, and so on. Currently these are not checked. | ||
5878 | */ | ||
5879 | return 0; | ||
5880 | } | ||
5881 | |||
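The bit positions used by get_vmx_mem_address above can be exercised in isolation; the following sketch decodes a made-up VMX_INSTRUCTION_INFO value with the same masks and shifts:

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int info = 0x12c50;  /* hypothetical VMX_INSTRUCTION_INFO value */
            int scaling    = info & 3;
            int addr_size  = (info >> 7) & 7;  /* 0 = 16-bit, 1 = 32-bit, 2 = 64-bit */
            bool is_reg    = info & (1u << 10);
            int seg_reg    = (info >> 15) & 7;
            int index_reg  = (info >> 18) & 0xf;
            bool index_ok  = !(info & (1u << 22));
            int base_reg   = (info >> 23) & 0xf;
            bool base_ok   = !(info & (1u << 27));

            printf("scaling=%d addr_size=%d is_reg=%d seg=%d index=%d(valid=%d) base=%d(valid=%d)\n",
                   scaling, addr_size, is_reg, seg_reg, index_reg, index_ok,
                   base_reg, base_ok);
            return 0;
    }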
5882 | /* | ||
5883 | * This function performs the following checks on the vmpointer: | ||
5884 | * - it is 4KB aligned | ||
5885 | * - no bits beyond the physical address width are set | ||
5886 | * Returns 0 on success, or 1 otherwise | ||
5887 | * (Intel SDM Section 30.3) | ||
5888 | */ | ||
5889 | static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, | ||
5890 | gpa_t *vmpointer) | ||
5891 | { | ||
5892 | gva_t gva; | ||
5893 | gpa_t vmptr; | ||
5894 | struct x86_exception e; | ||
5895 | struct page *page; | ||
5896 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
5897 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | ||
5898 | |||
5899 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | ||
5900 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
5901 | return 1; | ||
5902 | |||
5903 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
5904 | sizeof(vmptr), &e)) { | ||
5905 | kvm_inject_page_fault(vcpu, &e); | ||
5906 | return 1; | ||
5907 | } | ||
5908 | |||
5909 | switch (exit_reason) { | ||
5910 | case EXIT_REASON_VMON: | ||
5911 | /* | ||
5912 | * SDM 3: 24.11.5 | ||
5913 | * The first 4 bytes of VMXON region contain the supported | ||
5914 | * VMCS revision identifier | ||
5915 | * | ||
5916 | * Note - IA32_VMX_BASIC[48] will never be 1 for the nested | ||
5917 | * case; if it were, the physical address width would be | ||
5918 | * treated as 32 bits | ||
5919 | * | ||
5920 | */ | ||
5921 | if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { | ||
5922 | nested_vmx_failInvalid(vcpu); | ||
5923 | skip_emulated_instruction(vcpu); | ||
5924 | return 1; | ||
5925 | } | ||
5926 | |||
5927 | page = nested_get_page(vcpu, vmptr); | ||
5928 | if (page == NULL || | ||
5929 | *(u32 *)kmap(page) != VMCS12_REVISION) { | ||
5930 | nested_vmx_failInvalid(vcpu); | ||
5931 | kunmap(page); | ||
5932 | skip_emulated_instruction(vcpu); | ||
5933 | return 1; | ||
5934 | } | ||
5935 | kunmap(page); | ||
5936 | vmx->nested.vmxon_ptr = vmptr; | ||
5937 | break; | ||
5938 | case EXIT_REASON_VMCLEAR: | ||
5939 | if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { | ||
5940 | nested_vmx_failValid(vcpu, | ||
5941 | VMXERR_VMCLEAR_INVALID_ADDRESS); | ||
5942 | skip_emulated_instruction(vcpu); | ||
5943 | return 1; | ||
5944 | } | ||
5945 | |||
5946 | if (vmptr == vmx->nested.vmxon_ptr) { | ||
5947 | nested_vmx_failValid(vcpu, | ||
5948 | VMXERR_VMCLEAR_VMXON_POINTER); | ||
5949 | skip_emulated_instruction(vcpu); | ||
5950 | return 1; | ||
5951 | } | ||
5952 | break; | ||
5953 | case EXIT_REASON_VMPTRLD: | ||
5954 | if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { | ||
5955 | nested_vmx_failValid(vcpu, | ||
5956 | VMXERR_VMPTRLD_INVALID_ADDRESS); | ||
5957 | skip_emulated_instruction(vcpu); | ||
5958 | return 1; | ||
5959 | } | ||
5960 | |||
5961 | if (vmptr == vmx->nested.vmxon_ptr) { | ||
5962 | nested_vmx_failValid(vcpu, | ||
5963 | VMXERR_VMCLEAR_VMXON_POINTER); | ||
5964 | skip_emulated_instruction(vcpu); | ||
5965 | return 1; | ||
5966 | } | ||
5967 | break; | ||
5968 | default: | ||
5969 | return 1; /* shouldn't happen */ | ||
5970 | } | ||
5971 | |||
5972 | if (vmpointer) | ||
5973 | *vmpointer = vmptr; | ||
5974 | return 0; | ||
5975 | } | ||
5976 | |||
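The two sanity checks that nested_vmx_check_vmptr applies to every vmpointer (4 KiB alignment and no bits above the guest's MAXPHYADDR) boil down to a modulus and a shift. A standalone sketch with illustrative numbers:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096ULL

    static bool vmptr_ok(uint64_t vmptr, int maxphyaddr)
    {
            return (vmptr % PAGE_SIZE) == 0 && (vmptr >> maxphyaddr) == 0;
    }

    int main(void)
    {
            printf("%d\n", vmptr_ok(0x123000, 40));    /* 1: aligned, fits in 40 bits */
            printf("%d\n", vmptr_ok(0x123010, 40));    /* 0: not page aligned */
            printf("%d\n", vmptr_ok(1ULL << 45, 40));  /* 0: beyond MAXPHYADDR */
            return 0;
    }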
5977 | /* | ||
5815 | * Emulate the VMXON instruction. | 5978 | * Emulate the VMXON instruction. |
5816 | * Currently, we just remember that VMX is active, and do not save or even | 5979 | * Currently, we just remember that VMX is active, and do not save or even |
5817 | * inspect the argument to VMXON (the so-called "VMXON pointer") because we | 5980 | * inspect the argument to VMXON (the so-called "VMXON pointer") because we |
@@ -5849,6 +6012,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | |||
5849 | kvm_inject_gp(vcpu, 0); | 6012 | kvm_inject_gp(vcpu, 0); |
5850 | return 1; | 6013 | return 1; |
5851 | } | 6014 | } |
6015 | |||
6016 | if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) | ||
6017 | return 1; | ||
6018 | |||
5852 | if (vmx->nested.vmxon) { | 6019 | if (vmx->nested.vmxon) { |
5853 | nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); | 6020 | nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); |
5854 | skip_emulated_instruction(vcpu); | 6021 | skip_emulated_instruction(vcpu); |
@@ -5971,87 +6138,19 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) | |||
5971 | return 1; | 6138 | return 1; |
5972 | } | 6139 | } |
5973 | 6140 | ||
5974 | /* | ||
5975 | * Decode the memory-address operand of a vmx instruction, as recorded on an | ||
5976 | * exit caused by such an instruction (run by a guest hypervisor). | ||
5977 | * On success, returns 0. When the operand is invalid, returns 1 and throws | ||
5978 | * #UD or #GP. | ||
5979 | */ | ||
5980 | static int get_vmx_mem_address(struct kvm_vcpu *vcpu, | ||
5981 | unsigned long exit_qualification, | ||
5982 | u32 vmx_instruction_info, gva_t *ret) | ||
5983 | { | ||
5984 | /* | ||
5985 | * According to Vol. 3B, "Information for VM Exits Due to Instruction | ||
5986 | * Execution", on an exit, vmx_instruction_info holds most of the | ||
5987 | * addressing components of the operand. Only the displacement part | ||
5988 | * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). | ||
5989 | * For how an actual address is calculated from all these components, | ||
5990 | * refer to Vol. 1, "Operand Addressing". | ||
5991 | */ | ||
5992 | int scaling = vmx_instruction_info & 3; | ||
5993 | int addr_size = (vmx_instruction_info >> 7) & 7; | ||
5994 | bool is_reg = vmx_instruction_info & (1u << 10); | ||
5995 | int seg_reg = (vmx_instruction_info >> 15) & 7; | ||
5996 | int index_reg = (vmx_instruction_info >> 18) & 0xf; | ||
5997 | bool index_is_valid = !(vmx_instruction_info & (1u << 22)); | ||
5998 | int base_reg = (vmx_instruction_info >> 23) & 0xf; | ||
5999 | bool base_is_valid = !(vmx_instruction_info & (1u << 27)); | ||
6000 | |||
6001 | if (is_reg) { | ||
6002 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
6003 | return 1; | ||
6004 | } | ||
6005 | |||
6006 | /* Addr = segment_base + offset */ | ||
6007 | /* offset = base + [index * scale] + displacement */ | ||
6008 | *ret = vmx_get_segment_base(vcpu, seg_reg); | ||
6009 | if (base_is_valid) | ||
6010 | *ret += kvm_register_read(vcpu, base_reg); | ||
6011 | if (index_is_valid) | ||
6012 | *ret += kvm_register_read(vcpu, index_reg)<<scaling; | ||
6013 | *ret += exit_qualification; /* holds the displacement */ | ||
6014 | |||
6015 | if (addr_size == 1) /* 32 bit */ | ||
6016 | *ret &= 0xffffffff; | ||
6017 | |||
6018 | /* | ||
6019 | * TODO: throw #GP (and return 1) in various cases that the VM* | ||
6020 | * instructions require it - e.g., offset beyond segment limit, | ||
6021 | * unusable or unreadable/unwritable segment, non-canonical 64-bit | ||
6022 | * address, and so on. Currently these are not checked. | ||
6023 | */ | ||
6024 | return 0; | ||
6025 | } | ||
6026 | |||
6027 | /* Emulate the VMCLEAR instruction */ | 6141 | /* Emulate the VMCLEAR instruction */ |
6028 | static int handle_vmclear(struct kvm_vcpu *vcpu) | 6142 | static int handle_vmclear(struct kvm_vcpu *vcpu) |
6029 | { | 6143 | { |
6030 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 6144 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
6031 | gva_t gva; | ||
6032 | gpa_t vmptr; | 6145 | gpa_t vmptr; |
6033 | struct vmcs12 *vmcs12; | 6146 | struct vmcs12 *vmcs12; |
6034 | struct page *page; | 6147 | struct page *page; |
6035 | struct x86_exception e; | ||
6036 | 6148 | ||
6037 | if (!nested_vmx_check_permission(vcpu)) | 6149 | if (!nested_vmx_check_permission(vcpu)) |
6038 | return 1; | 6150 | return 1; |
6039 | 6151 | ||
6040 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | 6152 | if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) |
6041 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
6042 | return 1; | ||
6043 | |||
6044 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
6045 | sizeof(vmptr), &e)) { | ||
6046 | kvm_inject_page_fault(vcpu, &e); | ||
6047 | return 1; | ||
6048 | } | ||
6049 | |||
6050 | if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { | ||
6051 | nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); | ||
6052 | skip_emulated_instruction(vcpu); | ||
6053 | return 1; | 6153 | return 1; |
6054 | } | ||
6055 | 6154 | ||
6056 | if (vmptr == vmx->nested.current_vmptr) { | 6155 | if (vmptr == vmx->nested.current_vmptr) { |
6057 | nested_release_vmcs12(vmx); | 6156 | nested_release_vmcs12(vmx); |
@@ -6372,29 +6471,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) | |||
6372 | static int handle_vmptrld(struct kvm_vcpu *vcpu) | 6471 | static int handle_vmptrld(struct kvm_vcpu *vcpu) |
6373 | { | 6472 | { |
6374 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 6473 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
6375 | gva_t gva; | ||
6376 | gpa_t vmptr; | 6474 | gpa_t vmptr; |
6377 | struct x86_exception e; | ||
6378 | u32 exec_control; | 6475 | u32 exec_control; |
6379 | 6476 | ||
6380 | if (!nested_vmx_check_permission(vcpu)) | 6477 | if (!nested_vmx_check_permission(vcpu)) |
6381 | return 1; | 6478 | return 1; |
6382 | 6479 | ||
6383 | if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), | 6480 | if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) |
6384 | vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) | ||
6385 | return 1; | ||
6386 | |||
6387 | if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, | ||
6388 | sizeof(vmptr), &e)) { | ||
6389 | kvm_inject_page_fault(vcpu, &e); | ||
6390 | return 1; | ||
6391 | } | ||
6392 | |||
6393 | if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { | ||
6394 | nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); | ||
6395 | skip_emulated_instruction(vcpu); | ||
6396 | return 1; | 6481 | return 1; |
6397 | } | ||
6398 | 6482 | ||
6399 | if (vmx->nested.current_vmptr != vmptr) { | 6483 | if (vmx->nested.current_vmptr != vmptr) { |
6400 | struct vmcs12 *new_vmcs12; | 6484 | struct vmcs12 *new_vmcs12; |
@@ -6471,7 +6555,6 @@ static int handle_invept(struct kvm_vcpu *vcpu) | |||
6471 | struct { | 6555 | struct { |
6472 | u64 eptp, gpa; | 6556 | u64 eptp, gpa; |
6473 | } operand; | 6557 | } operand; |
6474 | u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK; | ||
6475 | 6558 | ||
6476 | if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || | 6559 | if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || |
6477 | !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { | 6560 | !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { |
@@ -6511,16 +6594,13 @@ static int handle_invept(struct kvm_vcpu *vcpu) | |||
6511 | } | 6594 | } |
6512 | 6595 | ||
6513 | switch (type) { | 6596 | switch (type) { |
6514 | case VMX_EPT_EXTENT_CONTEXT: | ||
6515 | if ((operand.eptp & eptp_mask) != | ||
6516 | (nested_ept_get_cr3(vcpu) & eptp_mask)) | ||
6517 | break; | ||
6518 | case VMX_EPT_EXTENT_GLOBAL: | 6597 | case VMX_EPT_EXTENT_GLOBAL: |
6519 | kvm_mmu_sync_roots(vcpu); | 6598 | kvm_mmu_sync_roots(vcpu); |
6520 | kvm_mmu_flush_tlb(vcpu); | 6599 | kvm_mmu_flush_tlb(vcpu); |
6521 | nested_vmx_succeed(vcpu); | 6600 | nested_vmx_succeed(vcpu); |
6522 | break; | 6601 | break; |
6523 | default: | 6602 | default: |
6603 | /* Trap single context invalidation invept calls */ | ||
6524 | BUG_ON(1); | 6604 | BUG_ON(1); |
6525 | break; | 6605 | break; |
6526 | } | 6606 | } |
@@ -6571,8 +6651,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
6571 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | 6651 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, |
6572 | [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, | 6652 | [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, |
6573 | [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, | 6653 | [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, |
6574 | [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, | 6654 | [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, |
6575 | [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, | 6655 | [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, |
6576 | [EXIT_REASON_INVEPT] = handle_invept, | 6656 | [EXIT_REASON_INVEPT] = handle_invept, |
6577 | }; | 6657 | }; |
6578 | 6658 | ||
@@ -7413,7 +7493,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
7413 | 7493 | ||
7414 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | 7494 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) |
7415 | | (1 << VCPU_EXREG_RFLAGS) | 7495 | | (1 << VCPU_EXREG_RFLAGS) |
7416 | | (1 << VCPU_EXREG_CPL) | ||
7417 | | (1 << VCPU_EXREG_PDPTR) | 7496 | | (1 << VCPU_EXREG_PDPTR) |
7418 | | (1 << VCPU_EXREG_SEGMENTS) | 7497 | | (1 << VCPU_EXREG_SEGMENTS) |
7419 | | (1 << VCPU_EXREG_CR3)); | 7498 | | (1 << VCPU_EXREG_CR3)); |
@@ -8601,6 +8680,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, | |||
8601 | prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, | 8680 | prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, |
8602 | exit_qualification); | 8681 | exit_qualification); |
8603 | 8682 | ||
8683 | if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) | ||
8684 | && nested_exit_intr_ack_set(vcpu)) { | ||
8685 | int irq = kvm_cpu_get_interrupt(vcpu); | ||
8686 | WARN_ON(irq < 0); | ||
8687 | vmcs12->vm_exit_intr_info = irq | | ||
8688 | INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; | ||
8689 | } | ||
8690 | |||
8604 | trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, | 8691 | trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, |
8605 | vmcs12->exit_qualification, | 8692 | vmcs12->exit_qualification, |
8606 | vmcs12->idt_vectoring_info_field, | 8693 | vmcs12->idt_vectoring_info_field, |
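When L1 asked for acknowledge-interrupt-on-exit, the hunk above fetches the pending vector and folds it into vmcs12->vm_exit_intr_info. A sketch of that encoding, assuming the usual VMX interruption-information layout (vector in bits 7:0, type 0 for external interrupts in bits 10:8, valid bit 31); the vector is made up:

    #include <stdio.h>

    #define INTR_INFO_VALID_MASK    (1u << 31)
    #define INTR_TYPE_EXT_INTR      (0u << 8)

    int main(void)
    {
            int irq = 0x20;  /* hypothetical vector returned by kvm_cpu_get_interrupt() */
            unsigned int intr_info = irq | INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
            printf("vm_exit_intr_info = 0x%08x\n", intr_info);  /* 0x80000020 */
            return 0;
    }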
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 20316c67b824..f32a02578c0d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -704,25 +704,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
704 | } | 704 | } |
705 | 705 | ||
706 | if (is_long_mode(vcpu)) { | 706 | if (is_long_mode(vcpu)) { |
707 | if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { | 707 | if (cr3 & CR3_L_MODE_RESERVED_BITS) |
708 | if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) | 708 | return 1; |
709 | return 1; | 709 | } else if (is_pae(vcpu) && is_paging(vcpu) && |
710 | } else | 710 | !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) |
711 | if (cr3 & CR3_L_MODE_RESERVED_BITS) | 711 | return 1; |
712 | return 1; | ||
713 | } else { | ||
714 | if (is_pae(vcpu)) { | ||
715 | if (cr3 & CR3_PAE_RESERVED_BITS) | ||
716 | return 1; | ||
717 | if (is_paging(vcpu) && | ||
718 | !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) | ||
719 | return 1; | ||
720 | } | ||
721 | /* | ||
722 | * We don't check reserved bits in nonpae mode, because | ||
723 | * this isn't enforced, and VMware depends on this. | ||
724 | */ | ||
725 | } | ||
726 | 712 | ||
727 | vcpu->arch.cr3 = cr3; | 713 | vcpu->arch.cr3 = cr3; |
728 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | 714 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); |
@@ -1935,6 +1921,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1935 | 1921 | ||
1936 | if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { | 1922 | if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { |
1937 | vcpu->arch.hv_vapic = data; | 1923 | vcpu->arch.hv_vapic = data; |
1924 | if (kvm_lapic_enable_pv_eoi(vcpu, 0)) | ||
1925 | return 1; | ||
1938 | break; | 1926 | break; |
1939 | } | 1927 | } |
1940 | gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; | 1928 | gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; |
@@ -1945,6 +1933,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1945 | return 1; | 1933 | return 1; |
1946 | vcpu->arch.hv_vapic = data; | 1934 | vcpu->arch.hv_vapic = data; |
1947 | mark_page_dirty(vcpu->kvm, gfn); | 1935 | mark_page_dirty(vcpu->kvm, gfn); |
1936 | if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) | ||
1937 | return 1; | ||
1948 | break; | 1938 | break; |
1949 | } | 1939 | } |
1950 | case HV_X64_MSR_EOI: | 1940 | case HV_X64_MSR_EOI: |
@@ -2647,6 +2637,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
2647 | case KVM_CAP_IRQ_INJECT_STATUS: | 2637 | case KVM_CAP_IRQ_INJECT_STATUS: |
2648 | case KVM_CAP_IRQFD: | 2638 | case KVM_CAP_IRQFD: |
2649 | case KVM_CAP_IOEVENTFD: | 2639 | case KVM_CAP_IOEVENTFD: |
2640 | case KVM_CAP_IOEVENTFD_NO_LENGTH: | ||
2650 | case KVM_CAP_PIT2: | 2641 | case KVM_CAP_PIT2: |
2651 | case KVM_CAP_PIT_STATE2: | 2642 | case KVM_CAP_PIT_STATE2: |
2652 | case KVM_CAP_SET_IDENTITY_MAP_ADDR: | 2643 | case KVM_CAP_SET_IDENTITY_MAP_ADDR: |
@@ -3649,11 +3640,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) | |||
3649 | offset = i * BITS_PER_LONG; | 3640 | offset = i * BITS_PER_LONG; |
3650 | kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); | 3641 | kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); |
3651 | } | 3642 | } |
3652 | if (is_dirty) | ||
3653 | kvm_flush_remote_tlbs(kvm); | ||
3654 | 3643 | ||
3655 | spin_unlock(&kvm->mmu_lock); | 3644 | spin_unlock(&kvm->mmu_lock); |
3656 | 3645 | ||
3646 | /* See the comments in kvm_mmu_slot_remove_write_access(). */ | ||
3647 | lockdep_assert_held(&kvm->slots_lock); | ||
3648 | |||
3649 | /* | ||
3650 | * All the TLBs can be flushed out of mmu lock, see the comments in | ||
3651 | * kvm_mmu_slot_remove_write_access(). | ||
3652 | */ | ||
3653 | if (is_dirty) | ||
3654 | kvm_flush_remote_tlbs(kvm); | ||
3655 | |||
3657 | r = -EFAULT; | 3656 | r = -EFAULT; |
3658 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) | 3657 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) |
3659 | goto out; | 3658 | goto out; |
@@ -4489,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, | |||
4489 | unsigned short port, void *val, | 4488 | unsigned short port, void *val, |
4490 | unsigned int count, bool in) | 4489 | unsigned int count, bool in) |
4491 | { | 4490 | { |
4492 | trace_kvm_pio(!in, port, size, count); | ||
4493 | |||
4494 | vcpu->arch.pio.port = port; | 4491 | vcpu->arch.pio.port = port; |
4495 | vcpu->arch.pio.in = in; | 4492 | vcpu->arch.pio.in = in; |
4496 | vcpu->arch.pio.count = count; | 4493 | vcpu->arch.pio.count = count; |
@@ -4525,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, | |||
4525 | if (ret) { | 4522 | if (ret) { |
4526 | data_avail: | 4523 | data_avail: |
4527 | memcpy(val, vcpu->arch.pio_data, size * count); | 4524 | memcpy(val, vcpu->arch.pio_data, size * count); |
4525 | trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); | ||
4528 | vcpu->arch.pio.count = 0; | 4526 | vcpu->arch.pio.count = 0; |
4529 | return 1; | 4527 | return 1; |
4530 | } | 4528 | } |
@@ -4539,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, | |||
4539 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); | 4537 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
4540 | 4538 | ||
4541 | memcpy(vcpu->arch.pio_data, val, size * count); | 4539 | memcpy(vcpu->arch.pio_data, val, size * count); |
4540 | trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); | ||
4542 | return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); | 4541 | return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); |
4543 | } | 4542 | } |
4544 | 4543 | ||
@@ -4650,11 +4649,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) | |||
4650 | return res; | 4649 | return res; |
4651 | } | 4650 | } |
4652 | 4651 | ||
4653 | static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) | ||
4654 | { | ||
4655 | kvm_set_rflags(emul_to_vcpu(ctxt), val); | ||
4656 | } | ||
4657 | |||
4658 | static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) | 4652 | static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) |
4659 | { | 4653 | { |
4660 | return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); | 4654 | return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); |
@@ -4839,7 +4833,6 @@ static const struct x86_emulate_ops emulate_ops = { | |||
4839 | .set_idt = emulator_set_idt, | 4833 | .set_idt = emulator_set_idt, |
4840 | .get_cr = emulator_get_cr, | 4834 | .get_cr = emulator_get_cr, |
4841 | .set_cr = emulator_set_cr, | 4835 | .set_cr = emulator_set_cr, |
4842 | .set_rflags = emulator_set_rflags, | ||
4843 | .cpl = emulator_get_cpl, | 4836 | .cpl = emulator_get_cpl, |
4844 | .get_dr = emulator_get_dr, | 4837 | .get_dr = emulator_get_dr, |
4845 | .set_dr = emulator_set_dr, | 4838 | .set_dr = emulator_set_dr, |
@@ -4905,7 +4898,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | |||
4905 | ctxt->eip = kvm_rip_read(vcpu); | 4898 | ctxt->eip = kvm_rip_read(vcpu); |
4906 | ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | 4899 | ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : |
4907 | (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : | 4900 | (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : |
4908 | cs_l ? X86EMUL_MODE_PROT64 : | 4901 | (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : |
4909 | cs_db ? X86EMUL_MODE_PROT32 : | 4902 | cs_db ? X86EMUL_MODE_PROT32 : |
4910 | X86EMUL_MODE_PROT16; | 4903 | X86EMUL_MODE_PROT16; |
4911 | ctxt->guest_mode = is_guest_mode(vcpu); | 4904 | ctxt->guest_mode = is_guest_mode(vcpu); |
@@ -7333,8 +7326,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, | |||
7333 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | 7326 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); |
7334 | /* | 7327 | /* |
7335 | * Write protect all pages for dirty logging. | 7328 | * Write protect all pages for dirty logging. |
7336 | * Existing largepage mappings are destroyed here and new ones will | 7329 | * |
7337 | * not be created until the end of the logging. | 7330 | * All the sptes including the large sptes which point to this |
7331 | * slot are set to readonly. We can not create any new large | ||
7332 | * spte on this slot until the end of the logging. | ||
7333 | * | ||
7334 | * See the comments in fast_page_fault(). | ||
7338 | */ | 7335 | */ |
7339 | if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) | 7336 | if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) |
7340 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 7337 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |